<a href="https://colab.research.google.com/github/Somani-Harsh/data-science-python/blob/master/Entity_Detection_on_ATIS_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install tf2crf

Collecting tf2crf
  Downloading https://files.pythonhosted.org/packages/15/13/c2a0aca2107d932a10447920aa4c8e61f8a6111b47119fda5c2d0ccc7131/tf2crf-0.1.19-py2.py3-none-any.whl
Installing collected packages: tf2crf
Successfully installed tf2crf-0.1.19


In [17]:
import pandas as pd
import numpy as np
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
import tensorflow as tf
from tf2crf import CRF
import tensorflow.keras.backend as K
import tqdm

In [3]:
!wget https://github.com/allenai/spv2/raw/master/model/glove.6B.100d.txt.gz
!gzip -d glove.6B.100d.txt.gz 

--2020-09-26 11:48:41--  https://github.com/allenai/spv2/raw/master/model/glove.6B.100d.txt.gz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/allenai/spv2/master/model/glove.6B.100d.txt.gz [following]
--2020-09-26 11:48:42--  https://media.githubusercontent.com/media/allenai/spv2/master/model/glove.6B.100d.txt.gz
Resolving media.githubusercontent.com (media.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 134409071 (128M) [application/octet-stream]
Saving to: ‘glove.6B.100d.txt.gz’


2020-09-26 11:48:48 (96.2 MB/s) - ‘glove.6B.100d.txt.gz’ saved [134409071/134409071]



In [15]:
embeddings_index = {}
f = open("glove.6B.100d.txt")
for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype="float32")
  embeddings_index[word] = coefs
f.close()

print("Found %s word vectors"%(len(embeddings_index)))

Found 400000 word vectors


In [18]:
def read_data(filename):
    df = pd.read_csv(filename, header=None)
    df["text"] = df[0].str.split("\t").apply(lambda x: x[0])
    df["labels"] = df[0].str.split("\t").apply(lambda x: x[1].split())
    df = df.drop(0, axis=1)
    df["intent"] = df["labels"].apply(lambda x: x.pop())
    df["labels"] = df["labels"].apply(lambda x: x + ["O"])
    return df

In [20]:
DATA_PATH = "drive/My Drive/interviews/Miko/"

In [21]:
files = glob.glob(DATA_PATH + "*")
files

['drive/My Drive/interviews/Miko/atis.test.w-intent.iob (2) (2) (2).txt',
 'drive/My Drive/interviews/Miko/atis-2.train.w-intent.iob (3) (2) (2).txt',
 'drive/My Drive/interviews/Miko/ner_model_weights.pickle.index',
 'drive/My Drive/interviews/Miko/ner_model_weights.pickle.data-00000-of-00001',
 'drive/My Drive/interviews/Miko/checkpoint']

In [22]:
train_df = read_data(files[1])
test_df = read_data(files[0])

In [23]:
train_df["text"].str.split().str.len().max()

48

In [24]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="OOV")
tokenizer.fit_on_texts(train_df["text"].tolist())

In [27]:
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in tqdm.tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

100%|██████████| 870/870 [00:00<00:00, 239015.16it/s]


In [28]:
labels = list(set(sum(train_df["labels"].tolist(), []))) + list(set(sum(test_df["labels"].tolist(), [])))

le = LabelEncoder()
le.fit(labels + ["PAD"])

LabelEncoder()

In [29]:
n_classes = len(le.classes_)
n_classes

127

In [30]:
def prepare_model_input(texts, labels):
  # text
  seq = tokenizer.texts_to_sequences(texts)
  seq = tf.keras.preprocessing.sequence.pad_sequences(seq, 48)

  # labels
  labels = list(map(le.transform, labels))
  labels = tf.keras.preprocessing.sequence.pad_sequences(labels, 48, value=n_classes-1)
  return seq, labels

In [31]:
train_seq, train_labels = prepare_model_input(train_df["text"].tolist(), labels=train_df["labels"].tolist())

In [32]:
test_seq, test_labels = prepare_model_input(test_df["text"].tolist(), labels=test_df["labels"].tolist())

In [33]:
train_seq.shape, train_labels.shape

((4478, 48), (4478, 48))

In [56]:
train_seq.shape, train_labels.shape

((4478, 48), (4478, 48))

In [57]:
y1 = set(sum(train_df["labels"].tolist(), []))
y2 = set(sum(test_df["labels"].tolist(), []))
y2.difference(y1)

{'B-booking_class',
 'B-compartment',
 'B-flight',
 'B-stoploc.airport_code',
 'I-flight_number',
 'I-state_name'}

In [38]:
y = sum(np.array(list(map(le.inverse_transform, train_labels))).tolist() + [list(y2.difference(y1))], []) 

In [39]:
class_weights = class_weight.compute_class_weight("balanced", le.classes_, y)
class_weights = dict(enumerate(class_weights))
class_weights[126] = 0

Model Architecture

In [66]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, 
                                    output_dim=100, input_length=48,
                                    weights = [embedding_matrix],
                                    trainable=False))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(100, return_sequences=True, recurrent_dropout=0.2)))
model.add(tf.keras.layers.BatchNormalization())
crf = CRF(sparse_target=True, dtype='float32')
model.add(crf)
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 48, 100)           87100     
_________________________________________________________________
dropout_7 (Dropout)          (None, 48, 100)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 48, 200)           160800    
_________________________________________________________________
batch_normalization_4 (Batch (None, 48, 200)           800       
_________________________________________________________________
crf_4 (CRF)                  (None, 48)                40000     
Total params: 288,700
Trainable params: 201,200
Non-trainable params: 87,500
_________________________________________________________________


In [67]:
model.compile(optimizer="Adam", loss=crf.loss, metrics=[crf.accuracy])

In [68]:
with tf.device("/device:GPU:0"):
  model_hist = model.fit(train_seq, train_labels,
                         batch_size=32, epochs=10, 
                         validation_split=0.2, shuffle=True,
                         class_weight = class_weights
                         )


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
model.save_weights("drive/My Drive/interviews/Miko/ner_model_weights.pickle")

**Train Data Classification Report**

In [70]:
train_pred = model.predict(train_seq).astype(int)

In [71]:
print(classification_report(sum(train_labels.tolist(),[]), 
                            sum(train_pred.tolist(), []), 
                            labels=range(0,125),
                            target_names=le.classes_.tolist()
                            ))

  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                              precision    recall  f1-score   support

             B-aircraft_code       0.79      0.87      0.83        30
              B-airline_code       0.95      0.83      0.89       127
              B-airline_name       0.97      0.98      0.97       639
              B-airport_code       1.00      0.80      0.89        25
              B-airport_name       0.67      0.18      0.28        34
 B-arrive_date.date_relative       1.00      0.44      0.62         9
      B-arrive_date.day_name       1.00      0.09      0.16        78
    B-arrive_date.day_number       1.00      0.60      0.75        43
    B-arrive_date.month_name       1.00      0.60      0.75        43
B-arrive_date.today_relative       0.00      0.00      0.00         1
      B-arrive_time.end_time       1.00      0.41      0.58        17
    B-arrive_time.period_mod       0.00      0.00      0.00         3
 B-arrive_time.period_of_day       0.50      0.02      0.04        51
    B-arrive_time.s

**Test Data Classfication Report**

In [72]:
test_pred = model.predict(test_seq)

In [73]:
print(classification_report(sum(test_labels.tolist(),[]), 
                            sum(test_pred.tolist(), []), 
                            labels=range(0,125),
                            target_names=le.classes_.tolist()
                            ))

                              precision    recall  f1-score   support

             B-aircraft_code       1.00      0.52      0.68        33
              B-airline_code       1.00      0.53      0.69        34
              B-airline_name       0.90      0.94      0.92       101
              B-airport_code       0.00      0.00      0.00         9
              B-airport_name       0.00      0.00      0.00        21
 B-arrive_date.date_relative       0.00      0.00      0.00         2
      B-arrive_date.day_name       0.00      0.00      0.00        11
    B-arrive_date.day_number       1.00      0.17      0.29         6
    B-arrive_date.month_name       1.00      0.17      0.29         6
B-arrive_date.today_relative       0.00      0.00      0.00         0
      B-arrive_time.end_time       0.00      0.00      0.00         8
    B-arrive_time.period_mod       0.00      0.00      0.00         0
 B-arrive_time.period_of_day       0.00      0.00      0.00         6
    B-arrive_time.s

  .format(len(labels), len(target_names))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
