In [1]:
!pip install pyyaml h5py  # Required to save models in HDF5 format
!mkdir -p saved_model



In [2]:
from IPython.display import HTML, display
def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow
from tensorflow import keras
from tensorflow.data import Dataset
from tensorflow.keras import layers, models
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from ast import literal_eval
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint

In [4]:
data = pd.read_csv("/content/ner.csv")
data_tags = pd.read_csv("/content/ner_dataset.csv",  encoding= 'unicode_escape')
data_tags.fillna(method = 'ffill', inplace = True)
data.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [5]:
unique_tags = data_tags.Tag.unique()
unique_tags

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [6]:
X = list(data['Sentence'])
Y = list(data['Tag'])
Y_tags = []

for sen_tags in Y:
  Y_tags.append(literal_eval(sen_tags))

print(X[:2], "\n")
print(Y_tags[:2], "\n")
print("# Examples:", (len(X)))

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "'] 

[['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] 

# Examples: 47959


In [7]:
max_len = 110
max_words = 36000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
print("Unique Tokens: ", (len(word_index)), "\n")

ind_to_word = dict([(value, key) for (key, value) in word_index.items()])

word_to_id = word_index
id_to_word = {}
for key, value in word_to_id.items():
  id_to_word[value] = key

Unique Tokens:  27953 



In [9]:
X_preprocessed = pad_sequences(sequences, maxlen=max_len, padding='post')
print(X_preprocessed[0], "\n")
print(X_preprocessed[15000], "\n")

[ 260    3  997   13 1838  245  452    4  545    1  121    2   60    6
  595    1  861    3  184   89   21   12   54    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0] 

[    5   276     3 12747   585 12736     4     1   414   275     1   415
   392   563   106     6   846    40   486   964     2   492   754  6928
     4     1  2412     3    24   608  5756   105   737   628     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 

In [10]:
tags_to_id = {}
for i, tag in enumerate(unique_tags):
  tags_to_id[tag] = i

print(tags_to_id, "\n")

id_to_tag = {}
for key, value in tags_to_id.items():
  id_to_tag[value] = key

print(id_to_tag, "\n")

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16} 

{0: 'O', 1: 'B-geo', 2: 'B-gpe', 3: 'B-per', 4: 'I-geo', 5: 'B-org', 6: 'I-org', 7: 'B-tim', 8: 'B-art', 9: 'I-art', 10: 'I-per', 11: 'I-gpe', 12: 'I-tim', 13: 'B-nat', 14: 'B-eve', 15: 'I-eve', 16: 'I-nat'} 



In [11]:
def preprocess_tags(tags_to_id, Y_tags):
    Y_preprocessed = []
    maxlen = 110

    for y in Y_tags:
      Y_place_holder = []
      
      for tag in y:
          Y_place_holder.append(tags_to_id[tag])
      
      len_new_tag_list = len(Y_place_holder)
      num_O_to_add = maxlen - len_new_tag_list
      
      padded_tags = Y_place_holder + ([tags_to_id['O']] * num_O_to_add)
      Y_preprocessed.append(padded_tags)
        
    return Y_preprocessed

In [12]:
Y_preprocessed = preprocess_tags(tags_to_id, Y_tags)
print(Y_preprocessed[1], "\n")
print(Y_tags[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [14]:
print("Training examples length == training targets length?", (len(X_preprocessed))==(len(Y_preprocessed)), "\n")

X_preprocessed = np.asarray(X_preprocessed)
Y_preprocessed = np.asarray(Y_preprocessed)
training_samples = 0.7
testing_samples = 0.15
validation_samples = 0.15
indices = np.arange(len(Y_preprocessed))
np.random.seed(seed=500)
np.random.shuffle(indices)

X_preprocessed = X_preprocessed[indices]
Y_preprocessed = Y_preprocessed[indices]

X_train = X_preprocessed[: int(0.7 * len(X_preprocessed))]
X_test = X_preprocessed[int(0.7 * len(X_preprocessed)) + (int(0.15 * len(X_preprocessed)) + 1) : ]
X_val = X_preprocessed[int(0.7 * len(X_preprocessed)) : int(0.7 * len(X_preprocessed)) + (int(0.15 * len(X_preprocessed)) + 1)]

Y_train = Y_preprocessed[: int(0.7 * len(X_preprocessed))]
Y_test = Y_preprocessed[int(0.7 * len(X_preprocessed)) + (int(0.15 * len(X_preprocessed)) + 1) : ]
Y_val = Y_preprocessed[int(0.7 * len(X_preprocessed)) : int(0.7 * len(X_preprocessed)) + (int(0.15 * len(X_preprocessed)) + 1)]

print("\n# training examples: ", (len(X_train)), "\n")
print("# testing examples: ", (len(X_test)), "\n")
print("# validation examples: ", (len(X_val)), "\n")
print("Total examples: ", (len(X_train) + len(X_val) + len(X_test)))

Training examples length == training targets length? True 


# training examples:  33571 

# testing examples:  7194 

# validation examples:  7194 

Total examples:  47959


In [15]:
print(X_train[50], "\n")
print(Y_train[50], "\n")
print(id_to_word[150], "\n")

[  31  122    9  841    4 3404  251  261 1017  649   12 1084    1 1022
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0] 

[ 3 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0] 

news 



In [47]:
train_dataset = Dataset.from_tensor_slices((X_train, Y_train))
test_dataset = Dataset.from_tensor_slices((X_test, Y_test))
val_dataset = Dataset.from_tensor_slices((X_val, Y_val))

batch_size = 132
shuffle_size = 132

train_dataset = train_dataset.shuffle(shuffle_size).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [18]:
embedding_dimensions = 300
num_tags = len(unique_tags)

model = models.Sequential([
    layers.Embedding(max_words, embedding_dimensions, input_length=max_len),
    layers.Bidirectional(layers.LSTM(units=100, activation='relu', return_sequences=True)),
    layers.Bidirectional(layers.LSTM(units=100, activation='tanh', return_sequences=True)),
    layers.TimeDistributed(layers.Dense(num_tags, activation='softmax'))
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 110, 300)          10800000  
                                                                 
 bidirectional (Bidirectiona  (None, 110, 200)         320800    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 110, 200)         240800    
 nal)                                                            
                                                                 
 time_distributed (TimeDistr  (None, 110, 17)          3417      
 ibuted)                                                         
                                                                 
Total params: 11,365,017
Trainable params: 11,365,017
Non-trainable params: 0
____________________________________________

In [20]:
cp_callback = ModelCheckpoint(filepath="/content/saved_model/", verbose=1, save_weights_only=True, save_freq=batch_size)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_dataset, validation_data=val_dataset, epochs=3, callbacks=[cp_callback])

Epoch 1/3
Epoch 1: saving model to /content/saved_model/
Epoch 2/3
  8/255 [..............................] - ETA: 7:28 - loss: 0.1070 - accuracy: 0.9722
Epoch 2: saving model to /content/saved_model/
Epoch 2: saving model to /content/saved_model/
Epoch 3/3
 17/255 [=>............................] - ETA: 7:20 - loss: 0.0740 - accuracy: 0.9771
Epoch 3: saving model to /content/saved_model/
Epoch 3: saving model to /content/saved_model/


In [21]:
model.save('/content/saved_model/my_model') 



INFO:tensorflow:Assets written to: /content/saved_model/my_model/assets


INFO:tensorflow:Assets written to: /content/saved_model/my_model/assets


In [22]:
model.evaluate(test_dataset)



[0.07217750698328018, 0.9781749844551086]