In [29]:
import pandas as pd

In [30]:
#Saving the dataset into a dataframe
data = pd.read_csv('/kaggle/input/entity-annotated-corpus/ner_dataset.csv', encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [31]:
#Creating a dictionary for listing the tokens and tags with their respective indices
token2id = {tok:id for  id, tok in enumerate(list(set(data['Word'])))}
tag2id = {tag:id for  id, tag in enumerate(list(set(data['Tag'])))}


In [32]:
#Mapping words to their id in the dataframe for every occurence
data['Word_id'] = data['Word'].map(token2id)

In [33]:
#Mapping tokens to their id in the dataframe for every occurence
data['Tag_id'] = data['Tag'].map(tag2id)

In [34]:
#Extracting list and length of unique word tokens in the vocabulary
words = list(set(data['Word'].values))
num_words = len(words)

In [35]:
#Extracting list and length of unique tags in the vocabulary
tags = list(set(data['Tag'].values))#iob2 tags 
num_tags = len(tags)

In [36]:
#Creating enumerating objects for words and tags
word2id = {w: i for i, w in enumerate(words)}
print (word2id)

tag2id = {t: i for i, t in enumerate(tags)}



In [37]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_id,Tag_id
0,Sentence: 1,Thousands,NNS,O,9547,0
1,,of,IN,O,29854,0
2,,demonstrators,NNS,O,29235,0
3,,have,VBP,O,25918,0
4,,marched,VBN,O,15834,0


In [38]:
#Checking for null values
data.isna().any()

Sentence #     True
Word          False
POS           False
Tag           False
Word_id       False
Tag_id        False
dtype: bool

In [39]:
#Filling the null values with previous value(sentence number)
data_filled = data.fillna(method='ffill', axis=0)

#Grouping data by the column 'Sentence #'
data_grouped = data_filled.groupby(['Sentence #'],as_index=False)

#Applying a function to ensure that every row contains values of an entire sentence through aggregating over all column values
data_grouped=data_grouped['Word', 'POS', 'Tag', 'Word_id', 'Tag_id'].agg(lambda x: list(x))

data_grouped.head()


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Sentence #,Word,POS,Tag,Word_id,Tag_id
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[9547, 29854, 29235, 25918, 15834, 15389, 2434...","[0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 15, 0, 0..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[12601, 28457, 2599, 6832, 27087, 25675, 30917...","[13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[30306, 1583, 4505, 34872, 34940, 13976, 3975,...","[0, 0, 3, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 6,..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[7668, 5123, 4220, 1467, 25795, 14707, 19277, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[27274, 33074, 32745, 11025, 28842, 7870, 1543...","[15, 0, 0, 2, 7, 0, 3, 0, 15, 0, 13, 0, 13, 0,..."


In [40]:
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [41]:
def get_pad_train_test_val(data_grouped, data):
    n_token = len(list(set(data['Word'])))
    n_tags = len(tag2id)
    
    tokens = data_grouped['Word_id'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)
    
    tags = data_grouped['Tag_id'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2id["O"])
    
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    
    train_tokens, test_tokens, train_tags, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntrain_tags:', len(train_tags),
        '\ntest_tags:', len(test_tags)
    )
    
    return train_tokens, test_tokens, train_tags, test_tags

train_tokens, test_tokens, train_tags, test_tags = get_pad_train_test_val(data_grouped, data)

train_tokens length: 43163 

test_tokens length: 4796 

train_tags: 43163 

test_tags: 4796


In [42]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional


In [44]:
#Seeding to ensure reproducibility of results
from numpy.random import seed
seed(1)

#TensorFlow has a random number generator that should also be seeded
tensorflow.random.set_seed(2)

In [45]:
input_dim = len(list(set(data['Word'])))+1
output_dim = 32
input_length = max([len(s) for s in data_grouped['Word_id'].tolist()])
n_tags = len(tag2id)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  35179 

output_dim:  32 

input_length:  104 

n_tags:  17


In [46]:
#Function defining the LSTM neural network model
def get_model():
    model = Sequential()

    # Adding the Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Adding bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2)))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.2))

    # Adding the timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

   
    # Compiling the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [47]:
#Early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_accuracy', patience=1, verbose=1, mode='max', restore_best_weights=False)
callbacks = [early_stopping]
model = get_model()
history = model.fit(
   train_tokens, np.array(train_tags),
    validation_split=0.2,
    batch_size = 64,
    epochs = 3,
    verbose = 1,
    callbacks = callbacks)

Model: "sequential_2"

_________________________________________________________________

Layer (type)                 Output Shape              Param #   


embedding_2 (Embedding)      (None, 104, 32)           1125728   

_________________________________________________________________

bidirectional_2 (Bidirection (None, 104, 64)           16640     

_________________________________________________________________

lstm_5 (LSTM)                (None, 104, 32)           12416     

_________________________________________________________________

time_distributed_2 (TimeDist (None, 104, 17)           561       


Total params: 1,155,345

Trainable params: 1,155,345

Non-trainable params: 0

_________________________________________________________________

Epoch 1/3


Epoch 2/3


Epoch 00002: early stopping


In [48]:
#Testing the baseline model
test_tags=np.array(test_tags)
results = model.evaluate(test_tokens, test_tags, return_dict=True)
results_df=pd.DataFrame([results.values()], columns=[list(results.keys())])

results_df.index = ['Baseline']

results_df.head()




Unnamed: 0,loss,accuracy
Baseline,0.25632,0.967303


In [49]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy


In [50]:
#Defining the hypermodel
def build_model(hp):
   
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    # Tuning the number of units in each hidden layer (Range of number of Units: 32 - 512 with stepsize: 32)
    global outputdim
    outputdim=hp.Int("units" , min_value=32, max_value=512, step=32)
    model.add(
           Bidirectional(LSTM(
                units=outputdim, return_sequences=True, dropout=0.2,
               )))
    
    model.add(LSTM(units=outputdim, return_sequences=True, dropout=0.2))

    #Output layer.
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    # Tuning learning rate for Adam optimizer (values: 0.01, 0.001, or 0.0001)
    hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
    
    # Compiling the model
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])
    
    return model

In [51]:
!pip install keras_tuner
import keras_tuner as kt
tuner = kt.BayesianOptimization(build_model,
                     objective="val_accuracy",
                     max_trials=5,
                     num_initial_points=2,
                     seed=42,
                     hyperparameters=None,
                     tune_new_entries=True,
                     allow_new_entries=True,
                     )















































[0m

In [52]:
#Print the values that are being searched
tuner.search_space_summary()

Search space summary

Default search space size: 2

units (Int)

{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': None}

learning_rate (Choice)

{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [53]:
train_tags=np.array(train_tags)
stop_early = tensorflow.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

tuner.search(train_tokens, train_tags, epochs=20, validation_split=0.2, callbacks=[stop_early], verbose=2)


In [54]:
best_hps=tuner.get_best_hyperparameters()[0]

h_model = tuner.hypermodel.build(best_hps)

# Model training
h_model.fit(train_tokens, train_tags, epochs=3, validation_split=0.2, callbacks=[stop_early], verbose=2)


Epoch 1/3

1080/1080 - 54s - loss: 0.3063 - accuracy: 0.9631 - val_loss: 0.3345 - val_accuracy: 0.9679

Epoch 2/3

1080/1080 - 48s - loss: 0.2221 - accuracy: 0.9677 - val_loss: 0.1896 - val_accuracy: 0.9679

Epoch 3/3

1080/1080 - 48s - loss: 0.1912 - accuracy: 0.9677 - val_loss: 0.1860 - val_accuracy: 0.9679


<keras.callbacks.History at 0x7f790d73f850>

In [55]:
#Evaluate test set using h_model
hyper = h_model.evaluate(test_tokens, test_tags, return_dict=True)

#Append the results to the results_df dataframe
hyper_df=pd.DataFrame([hyper.values()], columns=[list(hyper.keys())])
hyper_df.index = ['Model 2']

results_df.append(hyper_df)



Unnamed: 0,loss,accuracy
Baseline,0.25632,0.967303
Model 2,0.184362,0.967976
