# Named Entity Recognition with Python

In [1]:
import pandas as pd

# load data
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

data.head()

Unnamed: 0.1,Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Sentence: 1,Thousands,NNS,O
1,1,Sentence: 1,of,IN,O
2,2,Sentence: 1,demonstrators,NNS,O
3,3,Sentence: 1,have,VBP,O
4,4,Sentence: 1,marched,VBN,O


In [2]:
#Creating a function to make filter the token and tag data
#importing itertools library
from itertools import chain
def make_dict_map(data, tokentag):
    # Define dictionaries to store the mapping between tokens/tags and indices
    token_to_idx = {}
    idx_to_token = {}

    # Check which data to filter (token or tag)
    if tokentag == 'token':
        voc = list(set(data['Word'].to_list()))
    else:
        voc = list(set(data['Tag'].to_list()))

    # Create mappings for both directions: index to token/tag and token/tag to index
    idx_to_token = {idx:tok for  idx, tok in enumerate(voc)}
    token_to_idx = {tok:idx for  idx, tok in enumerate(voc)}

    return token_to_idx , idx_to_token



In [3]:
#Filtering the token and tag using make_dict_map function
token_to_idx, idx_to_token = make_dict_map(data, 'token')
tag_to_idx, idx_to_tag = make_dict_map(data, 'tag')

In [4]:
#mapping the data with token and tag
data['Word_idx'] = data['Word'].map(token_to_idx)
data['Tag_idx'] = data['Tag'].map(tag_to_idx)
#Filling the Nan values in the dataset
data_fillna = data.fillna(method='ffill', axis=0)

In [5]:
# Groupby and collect columns
data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))


In [6]:
data_group.head(5)

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[15650, 17125, 14468, 31372, 5460, 31866, 2086...","[15, 15, 15, 15, 15, 15, 4, 15, 15, 15, 15, 15..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[8222, 8893, 30383, 31571, 30967, 30415, 6316,...","[5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[22742, 31083, 2479, 16677, 29170, 6603, 28220...","[15, 15, 14, 15, 15, 15, 15, 15, 4, 15, 15, 15..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[7293, 6699, 15495, 1243, 20312, 3816, 23225, ...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[6865, 22555, 1486, 4745, 13328, 27200, 17230,...","[4, 15, 15, 6, 10, 15, 14, 15, 4, 15, 5, 15, 5..."


In [7]:
#Importing train_test_split to split the training and testing data
from sklearn.model_selection import train_test_split
#Importing libraries from keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [8]:

# Split the data into training, testing, and validation sets

def get_train_test_val(data_group, datas):

   # Create a list of token indices and pad to the maximum length 
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    #getting the maximum token length and tag length
    ntoken = len(list(set(datas['Word'].to_list())))
    ntag = len(list(set(datas['Tag'].to_list())))
    
    padtokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= ntoken - 1)
    # Create a list of tag indices, pad to the maximum length, and convert to one-hot encoding
    tags = data_group['Tag_idx'].tolist()
    padtags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag_to_idx["O"])
    ntags = len(tag_to_idx)
    padtags = [to_categorical(i, num_classes=ntags) for i in padtags]
    
    #Splitting the train, test and validation set
    tokens, testtokens, tags, testtags = train_test_split(padtokens, padtags, test_size=0.1, train_size=0.9, random_state=2020)
    traintokens, valtokens, traintags, valtags = train_test_split(tokens,tags,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'length of train tokens :', len(traintokens),
        '\nlength of train tags   :', len(traintags),
        '\nlength of test tokens  :', len(testtokens),
        '\nlength of test tags    :', len(testtags),
        '\nlength of val tokens   :', len(valtokens),
        '\nlength of val tags     :', len(valtags),
    )
    
    # Print length of each set
    return traintokens, testtokens, valtokens, traintags,testtags,valtags



In [9]:
#printing the lengths of train_tokens, test_tokens, val_tokens, train_tags,test_tags,val_tags
traintokens, testtokens, valtokens, traintags,testtags,valtags= get_train_test_val(data_group, data)

length of train tokens : 32372 
length of train tags   : 32372 
length of test tokens  : 4796 
length of test tags    : 4796 
length of val tokens   : 10791 
length of val tags     : 10791


In [10]:
#Importing numpy library and tensorflow.keras library for model building.
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [11]:
#Finding the input and output dimension for Data
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])


In [12]:
#Finding the length of tag_to_idx and saving in ntags variable
ntags = len(tag_to_idx)
ntags

17

In [13]:
# Function for defining the architecture of the model
def get_bilstmlstm():
    # Selecting a Sequential model
    model = Sequential()

    # Adding an Embedding layer to the model
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    
    # Adding a bidirectional LSTM layer to the model
    # The 'Bidirectional' layer runs the LSTM layer in both forward and backward directions, effectively providing information from the past and future context
    # 'units' is the number of LSTM cells to use in the layer
    # 'return_sequences' indicates whether the LSTM layer should return the output for each time step or just the final time step
    # 'dropout' is a regularization method to prevent overfitting in the model
    # 'recurrent_dropout' is a dropout method for the recurrent connections in the LSTM cells
    # 'merge_mode' is the mode used to merge the forward and backward outputs from the LSTM layer
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Adding an LSTM layer to the model
    # This layer is similar to the bidirectional LSTM layer, but only runs in one direction
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Adding a TimeDistributed layer to the model
    # The 'TimeDistributed' layer applies a dense layer to each time step in the input
    model.add(TimeDistributed(Dense(ntags, activation="relu")))

    # Compiling the model with categorical crossentropy loss and the Adam optimizer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Displaying a summary of the model architecture
    model.summary()
    
    return model


I have chosen the Bi-LSTM model because it is a powerful deep learning architecture that can handle sequential data such as text data. This type of model is designed to handle sequences where the order of the data is important, which is the case with text data where the order of words can impact the meaning of a sentence.

Additionally, the Bi-LSTM model is designed to handle bidirectional data, meaning it can learn patterns in the data from both forward and backward sequences. This is important in NLP tasks, where information from both the previous and next words can impact the meaning of a given word.

The LSTM part of the Bi-LSTM model also helps to prevent the vanishing gradient problem, which is a common issue in deep learning models that can cause the model to stop learning over time. The LSTM part of the model can store information from earlier in the sequence, which helps the model to continue learning even when it is processing data that is far away from the start of the sequence.

Overall, the Bi-LSTM model is well-suited for NLP tasks, especially tasks that require the model to handle sequential data and to take into account both forward and backward sequences.

In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Initialize the model
model_bilstm = get_bilstmlstm()

# Adding checkpoint to save the best weights
checkpoint = ModelCheckpoint("best_weights.h5", save_best_only=True, verbose=1)

# Adding learning rate reduction
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1)

# Save the architecture of the model
model_bilstm.save("model_architecture.h5")

# Fit the model with checkpoint and reduce_lr callbacks
model_bilstm.fit(traintokens, np.array(traintags), batch_size=1000, verbose=1, epochs=5, 
                 validation_split=0.2, callbacks=[checkpoint, reduce_lr])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 64)           2251456   
                                                                 
 bidirectional (Bidirectiona  (None, 104, 128)         66048     
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed (TimeDistr  (None, 104, 17)          1105      
 ibuted)                                                         
                                                                 
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 1: val_loss improved from inf 

<keras.callbacks.History at 0x16f44505700>

In [18]:
import matplotlib.pyplot as plt
import numpy as np

# Load the best weights from training
model_bilstm.load_weights('best_weights.h5')

# Evaluation on test data
scores = model_bilstm.evaluate(testtokens, np.array(testtags), verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))


Accuracy: 96.80%
