<a href="https://www.kaggle.com/code/shinnurathod/ner-named-entiry-reco?scriptVersionId=209649268" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import matplotlib.pyplot ast plot
%matplotlib inline

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical  

import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
import pandas as pd  # Make sure pandas is imported

# Setting seeds for reproducibility
seed(1)
tensorflow.random.set_seed(2)

In [1]:
data = pd.read_csv('/kaggle/input/ner-dataset/ner_datasetreference.csv', encoding= 'unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


#### Data Preparation for Neural Networks extracting the mappings that are required to train the neural network

In [2]:
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [3]:
from itertools import islice
first_few = islice(token2idx.items(), 10)     # Get first few items

# Print first_few key-value pairs
for key, value in first_few:
    print(f"{key}: {value}")

depression: 0
Jean-Paul: 1
Anura: 2
dainty: 3
gridlock: 4
Militants: 5
impropriety: 6
dressed: 7
Valentina: 8
Nimal: 9


In [4]:
first_few = islice(idx2token.items(), 10)
for key, value in first_few:
    print(f'{key}: {value}')

0: depression
1: Jean-Paul
2: Anura
3: dainty
4: gridlock
5: Militants
6: impropriety
7: dressed
8: Valentina
9: Nimal


In [5]:
first_few = islice(tag2idx.items(), 10)
for key, value in first_few:
    print(f'{key}: {value}')

I-org: 0
I-nat: 1
I-gpe: 2
B-nat: 3
B-org: 4
I-per: 5
B-per: 6
B-art: 7
B-geo: 8
I-eve: 9


In [6]:
first_few = islice(idx2tag.items(), 10)
for key, value in first_few:
    print(f'{key}: {value}')

0: I-org
1: I-nat
2: I-gpe
3: B-nat
4: B-org
5: I-per
6: B-per
7: B-art
8: B-geo
9: I-eve


In [7]:
##### transform the columns in the data to extract the sequential data for our neural network:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(['Sentence #'],as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))

  data_fillna = data.fillna(method='ffill', axis=0)


In [8]:
data_group

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[31072, 18182, 9987, 6877, 17988, 11680, 1220,...","[12, 12, 12, 12, 12, 12, 8, 12, 12, 12, 12, 12..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[14169, 27421, 23223, 26885, 30146, 6844, 6406...","[11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[21277, 19525, 8509, 30274, 11021, 22569, 2905...","[12, 12, 10, 12, 12, 12, 12, 12, 8, 12, 12, 12..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[3511, 30870, 13493, 33424, 11975, 15454, 2899...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[28621, 18991, 12876, 7997, 21957, 5668, 19754...","[8, 12, 12, 6, 5, 12, 10, 12, 8, 12, 11, 12, 1..."
...,...,...,...,...,...,...
47954,Sentence: 9995,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[NNP, NN, NNP, NNP, NNP, VBZ, VBN, PRP, VBZ, T...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O...","[2499, 3839, 15063, 32870, 17588, 2830, 5668, ...","[12, 12, 12, 6, 5, 12, 12, 12, 12, 12, 12, 12,..."
47955,Sentence: 9996,"[On, Thursday, ,, Iranian, state, media, publi...","[IN, NNP, ,, JJ, NN, NNS, VBN, DT, NN, IN, DT,...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B...","[18555, 20040, 30981, 14169, 8386, 2620, 13639...","[12, 10, 12, 11, 12, 12, 12, 12, 12, 12, 12, 1..."
47956,Sentence: 9997,"[Following, Iran, 's, disputed, June, 12, elec...","[VBG, NNP, POS, JJ, NNP, CD, NNS, ,, NNS, NNS,...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ...","[2220, 32257, 5986, 30219, 10009, 5181, 29333,...","[12, 8, 12, 12, 10, 13, 12, 12, 12, 12, 12, 12..."
47957,Sentence: 9998,"[Since, then, ,, authorities, have, held, publ...","[IN, RB, ,, NNS, VBP, VBN, JJ, NNS, IN, DT, VB...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[17382, 21222, 30981, 1026, 6877, 27867, 28866...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."


#### split the data into training and test sets LSTM layers accept sequences of the same length only
* every sentence that appears as integer in the data must be padded with the same length:

In [13]:
def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntrain_tags:', len(train_tags),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags
# call function
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)    

train_tokens length: 32372 
test_tokens length: 4796 
train_tags: 32372 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


# Training Neural Network for Named Entity Recognition (NER)

In [18]:
# Parameters
input_dim = len(list(set(data['Word'].to_list()))) + 1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [19]:
####### our custom model architecture
def get_bilstm_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))     # Add Embedding layer
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))      # Add bidirectional LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))      # Add LSTM
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))      # Add TimeDistributed Layer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])      # Compile model
    return model

In [20]:
# Helper function to train the Named Entity Recognition model
def train_model(X, y, model):
    loss = []
    for i in range(25):
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)   # Fit model for one epoch on this sequence
        loss.append(hist.history['loss'][0])
    return loss

In [21]:
# Driver code
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
model_bilstm_lstm.build(input_shape=(None, input_length))   ## Build the model explicitly by specifying input shape
plot_model(model_bilstm_lstm, show_shapes=True, show_layer_names=True)  # Now plot the model
# plot_model(model_bilstm_lstm, show_shapes=True, show_layer_names=True, to_file='model_plot.png')

results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm) # Train the model



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 461ms/step - accuracy: 0.8132 - loss: 1.9071 - val_accuracy: 0.9681 - val_loss: 0.3802
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 403ms/step - accuracy: 0.9675 - loss: 0.4090 - val_accuracy: 0.9681 - val_loss: 0.3666
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 404ms/step - accuracy: 0.9676 - loss: 0.3606 - val_accuracy: 0.9682 - val_loss: 0.2876
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 413ms/step - accuracy: 0.9677 - loss: 0.2984 - val_accuracy: 0.9682 - val_loss: 0.2306
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 406ms/step - accuracy: 0.9678 - loss: 0.2682 - val_accuracy: 0.9685 - val_loss: 0.2393
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 404ms/step - accuracy: 0.9678 - loss: 0.2587 - val_accuracy: 0.9682 - val_loss: 0.2206
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 411ms/step - accuracy: 

# Testing the Named Entity Recognition (NER) Model:

In [1]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp('Hi, My name is Shinu \n I am from India \n I want to work with Google \n Steve Jobs is My Inspiration')
displacy.render(text, style = 'ent', jupyter=True)

In [4]:
displacy.render(text, style = 'dep', jupyter=True, options = {'distance': 120, 'bg': 'green'}) # dependency sparse tree

In [39]:
from tensorflow.keras.models import save_model, load_model
# Save the model and mappings
model_bilstm_lstm.save('ner_model.h5')
with open('token_mappings.pkl', 'wb') as file:
    pickle.dump({'token2idx': token2idx, 'idx2tag': idx2tag}, file)

# Load the model and mappings
with open('token_mappings.pkl', 'rb') as file:
    mappings = pickle.load(file)

token2idx = mappings['token2idx']
idx2tag = mappings['idx2tag']
loaded_model = load_model('ner_model.h5')

In [40]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_text(text, token2idx, maxlen):
    # Tokenize the text
    tokens = text.split()  # Split the text into words
    
    # Map tokens to indices; use 'UNKNOWN' if a token is not in the vocabulary
    token_indices = [token2idx.get(token, token2idx['UNKNOWN']) for token in tokens]

    # Pad the sequence
    padded_tokens = pad_sequences([token_indices], maxlen=maxlen, padding='post', value=token2idx['UNKNOWN'])
    
    return tokens, padded_tokens


In [43]:
def test_ner_model(text, model, token2idx, idx2tag, maxlen):
    # Tokenize the input text
    tokens = text.split()
    
    # Dynamically add new tokens to token2idx with a unique index
    for token in tokens:
        if token not in token2idx:
            token2idx[token] = len(token2idx) + 1  # Assign a new index
    
    # Map tokens to indices
    token_indices = [token2idx[token] for token in tokens]

    # Pad the sequence
    padded_tokens = pad_sequences([token_indices], maxlen=maxlen, padding='post', value=0)  # Use 0 for padding

    # Predict tags using the model
    predictions = model.predict(padded_tokens)

    # Get the predicted tags
    tag_indices = predictions.argmax(axis=-1)[0]
    predicted_tags = [idx2tag[idx] for idx in tag_indices]

    # Return tokens with predicted tags
    return list(zip(tokens, predicted_tags))

In [59]:
# Example new sample text
sample_text = "hi my name is Shinu Rathod INC, testla,india country USA, "

# Maximum input length used during training
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])  # Or use the saved length

# Get predictions
results = test_ner_model(sample_text, loaded_model, token2idx, idx2tag, input_length)

# Print the results
for token, tag in results:
    print(f"{token}: {tag}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
hi: O
my: O
name: O
is: O
Shinu: O
Rathod: O
INC,: O
testla,india: O
country: O
USA,: O
