In [None]:
# Project Title : NAMED ENTITY RECOGNITION
# Dataset = NER_dataset

In [None]:
# Data Gathering
import pandas as pd
# Read the token-level NER corpus from the CSV file; the encoding is passed
# explicitly as latin1.
data = pd.read_csv('/content/ner_dataset.csv', encoding='latin1')
# Preview the first rows (one row per word, see the output below).
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
# Data Preparation
# itertools provides functions for working with sequential data
# chain function is used to combine multiple iterators into a single one
from itertools import chain
def get_dict_map(data, token_or_tag):
    """Build lookup tables between vocabulary items and integer ids.

    Args:
        data: DataFrame with 'Word' and 'Tag' columns.
        token_or_tag: 'token' indexes the 'Word' column; any other value
            indexes the 'Tag' column.

    Returns:
        A pair ``(item_to_idx, idx_to_item)`` of dictionaries. Ids run from
        0 to ``len(vocab) - 1`` and follow the sorted order of the items.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # Sort the unique values so the ids are identical on every run: the
    # iteration order of a plain set of strings depends on per-process hash
    # randomisation, which would silently change the mapping between kernels.
    # key=str keeps the sort well-defined if the column holds non-string
    # values (e.g. NaN for missing cells).
    vocab = sorted(set(data[column].to_list()), key=str)

    idx_tokens = dict(enumerate(vocab))  # index -> item
    tokens_idx = {item: idx for idx, item in idx_tokens.items()}  # item -> index
    return tokens_idx, idx_tokens

# Word <-> id lookup tables built from the 'Word' column.
token_idx, idx_token = get_dict_map(data, 'token')
# Tag <-> id lookup tables built from the 'Tag' column.
tag_idx, idx_tag = get_dict_map(data, 'tag')

In [None]:
# Extracting the sequential data
# Attach the integer id of every word and tag, using the lookup tables
# returned by get_dict_map.
data['Token_idx'] = data['Word'].map(token_idx)
data['Tag_idx'] = data['Tag'].map(tag_idx)

# Forward-fill missing cells down the rows. In the preview above, 'Sentence #'
# is only populated on the first word of a sentence, so this propagates the
# sentence label to the rest of its words.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated.
data_fillna = data.ffill(axis=0)

# Grouped & Extracted the sequences: one row per sentence, each column
# collected into a list in row order.
sequence_columns = ['Word', 'POS', 'Tag', 'Token_idx', 'Tag_idx']
data_group = data_fillna.groupby('Sentence #', as_index=False).agg(
    {column: list for column in sequence_columns}
)

In [None]:
# Splitting the training & testing data
# [LSTM layers accept sequences of the same length]
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def split_pad_train_test_val(data_group, data):
    """Pad the grouped sequences and split them into train / validation / test.

    Args:
        data_group: per-sentence frame with 'Token_idx' and 'Tag_idx' columns
            holding one list of ids per sentence.
        data: token-level frame; only its 'Word' column is read, to count the
            distinct words.

    Returns:
        ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags,
        test_tags)``. 10% of the sentences are held out for testing and 25% of
        the remainder for validation.

    Also reads the module-level ``tag_idx`` mapping (id of the 'O' tag and the
    number of tags) and prints the size of the resulting splits.
    """
    n_token = len(set(data['Word'].to_list()))

    # Padding the word sequences on the right, up to the longest sentence.
    # NOTE(review): the filler id n_token - 1 is also the id of a real word when
    # ids run 0..n_token-1 (as get_dict_map assigns them) — confirm whether a
    # dedicated padding id was intended before changing it; the inference cell
    # must use the same value.
    tokens = data_group['Token_idx'].tolist()
    maxlen = max(len(sentence) for sentence in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token - 1)

    # Pad the tag sequences with the 'O' id, then one-hot encode each sentence.
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag_idx["O"])
    n_tags = len(tag_idx)
    pad_tags = [to_categorical(sentence, num_classes=n_tags) for sentence in pad_tags]

    # Splitting the padded token and tag sequences: first carve off the test
    # set, then split what is left into train and validation.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020
    )
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020
    )

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

# Build the padded train / validation / test arrays used by the cells below.
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = split_pad_train_test_val(data_group, data)

train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [None]:
# Training the Bi-LSTM Model
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
# Fix the NumPy and TensorFlow random seeds before any model is built.
seed(1)
tensorflow.random.set_seed(2)

In [None]:
# Embedding input size: number of distinct words plus one extra index.
input_dim = len(set(data['Word'].to_list())) + 1
# Output space dimensionality (used for the embedding and as the LSTM unit count).
output_dim = 64
# Length of the longest sentence, in tokens.
input_length = max(len(sentence) for sentence in data_group['Token_idx'])
# Number of distinct tags.
n_tags = len(tag_idx)

In [None]:
# helper function that builds and compiles the Bi-LSTM NER model and prints a summary of every layer

def get_bilstm__model():
    """Build, compile and summarise the Bi-LSTM tagging model.

    Reads the module-level settings ``input_dim``, ``output_dim``,
    ``input_length`` and ``n_tags``.

    Returns:
        The compiled Sequential model (its layer summary is printed as a side
        effect).
    """
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM; forward and backward outputs are concatenated
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer: one dense classifier applied at every position.
    # softmax (not relu) so each position yields a probability distribution over
    # the tags, which is what categorical_crossentropy expects against the
    # one-hot targets.
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
# helper function that would train the NER model
def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit the model and return its training loss for every epoch.

    Args:
        X: training inputs passed to ``model.fit``.
        y: training targets passed to ``model.fit``.
        model: object exposing a Keras-style ``fit`` method.
        epochs: number of epochs to train for (default 25, the value that was
            previously hard-coded).
        batch_size: batch size forwarded to ``fit`` (default 1000).
        validation_split: share of the data ``fit`` holds out for validation
            (default 0.2).

    Returns:
        A list with one training-loss value per epoch.
    """
    # A single fit call over all epochs replaces re-entering fit once per epoch;
    # the returned history already records the loss of every epoch.
    hist = model.fit(
        X,
        y,
        batch_size=batch_size,
        verbose=1,
        epochs=epochs,
        validation_split=validation_split,
    )
    return list(hist.history.get('loss', []))

In [None]:
# Collects one column of per-epoch training losses per experiment.
results = pd.DataFrame()
# Build the model (this also prints the layer summary shown below).
model_bilstm = get_bilstm__model()
plot_model(model_bilstm)
# Train on the training split and store the returned losses.
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 64)           2251456   
                                                                 
 bidirectional (Bidirection  (None, 104, 128)          66048     
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed (TimeDist  (None, 104, 17)           1105      
 ributed)                                                        
                                                                 
Total params: 2368017 (9.03 MB)
Trainable params: 2368017 (9.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Testing the trained Bi-LSTM model on a sample sentence

def predict_entities(model, tokens):
    """Predict a tag name for every position of every input sequence.

    Args:
        model: object whose ``predict`` returns per-position class scores
            (assumed shape: sentences x positions x classes — TODO confirm).
        tokens: batch of padded token-id sequences.

    Returns:
        A list with one list of tag names per input sequence, looked up in the
        module-level ``idx_tag`` mapping.
    """
    scores = model.predict(tokens)
    # Highest-scoring class id at each position of each sentence.
    best_ids = np.argmax(scores, axis=-1)
    return [[idx_tag[class_id] for class_id in sentence_ids] for sentence_ids in best_ids]


input_sentence = "My name is Simar Katyal.\nI work at Google.\nI drive Mercedes, G-Wagon."
# Tokenization: split on whitespace; words missing from the vocabulary fall back to id 0
input_tokens = [[token_idx.get(word, 0) for word in input_sentence.split()]]
# Pad input tokens to the training sequence length with the same filler id the
# training data was padded with. `maxlen` and `n_token` are locals of
# split_pad_train_test_val and do not exist at notebook level, so the
# module-level equivalents are used instead.
input_tokens_padded = pad_sequences(
    input_tokens,
    maxlen=input_length,
    dtype='int32',
    padding='post',
    value=len(token_idx) - 1,
)
# Predict named entity tags (one list of tags per input sentence)
predicted_tags = predict_entities(model_bilstm, input_tokens_padded)

In [None]:
# Decode named entities from predicted tags
def decode_named_entities(sentence, tags):
    """Group B-/I- tagged words of a sentence into named entities.

    Args:
        sentence: text that is split on whitespace into words.
        tags: one tag string per word; extra words or tags beyond the shorter
            of the two are ignored.

    Returns:
        A list of ``{'entity': <label>, 'words': [<word>, ...]}`` dictionaries
        in order of appearance. A 'B-' tag opens an entity, following 'I-' tags
        extend the open entity, and any other tag closes it. An 'I-' tag with
        no open entity is skipped.
    """
    entities = []
    open_entity = None

    for word, tag in zip(sentence.split(), tags):
        if tag.startswith('I-'):
            # Continuation: only meaningful while an entity is open.
            if open_entity is not None:
                open_entity['words'].append(word)
            continue

        # Every other tag ends whatever entity was being collected ...
        if open_entity is not None:
            entities.append(open_entity)
            open_entity = None

        # ... and a 'B-' tag immediately starts the next one.
        if tag.startswith('B-'):
            open_entity = {'entity': tag[2:], 'words': [word]}

    # Flush an entity that runs to the end of the sentence.
    if open_entity is not None:
        entities.append(open_entity)

    return entities

# Decode named entities from predicted tags.
# predict_entities returns one tag list per input sequence; a single sentence
# was submitted, so its tags are the first (and only) entry. Passing the whole
# batch would pair the first word with a list instead of a tag string.
named_entities = decode_named_entities(input_sentence, predicted_tags[0])
# Print decoded named entities
for entity in named_entities:
    print(f"{entity['entity']}: {' '.join(entity['words'])}")

In [None]:
# Testing the model using Spacy
import spacy
from spacy import displacy
# Load the spaCy pipeline named 'en_core_web_sm'.
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over a sample text.
text = nlp('Hi, I am Simar Katyal.\nI want to work at Google or Microsoft.\nI want to drive Mercedes G-Wagon.')
# Render the detected entities inside the notebook.
displacy.render(text, style = 'ent', jupyter=True)

In [None]:
# Performance Evaluation
from sklearn.metrics import classification_report

def evaluate_model(model, tokens, true_tags):
    """Score the model's per-token predictions on the entity tags.

    Args:
        model: object whose ``predict`` returns per-position class scores.
        tokens: batch of padded token-id sequences.
        true_tags: one-hot encoded reference tags matching ``tokens``.

    Returns:
        The ``classification_report`` dictionary, restricted to the tags other
        than 'O'.

    Reads the module-level ``tag_idx`` mapping. Positions whose reference tag
    is 'O' are excluded: padding is labelled 'O' too, so padded positions and
    genuine 'O' tokens cannot be told apart here.
    """
    # Predict tags for tokens
    predicted_tags = model.predict(tokens)
    # Flatten true and predicted tags into 1-D arrays of class ids
    true_tags_flat = np.argmax(true_tags, axis=-1).flatten()
    predicted_tags_flat = np.argmax(predicted_tags, axis=-1).flatten()
    # Ignore padding (and every other position whose reference tag is 'O')
    mask = true_tags_flat != tag_idx['O']
    true_tags_flat = true_tags_flat[mask]
    predicted_tags_flat = predicted_tags_flat[mask]

    # Name the reported classes explicitly instead of assuming 'O' has id 0 and
    # slicing it off the key list: the ids and their names are paired here, and
    # the report no longer breaks when the model predicts 'O' (or misses a
    # class) on the kept positions.
    id_to_tag = {idx: tag for tag, idx in tag_idx.items()}
    entity_ids = sorted(idx for idx, tag in id_to_tag.items() if tag != 'O')
    entity_names = [id_to_tag[idx] for idx in entity_ids]

    # Calculate evaluation metrics
    report = classification_report(
        true_tags_flat,
        predicted_tags_flat,
        labels=entity_ids,
        target_names=entity_names,
        output_dict=True,
        zero_division=0,
    )
    return report

# Evaluate the model
# Score the trained model on the validation split.
evaluation_report = evaluate_model(model_bilstm, val_tokens, np.array(val_tags))
print("Evaluation Report:")
# The report is a dictionary, printed here as plain text.
print(evaluation_report)