<a href="https://www.kaggle.com/code/abhijitbhandari/lstm-based-named-entity-recognition?scriptVersionId=143374564" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# !pip install chardet

In [None]:
# import chardet
# with open("./ner_dataset.csv", "rb") as f:
#     byte_data = f.read(100000) #Read the first 100000 bytes



In [None]:
# result = chardet.detect(byte_data)
# encoding = result["encoding"]
##encoding is 'Windows-1252'

## Reading the csv file

In [None]:
# Load the annotated corpus; the file is decoded as Windows-1252 rather than UTF-8.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/entity-annotated-corpus/ner_dataset.csv",
    encoding="Windows-1252",
)
df.head()

In [None]:
# Show the distinct values of the "Sentence #" column exactly as loaded.
df["Sentence #"].unique()

In [None]:
# Bar chart of how many missing values each column contains.
missing_per_column = df.isnull().sum()
plt.figure(figsize=(4, 3))
missing_per_column.plot(kind="bar")
plt.xlabel("Columns")
plt.ylabel("Total Null Values")

In [None]:
# Forward-fill every column down the rows: each missing cell takes the most
# recent non-null value above it. `DataFrame.ffill` is the supported spelling;
# `fillna(method="ffill")` is deprecated in recent pandas and fills identically.
df.ffill(axis=0, inplace=True)

In [None]:
# Show the distinct "Sentence #" values again, now that the forward-fill has run.
df["Sentence #"].unique()

## Understanding the Data

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
## Removing any row whose word contains a number
# The group is non-capturing because `str.contains` warns when the pattern has
# capture groups (it only tests for a match and never extracts them).
# `na=False` treats a missing word as "no number", so the mask stays purely
# boolean and such rows are kept rather than breaking the negation below.
has_number = df["Word"].str.contains(r"\d+(?:\.\d+)?", na=False)
df = df[~has_number].reset_index(drop=True)

In [None]:
# Number of distinct values in each column, drawn as an annotated bar chart.
unique_counts = df.nunique()

plt.figure(figsize=(7, 5))
ax = unique_counts.plot(kind="bar")
plt.xlabel("Column Name")
plt.ylabel("Total Number of Unique Items")
plt.xticks(rotation=45)
plt.tight_layout()

# Write each count just above its bar.
for position, count in enumerate(unique_counts):
    ax.text(position, count + 0.2, str(count), ha="center", va="bottom")

plt.show()

So, there are 42 parts of speech and 17 tags

In [None]:
## Understanding the distribution of tags
# Count the words carrying each tag, then draw the counts as bars.
words_per_tag = df.groupby(["Tag"])["Word"].count()
words_per_tag.plot(kind="bar")

The tag classes are highly imbalanced

## Data Wrangling

In [None]:
# Distinct surface forms in the corpus. Sorted so that the vocabulary order, and
# therefore every index derived from a word's position in this list, is the same
# on every run; iterating a bare set of strings gives an order that can differ
# between interpreter sessions.
words = sorted(set(df["Word"].values))
words[:5]

In [None]:
# Lower-case the vocabulary and de-duplicate again: forms that differ only by
# case collapse to the same string, and keeping both copies would inflate
# len(words) and leave list positions that no word maps to once positions are
# turned into ids. Sorted so the resulting order is reproducible.
words = sorted({x.lower() for x in words})
words[:5]

In [None]:
## Append eos (end of sentence)
# The marker is added in place as the last vocabulary entry.
words += ["eos"]

In [None]:
# Vocabulary size = number of entries in `words` (including the appended "eos").
vocab_size = len(words)
vocab_size

In [None]:
# Distinct tag labels, lower-cased. De-duplicated after lower-casing and sorted so
# that each tag's position in the list (later used as its class index) is stable
# across runs; a bare set of strings has no reproducible iteration order, which
# would scramble the tag <-> index mapping whenever a saved model is reloaded in
# a fresh session.
tags = sorted({t.lower() for t in df["Tag"].values})
tags[:5]

In [None]:
# Number of entries in the tag list; used as the width of the output layer.
num_tags = len(tags)

**Now, we try to group the data as (word, pos, tag) for every sentence.**

In [None]:
# Show the column labels of the dataframe.
df.columns

In [None]:
class sentenceinfo(object):
    """
    Arrange the data as [[(word, pos, tag), ...], ...], with one inner list for
    every sentence present in the dataframe.

    Rows are grouped on the "Sentence #" column; the word, POS and tag of every
    row are lower-cased.

    Attributes:
        n_sent: 1-based position of the sentence the next ``get_next()`` call
            will return.
        data: the dataframe passed to the constructor.
        grouped: result of the group-by, one list of triples per sentence id.
        sentences: the same lists as a plain Python list, in group order.

    Raises:
        ValueError: if any of the columns "Sentence #", "Word", "POS", "Tag"
            is missing from ``data``.
    """

    REQUIRED_COLUMNS = ("Sentence #", "Word", "POS", "Tag")

    def __init__(self, data: pd.DataFrame):
        # Explicit check instead of `assert`: the previous assertion tested the
        # truthiness of column names and therefore could never fail.
        missing = [col for col in self.REQUIRED_COLUMNS if col not in data.columns]
        if missing:
            raise ValueError(
                "Check the column or change the column names as Sentence #, Word, POS, Tag"
                f" (missing: {missing})"
            )
        self.n_sent = 1
        self.data = data
        # Select the value columns explicitly so the aggregation never depends on
        # whether the grouping column is passed through to the applied function.
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "POS", "Tag"]].apply(self._to_triples)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(rows):
        """Return the rows of one sentence as lower-cased (word, pos, tag) tuples."""
        return [
            (w.lower(), p.lower(), t.lower())
            for w, p, t in zip(
                rows["Word"].tolist(), rows["POS"].tolist(), rows["Tag"].tolist()
            )
        ]

    def get_next(self):
        """
        Return the next sentence and advance the cursor, or None once every
        sentence has been returned.
        """
        # n_sent is 1-based, so the matching list position is n_sent - 1; this
        # makes the first call return the first sentence.
        position = self.n_sent - 1
        if not 0 <= position < len(self.sentences):
            return None
        self.n_sent += 1
        return self.sentences[position]
            


        

In [None]:
# Group the dataframe into sentences; each sentence is a list of
# (word, pos, tag) triples.
getter = sentenceinfo(df)
sentences = getter.sentences

#### Define Mapping B/w Sentences and Tags

In [None]:
# The position of each word / tag in its list becomes its integer id.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

### Train - Test Data Preparation

In order to train the model, we need to feed it sentences of equal length. So, we first figure out the maximum sentence length.

In [None]:
# Histogram of the number of tokens per sentence.
sentence_lengths = list(map(len, sentences))
plt.figure(figsize=(5, 3))
plt.hist(sentence_lengths, bins=50)
plt.xlabel("Length of Sentence")
plt.ylabel("Frequency")
plt.tight_layout()

Here, we can see that most of the sentences are of length 20-22, and the longest sentence has about 60 words.
So, we take the maximum length as 60.

In [None]:
## Padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 60

# Every sentence is a list of (word, pos, tag) triples. Encode the word
# (element 0) and the tag (element 2) of each triple with the lookup tables.
X, y = [], []
for sentence in sentences:
    X.append([word2idx[triple[0]] for triple in sentence])
    y.append([tag2idx[triple[2]] for triple in sentence])

In [None]:
# Right-pad every sequence to max_len: word ids are padded with 0, tag ids with
# the id of the "o" tag.
# NOTE(review): id 0 is also assigned to a real vocabulary entry in word2idx, so
# padding cannot be told apart from that word - confirm this is intended.
X = pad_sequences(X, maxlen=max_len, padding="post", value=0)
y = pad_sequences(y, maxlen=max_len, padding="post", value=tag2idx["o"])
for padded in (X, y):
    print(padded.shape)

In [None]:
## Splitting the data for training and testing
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
# First ten tag ids of the first padded sentence.
y[0][:10]

#### Generating the Embedding Matrix Using Glove Vectors for the words in the corpus

In [None]:
embedding_dim = 100
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))
glove_embedded_words = {}

# Each line of the GloVe file holds a token followed by its vector components,
# separated by whitespace.
with open("/kaggle/input/embedding-vector-glove100d/glove.6B.100d_set.txt", encoding="utf-8") as glove_file:
    for row in glove_file:
        fields = row.split()
        glove_embedded_words[fields[0]] = np.array(fields[1:], dtype="float32")

In [None]:
# Fill one row of the embedding matrix per vocabulary entry:
#   - "eos" gets a constant vector of -1,
#   - any other word gets its GloVe vector, or all zeros when GloVe lacks it.
for word, idx in word2idx.items():
    embedding_matrix[idx] = (
        np.full(embedding_dim, -1)
        if word == "eos"
        else glove_embedded_words.get(word, np.zeros(embedding_dim))
    )

### Model Development

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, TimeDistributed, SpatialDropout1D, Dropout, Input

In [None]:
# Frozen pre-trained embeddings -> spatial dropout -> bidirectional LSTM that
# emits one vector per time step -> per-time-step softmax over the tags.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
    ),
    SpatialDropout1D(0.15),
    Bidirectional(LSTM(units=150, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(num_tags, activation="softmax")),
])
model.summary()

In [None]:
# Adam optimizer with learning rate 0.001 and gradient clipping (clipvalue 0.5).
opt = tf.keras.optimizers.Adam(clipvalue = 0.5, learning_rate= 0.001)

In [None]:
# Sparse categorical cross-entropy: the targets are integer tag ids, not one-hot vectors.
model.compile(optimizer=opt,loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [None]:
# Lower the learning rate (never below 1e-4) after 3 epochs without improvement,
# and stop after 4 such epochs, restoring the best weights seen.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(patience=3, verbose=1, min_lr=1e-4)
early_stopper = tf.keras.callbacks.EarlyStopping(
    patience=4, verbose=1, restore_best_weights=True
)
callbacks = [lr_scheduler, early_stopper]

In [None]:
batch_size = 32
epochs = 20
# Validate on a slice held out from the TRAINING data rather than on the test
# set. The callbacks stop training, restore the best weights and adjust the
# learning rate based on the validation metrics; feeding them the test set
# would tune the model on the very data used for the final evaluation and make
# the reported test scores optimistic.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=0.1,
)

In [None]:
import matplotlib.pyplot as plt

# Access the per-epoch training history from the 'history' object.
training_loss = history.history['loss']
validation_loss = history.history['val_loss']
training_accuracy = history.history['accuracy']
validation_accuracy = history.history['val_accuracy']

# x-axis values: one per recorded epoch. Kept under its own name so the `epochs`
# hyper-parameter used for training is not overwritten by a range object.
epoch_numbers = range(1, len(training_loss) + 1)

# Left panel: loss; right panel: accuracy. Training in blue, validation in red.
panels = [
    ('Loss', training_loss, validation_loss),
    ('Accuracy', training_accuracy, validation_accuracy),
]

plt.figure(figsize=(12, 6))
for panel_index, (metric_name, train_values, val_values) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(epoch_numbers, train_values, 'b', label=f'Training {metric_name}')
    plt.plot(epoch_numbers, val_values, 'r', label=f'Validation {metric_name}')
    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Run the trained model on the test inputs in batches of 32.
y_predicted = model.predict(X_test, batch_size=32)

In [None]:
# Reduce the last axis to the index of its largest score (the predicted tag id).
y_predicted = y_predicted.argmax(axis=-1)

In [None]:
# True tag ids of test sentence 3, for comparison with the prediction below.
y_test[3]

In [None]:
# Predicted tag ids of the same test sentence.
y_predicted[3]

In [None]:
## Comparing Predicted with True with metrics
# Collapse both arrays to 1-D so every token position becomes one sample.
y_test_flat, y_predicted_flat = y_test.flatten(), y_predicted.flatten()

In [None]:
from sklearn.metrics import classification_report

# Label every row of the report with its tag name instead of a bare class index,
# so the table stays readable whatever index each tag was assigned.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
label_ids = sorted(idx2tag)
# zero_division=0 reports 0 (without warnings) for tags that never occur or are
# never predicted in the test set.
# NOTE(review): the flattened arrays still include the padded positions, which
# carry the "o" tag and therefore count towards its scores.
report = classification_report(
    y_test_flat,
    y_predicted_flat,
    labels=label_ids,
    target_names=[idx2tag[idx] for idx in label_ids],
    zero_division=0,
)
print(report)

In [None]:
# Save the trained model to ./ner.h5.
# NOTE(review): word2idx / tag2idx are not saved with it, yet the cells below
# need them to encode input and decode predictions.
model.save(filepath = "./ner.h5")

#### Loading the Model

In [None]:
# Load through tensorflow.keras, the same package that built and saved the model
# in this file; mixing it with the standalone `keras` package can pick up a
# different Keras implementation than the one that wrote the file.
from tensorflow.keras.models import load_model
model_ner = load_model("./ner.h5")

In [None]:
## Testing for a random sentence

# Example sentence, lower-cased to match the lower-cased vocabulary.
sent = (
    "A handsome man named Ram lived in the house with his beautiful wife Sita in a country called India"
).lower()

In [None]:
# Whitespace-tokenise the sentence and add the "eos" marker as the last token.
random_words = sent.split() + ["eos"]

In [None]:
# Check the whole sentence against the vocabulary first, so the error names every
# unknown word at once instead of stopping at the first failed lookup.
unknown_words = [w for w in random_words if w not in word2idx]
if unknown_words:
    raise KeyError(f"Words not in the training vocabulary: {unknown_words}")

indexed = [[word2idx[w] for w in random_words]]
# Pad to the same length the model was trained with (max_len) rather than a
# repeated literal, so the two cannot drift apart.
X_random = pad_sequences(indexed, maxlen=max_len, padding="post", value=0)

In [None]:
# Predict with the reloaded model and keep the highest-scoring tag id per position.
y_random = np.argmax(model_ner.predict(X_random), axis = -1)

In [None]:
# Invert tag2idx once, instead of scanning the whole mapping for every position.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Decode only the real words: the last token of random_words is the "eos" marker
# and everything after it is padding. A direct lookup raises KeyError for an id
# that has no tag, where the scan would silently skip it and shift all later
# tags out of alignment with their words.
n_real_words = len(random_words) - 1
tags_random = [idx2tag[tag_id] for tag_id in y_random[0][:n_real_words]]

In [None]:
# Pair each word (excluding the trailing "eos" marker) with its predicted tag.
random_df = pd.DataFrame({"Words": random_words[:-1], "Tags": tags_random})

In [None]:
# Display the word / predicted-tag table.
random_df