**Named Entity** means anything that is a real-world object such as a person, a place, any organisation, any product which has a name. For example – “My name is Aman, and I and a Machine Learning Trainer”. In this sentence the name “Aman”, the field or subject “Machine Learning” and the profession “Trainer” are named entities.

In [1]:
!wget https://github.com/amankharwal/Website-data/blob/master/ner_dataset.csv

--2023-09-19 17:53:28--  https://github.com/amankharwal/Website-data/blob/master/ner_dataset.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14573 (14K) [text/plain]
Saving to: ‘ner_dataset.csv.1’


2023-09-19 17:53:28 (27.1 MB/s) - ‘ner_dataset.csv.1’ saved [14573/14573]



In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('/content/ner_dataset.csv', encoding='unicode_escape')
data.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72706 entries, 0 to 72705
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence #  3300 non-null   object
 1   Word        72706 non-null  object
 2   POS         72706 non-null  object
 3   Tag         72706 non-null  object
dtypes: object(4)
memory usage: 2.2+ MB


**Data Preparation for Neural Networks**

In [5]:
vocab = list(set(data['Word'].to_list()))

In [6]:
vocab[0:5]

['contracting', 'highly', 'pollution', 'insight', 'Chung']

In [7]:
from itertools import chain

In [8]:
def get_dict_map(data,token_or_tag):
  tok2idx = {}
  idx2tok = {}

  if token_or_tag == 'token':
    vocab = list(set(data['Word']))
  else:
    vocab = list(set(data['Tag']))

  idx2tok = { id:token for id, token in enumerate(vocab)}
  tok2idx = { token:id for id, token in enumerate(vocab)}
  return tok2idx,idx2tok

In [9]:
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [10]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data_fillna = data.fillna(method='ffill', axis=0)
data_fillna.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,6921,5
1,Sentence: 1,of,IN,O,7770,5
2,Sentence: 1,demonstrators,NNS,O,436,5
3,Sentence: 1,have,VBP,O,6938,5
4,Sentence: 1,marched,VBN,O,5402,5


In [11]:
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(


In [12]:
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(


In [13]:
data_group.head()


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[6921, 7770, 436, 6938, 5402, 6909, 2157, 8968...","[5, 5, 5, 5, 5, 5, 16, 5, 5, 5, 5, 5, 16, 5, 5..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[8773, 1525, 882, 5200, 1361, 8968, 7239, 1074...","[14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[6114, 4780, 4510, 5974, 3990, 4985, 6412, 337...","[5, 5, 0, 5, 5, 5, 5, 5, 16, 5, 5, 5, 5, 5, 8,..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[8892, 8597, 3267, 4216, 8884, 8297, 7663, 221...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]"
4,Sentence: 1001,"[The, global, financial, crisis, has, left, Ic...","[DT, JJ, JJ, NN, VBZ, VBN, NNP, POS, NN, IN, N...","[O, O, O, O, O, O, B-org, O, O, O, O, O]","[8561, 4090, 1890, 2334, 7131, 8597, 4113, 532...","[5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5]"


In [14]:
data_group["Word"][0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [15]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [16]:
def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'])))
    n_tag = len(list(set(data['Tag'])))

    #Pad tokens (X var)
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

In [17]:
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)


train_tokens length: 2227 
train_tokens length: 2227 
test_tokens length: 330 
test_tags: 330 
val_tokens: 743 
val_tags: 743


**Training Neural Network for Named Entity Recognition (NER)**

In [18]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [19]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [20]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [21]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [22]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 64)            589888    
                                                                 
 bidirectional (Bidirection  (None, 70, 128)           66048     
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 70, 64)            49408     
                                                                 
 time_distributed (TimeDist  (None, 70, 17)            1105      
 ributed)                                                        
                                                                 
Total params: 706449 (2.69 MB)
Trainable params: 706449 (2.69 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp("""
Hi, I am Md. Foysal Sheikh.
I am a CSE student.
I studied at Daffodil Internation University.
""")
displacy.render(text, style = 'ent', jupyter=True)