In [199]:
import pandas as pd

In [200]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/data_ner.csv")

In [201]:
data = data.iloc[:-1]

In [202]:
fact= {
'O' : "Outside", 
'B-geo' : "Geographical Entity",
'B-gpe' : "Geopolitical Entity", 
'B-per' : "PERSON",
'I-geo' : "Geographical Entity",                  # B- Begining 
'B-org' : "ORGANIZATION",                         # I - Inside
'I-org' : "ORGANIZATION", 
'B-tim' : "TIME",
'B-art' : "Artifact", 
'I-art' : "Artifact", 
'I-per' : "PERSON", 
'I-gpe' : "Geopolitical Entity", 
'I-tim' : "TIME", 
'B-nat' : "Natural Phenomenon", 
'B-eve' : "Event",
'I-eve' : "Event",
'I-nat' : "Natural Phenomenon"
 }

In [203]:
data.groupby(by="Tag")["Word"].agg(lambda x: list(x))

Tag
B-art    [Nuclear, Saltillo, Pentastar, Chrysler, Dodge...
B-eve    [2012, Games, Games, 2008, Operation, Gulf, Au...
B-geo    [London, Iraq, Hyde, Britain, Brighton, Iraq, ...
B-gpe    [British, English, Britain, British, Iran, Ira...
B-nat    [H5N1, H5N1, Jing, Jing, H5N1, SARS, Severe, H...
B-org    [Labor, International, IAEA, European, U.N., B...
B-per    [Bush, President, Thomas, President, Prophet, ...
B-tim    [Wednesday, Wednesday, Tuesday, Wednesday, Wed...
I-art    [Non-Proliferation, V-6, Simple, Life, Morning...
I-eve    [Summer, Olympics, Olympic, Medusa, War, Open,...
I-geo    [Park, State, State, Delta, Arab, West, Fronti...
I-gpe    [States, Korea, Binh, Ababa, City, Lanka, Kore...
I-nat    [Jing, Jing, Acute, Respiratory, Syndrome, Kat...
I-org    [Party, Atomic, Energy, Agency, Union, Securit...
I-per    [Mahmoud, Ahmadinejad, Horbach, Abdullahi, Yus...
I-tim    [8, 1, 2, 3, of, 2005, 25, ,, 1995, 7, 17, 200...
O        [Thousands, of, demonstrators, have, marche

In [204]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [205]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  import sys


In [206]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 1176 
train_tokens length: 1176 
test_tokens length: 175 
test_tags: 175 
val_tokens: 393 
val_tags: 393


In [207]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [208]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [110]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [111]:
def train_model(X, y, model):
    loss = list()
    for i in range(30):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=64, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [112]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 62, 64)            412800    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 62, 128)           66048     
_________________________________________________________________
lstm_9 (LSTM)                (None, 62, 64)            49408     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 62, 17)            1105      
Total params: 529,361
Trainable params: 529,361
Non-trainable params: 0
_________________________________________________________________


In [114]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [196]:
testing = 'John Lee is the chief of CBSE.'
testing = nltk.word_tokenize(testing)
for i in range(len(testing)):
  try : 
    testing[i] = token2idx[testing[i]]
  except :
    testing[i] = len(token2idx)-1

a = pad_sequences([testing], maxlen=62, dtype='int32', padding='post', value= 6448)

prediction = model_bilstm_lstm.predict(a)
pred = tensorflow.argmax(prediction, 2)
for i in range(len(pred.numpy()[0])):
  if a[0][i] != len(token2idx)-1:
    print(idx2token[a[0][i]], idx2tag[pred.numpy()[0][i]])

John B-per
Lee I-per
is O
the O
chief O
of O
. O


In [197]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp('John Lee is the chief of CBSE.')
displacy.render(text, style = 'ent', jupyter=True)