In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/ner_dataset.csv', encoding='unicode_escape')

In [3]:
print(data.head())

    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [4]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    token2index = {}
    index2token = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
        
    index2token = {idx: tok for idx, tok in enumerate(vocab)}
    token2index = {tok: idx for idx, tok in enumerate(vocab)}
    
    return token2index, index2token

In [5]:
token2index, index2token = get_dict_map(data, 'token')
tag2index, index2tag = get_dict_map(data, 'tag')

In [6]:
data['word_idx'] = data['Word'].map(token2index)
data['tag_idx'] = data['Tag'].map(tag2index)

In [7]:
data_fillna = data.fillna(method='ffill', axis=0)
data_group = data_fillna.groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'word_idx', 'tag_idx']].agg(lambda x: list(x))

  data_fillna = data.fillna(method='ffill', axis=0)


In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [9]:
def get_pad_train_test_val(data_group, data):
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    tokens = data_group['word_idx'].to_list()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token-1)

    tags = data_group['tag_idx'].to_list()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2index['O'])
    n_tags = len(tag2index)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_, tags_, test_size=0.25, random_state=2020)

    print(
        f'''train_tokens length: {len(train_tokens)}
        \ntrain_tags length: {len(train_tags)}
        \ntest_tokens length: {len(test_tokens)}
        \ntest_tags length: {len(test_tags)}
        \nval_tokens length: {len(val_tokens)}
        \nval_tags length: {len(val_tags)}'''
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

In [10]:
train_tokens, test_tokens, val_tokens, train_tags, test_tags, val_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372
        
train_tags length: 32372
        
test_tokens length: 4796
        
test_tags length: 4796
        
val_tokens length: 10791
        
val_tags length: 10791


In [11]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [12]:
input_dim = len(list(set(data['Word'].to_list()))) + 1
output_dim = 64
input_len = max([len(s) for s in data_group['word_idx'].tolist()])
n_tags = len(tag2index)

In [13]:
def get_bilstm_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_len))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    model.add(TimeDistributed(Dense(n_tags, activation='relu')))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [14]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [15]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()



In [16]:
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 4s/step - accuracy: 0.7856 - loss: 2.4446 - val_accuracy: 0.9681 - val_loss: 0.3555
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 4s/step - accuracy: 0.9676 - loss: 0.3632 - val_accuracy: 0.9681 - val_loss: 0.2628
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 4s/step - accuracy: 0.9674 - loss: 0.2891 - val_accuracy: 0.9680 - val_loss: 0.2341
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 4s/step - accuracy: 0.9674 - loss: 0.2642 - val_accuracy: 0.9681 - val_loss: 0.2084
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 4s/step - accuracy: 0.9676 - loss: 0.2397 - val_accuracy: 0.9681 - val_loss: 0.1971
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 5s/step - accuracy: 0.9676 - loss: 0.2269 - val_accuracy: 0.9681 - val_loss: 0.1889
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 8s/step - accuracy: 0.9677 - loss:

KeyboardInterrupt: 