In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

# Apply matplotlib's "ggplot" style sheet to the figures drawn in this notebook.
plt.style.use("ggplot")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Location of the input CSV (placeholder path; point it at the real file).
data_input = '/path/to/file.csv'

# Read the file and forward-fill missing cells with the last value seen above
# them in the same column.
df = pd.read_csv(data_input).ffill()

# Peek at the last few rows.
df.tail()

In [None]:
def aggregate_function(s):
    """Collect one group's rows into a list of (word, POS tag, NER tag) tuples.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows of a single group; must contain the columns 'Word', 'POS Tag'
        and 'NER Tag'.

    Returns
    -------
    list of tuple
        One ``(word, pos, tag)`` triple per row, in row order.
    """
    return list(zip(s['Word'].values, s['POS Tag'].values, s['NER Tag'].values))


# Select the three columns the aggregator reads before applying it, so the
# grouping column is not handed to the function (recent pandas versions emit a
# deprecation warning when apply operates on the grouping columns).
sentences = df.groupby("Sentence ID")[['Word', 'POS Tag', 'NER Tag']].apply(aggregate_function)

In [None]:
# Histogram of the number of tokens per sentence.
plt.hist(list(map(len, sentences)), bins=50)
plt.show()

# The longest sentence determines the padded sequence length used further down.
max_length = max(map(len, sentences))
print("Maximum Length: ", max_length)

In [None]:
# Vocabulary and tag inventory, in order of first appearance in the data.
# Series.unique() is deterministic for a given file, whereas list(set(...))
# orders strings differently from one interpreter run to the next (hash
# randomization), which would silently reshuffle every id after a kernel
# restart.
words = df["Word"].unique().tolist()
tags = df["NER Tag"].unique().tolist()

# The padding token is appended last, so its id is n_words - 1.
words.append("__PAD__")

n_words = len(words)
n_tags = len(tags)

# Lookup tables: word / tag -> integer id (position in the lists above).
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}

In [None]:
# Show the tag inventory, then the vocabulary size and the tag count, one per line.
print(tags, n_words, n_tags, sep="\n")

In [None]:
# Translate every sentence into its word ids (X) and its NER tag ids (y).
X = [[word2id[word] for word, _pos, _tag in sent] for sent in sentences]
y = [[tag2id[tag] for _word, _pos, tag in sent] for sent in sentences]

# Right-pad every sequence to max_length: words with the last vocabulary id
# (the "__PAD__" entry appended to `words`), tags with the id of 'OUT'.
X = pad_sequences(X, maxlen=max_length, padding='post', value=n_words - 1)
y = pad_sequences(y, maxlen=max_length, padding='post', value=tag2id['OUT'])

# One-hot encode the tag ids, producing one array per sentence.
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

In [None]:
# Shuffle and hold out 20% of the sentences for testing; the fixed seed keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [None]:
# Sequence tagger: embedding -> dropout -> bidirectional LSTM -> per-timestep
# softmax over the tag set.
model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=64, input_length=max_length))
model.add(Dropout(0.1))
model.add(
    Bidirectional(
        LSTM(
            128,
            activation='tanh',
            recurrent_activation='sigmoid',
            use_bias=True,
            return_sequences=True,  # emit an output at every timestep
        )
    )
)
model.add(TimeDistributed(Dense(n_tags, activation='softmax')))

In [None]:
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for 3 epochs, holding out 25% of the training data for validation.
# y_train is a list of per-sentence arrays, so it is stacked into one array first.
history = model.fit(
    x=X_train,
    y=np.array(y_train),
    batch_size=64,
    epochs=3,
    validation_split=0.25,
    verbose=1,
)

In [None]:
# Tabulate the values recorded during training: one column per entry of
# history.history.
hist = pd.DataFrame(data=history.history)

In [None]:
# Training vs. validation accuracy per epoch. The curves are labelled and a
# legend is drawn so the two lines can be told apart; axis labels and a title
# let the figure stand on its own.
plt.figure(figsize=(12,12))
plt.plot(hist["accuracy"], label="train")
plt.plot(hist["val_accuracy"], label="validation")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy per epoch")
plt.legend()
plt.show()