In [1]:
import pandas as pd
import numpy as np

# Load the NER corpus from the CSV file; the file is decoded as latin1.
data = pd.read_csv("../input/ner-dataset/ner_datasetreference.csv", encoding="latin1")

In [2]:
# Forward-fill every missing cell with the value from the row above.
# NOTE(review): presumably the "Sentence #" label is only present on the first
# token of each sentence, which is what this fill propagates -- confirm.
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = data.ffill()

In [3]:
# Display the forward-filled frame for a visual sanity check.
data

In [4]:
# Distinct surface forms in the corpus.
# Sorted (instead of raw set order) so that the word -> index assignment built
# from this list is identical across kernel restarts; string hashing is
# randomised per process, so plain set iteration order is not reproducible.
# `key=str` keeps the sort well-defined even if a non-string value slipped in.
words = sorted(set(data["Word"].values), key=str)
n_words = len(words)
n_words

In [5]:
# Distinct tag labels in the corpus.
# Sorted so that tag ids (and therefore the meaning of each output unit of the
# model) are reproducible across kernel restarts rather than depending on
# set iteration order.
tags = sorted(set(data["Tag"].values), key=str)
n_tags = len(tags)
n_tags

In [6]:
class get_sentences(object):
    """Group a token-level frame into sentences of (word, POS, tag) triples.

    The frame must provide the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes:
        n_sent: 1-based number of the sentence `get_next` will return next.
        data: the frame passed to the constructor.
        empty: always initialised to False; not used by this class itself.
        grouped: result of grouping `data` by "Sentence #"; one list of
            (word, POS, tag) tuples per sentence label.
        sentences: the values of `grouped` as a plain list, in the order the
            grouped result iterates.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as (word, POS, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the sentence labelled "Sentence: <n_sent>" and advance.

        Returns:
            The list of (word, POS, tag) tuples for that sentence, or None when
            no sentence carries that label (e.g. all sentences were consumed).
            The counter is only advanced on success.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and must not be silently swallowed.
            return None
        self.n_sent += 1
        return s

In [7]:
# Group the token-level frame into per-sentence lists of (word, POS, tag).
getter = get_sentences(data)

In [8]:
# Fetch one sentence as a sample; note this advances the getter's counter.
sent = getter.get_next()

In [9]:
# Show the sample sentence as a list of (word, POS, tag) tuples.
print(sent)

In [10]:
# All sentences of the corpus, each a list of (word, POS, tag) tuples.
sentences = getter.sentences

In [11]:
# Sequence limits: tokens per sentence and characters per token.
max_len = 75
max_len_char = 10

# Word ids start at 2; ids 1 and 0 are reserved for unknown words and padding.
word2idx = dict(zip(words, range(2, len(words) + 2)))
word2idx.update({"UNK": 1, "PAD": 0})
idx2word = {index: word for word, index in word2idx.items()}

# Tag ids start at 1; id 0 is reserved for padding.
tag2idx = dict(zip(tags, range(1, len(tags) + 1)))
tag2idx.update({"PAD": 0})
idx2tag = {index: tag for tag, index in tag2idx.items()}

In [12]:
# Spot-check the mappings with one known word and one known tag.
print(word2idx["Obama"])
print(tag2idx["B-geo"])

In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# --- Word-level input: one row of word ids per sentence, padded/truncated
# --- at the end to exactly max_len entries.
X_word = [[word2idx[w[0]] for w in s] for s in sentences]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

# --- Character vocabulary: every character occurring in any known word.
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)
# Enumerate in sorted order so character ids are reproducible across kernel
# restarts (raw set iteration order is not). Ids start at 2; 1 = unknown
# character, 0 = padding.
char2idx = {c: i + 2 for i, c in enumerate(sorted(chars))}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

# --- Character-level input: for each sentence a (max_len, max_len_char) grid
# --- of character ids.
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        # Token slots beyond the end of the sentence behave like an empty
        # word, i.e. they become all-PAD rows. Explicit bounds checks replace
        # the former bare `except:` that hid any other error as padding.
        word = sentence[i][0] if i < len(sentence) else ""
        word_seq = [char2idx.get(c, char2idx["UNK"]) for c in word[:max_len_char]]
        word_seq.extend([char2idx["PAD"]] * (max_len_char - len(word_seq)))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

# --- Targets: tag ids per token, padded like the words, then one-hot encoded
# --- over n_tags + 1 classes (the extra class is the padding tag).
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')
y = [to_categorical(i, num_classes=n_tags + 1) for i in y]

# --- Single split over all three collections so word inputs, character
# --- inputs and targets are guaranteed to share the same row permutation
# --- (previously this relied on two separate calls using the same seed).
(X_word_tr, X_word_te,
 X_char_tr, X_char_te,
 y_tr, y_te) = train_test_split(X_word, X_char, y, test_size=0.1, random_state=2018)

In [14]:
# Shapes of the train and test word-id arrays.
X_word_tr.shape, X_word_te.shape

In [None]:
# Number of training sentences, and the number of character slots in one token
# row of a test sentence (sentence 7, token 1).
len(X_char_tr), len(X_char_te[7][1])

In [15]:
# Word ids of one training sentence (index 1), including trailing padding.
X_word_tr[1]

In [16]:
# Character-id grid of the same training sentence (index 1).
X_char_tr[1]

In [17]:
# Vocabulary sizes: words, characters, tags (reserved PAD/UNK ids not counted).
print(n_words,n_chars,n_tags)

In [18]:
# Shape of the training word-id array.
X_word_tr.shape

In [19]:
# Shape of the character input once the list of per-sentence grids is stacked
# into a single array; this is the form passed to the model later on.
np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char)).shape

In [22]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git
from keras_contrib.layers import CRF

In [30]:
# Build the tagging model: word embeddings concatenated with a per-word
# character encoding, fed through a recurrent layer, a per-token dense layer
# and a CRF output layer.
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
# input and embedding for words
# input_dim is n_words + 2 because ids 0 (PAD) and 1 (UNK) are reserved in
# word2idx; mask_zero=True marks id 0 as padding for downstream layers.
word_in = Input(shape=(max_len,),name='word_input_')
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)

# input and embeddings for characters
# One (max_len_char,) row of character ids per token; the embedding is applied
# to every token slot via TimeDistributed. n_chars + 2 again covers PAD/UNK.
char_in = Input(shape=(max_len, max_len_char,),name='char_input')
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                           input_length=max_len_char, mask_zero=True))(char_in)
# character LSTM to get word encodings by characters
# return_sequences=False: each token's character sequence is reduced to a
# single vector.
char_enc = TimeDistributed(Bidirectional(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5)))(emb_char)

# main LSTM
# Operates on the concatenation of word embedding and character encoding.
# Note this LSTM is unidirectional (it is not wrapped in Bidirectional).
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
lstm = LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6)(x)
out = TimeDistributed(Dense(50, activation="relu"))(lstm)

# n_tags + 1 output classes: the real tags plus the reserved padding tag (id 0).
crf = CRF(n_tags + 1)  # CRF layer
out = crf(out)

# The CRF layer supplies its own loss function and accuracy metric.
model = Model([word_in, char_in], out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

In [24]:
# Inspect the weights of the layer at position 9 of model.layers.
# NOTE(review): the index is hard-coded; presumably it targets the CRF layer --
# confirm against the model.summary() output before relying on it.
print(model.layers[9].weights)

In [25]:
# Shape of the one-hot training targets once stacked into a single array.
np.array(y_tr).shape

In [31]:
# Train on both inputs (word ids and stacked character-id grids); 10% of the
# training rows are held out by Keras for validation.
history = model.fit(
    x=[
        X_word_tr,
        np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char)),
    ],
    y=np.array(y_tr),
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    verbose=1,
)
# Per-epoch metrics as a frame, one column per recorded metric.
hist = pd.DataFrame(data=history.history)

In [37]:
# List the metric names recorded during training (used as plot columns below).
hist.columns

In [47]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch. Each line carries its own label
# so the legend cannot drift out of sync with the plotted columns, and the
# figure has a title and axis labels so it can be read on its own.
plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(12, 12))
for column in ("crf_viterbi_accuracy", "val_crf_viterbi_accuracy"):
    ax.plot(hist[column], label=column)
ax.set(title="CRF Viterbi accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
ax.legend()
plt.show()


In [39]:
# Predict on the held-out test split, feeding word ids and the stacked
# character-id grids in the same order as during training.
y_pred = model.predict(
    x=[
        X_word_te,
        np.array(X_char_te).reshape((len(X_char_te), max_len, max_len_char)),
    ]
)

In [40]:
def label(pred, tag_lookup=None):
    """Convert per-token score vectors into tag names.

    Args:
        pred: iterable of sentences; each sentence is an iterable of score
            vectors (one per token). The position of the largest score in a
            vector is taken as the tag id.
        tag_lookup: mapping from tag id to tag name. Defaults to the
            notebook-level `idx2tag` so existing calls keep working; passing it
            explicitly removes the dependency on notebook state.

    Returns:
        A list (one entry per sentence) of lists of tag names. The text "PAD"
        in a tag name is replaced by "O", so padding positions count as
        outside-of-entity tokens.
    """
    if tag_lookup is None:
        tag_lookup = idx2tag
    return [
        [tag_lookup[np.argmax(p)].replace("PAD", "O") for p in pred_i]
        for pred_i in pred
    ]
    
# Tag names for the model predictions and for the one-hot ground truth of the
# test split; both go through the same conversion so they are comparable.
pred_labels = label(y_pred)
test_labels = label(y_te)

In [45]:
# Rebuilt here so this cell does not depend on the training cell having
# defined `hist` already.
hist = pd.DataFrame(history.history)

# Training vs. validation loss per epoch, with per-line labels plus a title
# and axis labels so the figure stands on its own.
plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(12, 12))
for column in ("loss", "val_loss"):
    ax.plot(hist[column], label=column)
ax.set(title="CRF loss per epoch", xlabel="Epoch", ylabel="Loss")
ax.legend()
plt.show()

In [46]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

def ner_report(y_true, y_pred, exclude=("O", "PAD")):
    """Build a per-tag classification report for sequence-labelling output.

    Args:
        y_true: iterable of sentences, each an iterable of gold tag names.
        y_pred: iterable of sentences, each an iterable of predicted tag names,
            aligned token-by-token with `y_true`.
        exclude: tag names to leave out of the report. Defaults to the
            non-entity tags "O" and "PAD".

    Returns:
        The text report produced by `classification_report`, restricted to the
        tags seen in `y_true` minus `exclude`, with 4 decimal digits.

    Side effects:
        Prints the list of reported tags.
    """
    lb = LabelBinarizer()
    # Flatten sentences to one long token sequence and one-hot encode the tags;
    # the binarizer is fitted on the gold tags only.
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Exclude tags by name. The earlier `sorted(...)[:-2]` slice dropped
    # whatever two tags happened to sort last, which removes a real entity tag
    # whenever "PAD" is not among the classes.
    tagset = sorted(set(lb.classes_) - set(exclude))
    print(tagset)
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
        digits=4,
    )

# Per-tag precision/recall/F1 on the test split (gold labels first).
print(ner_report(test_labels, pred_labels))