In [None]:
# Install extra-dependencies
! pip -q install git+https://www.github.com/keras-team/keras-contrib.git sklearn-crfsuite
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

In [None]:
# Hyperparameters shared by the cells below.
# Mini-batch size passed to model.fit.
BATCH_SIZE = 64
# Number of epochs passed to model.fit.
EPOCHS = 5
# Number of token positions per sentence (padding target and word-input length).
MAX_LEN = 75
# Output dimension of the word Embedding layer.
EMBEDDING = 100
# Number of character positions kept per token in the character input.
MAX_CHAR_LEN = 15
# Output dimension of the character Embedding layer.
CHAR_EMBEDDING = 20

In [None]:
# Read the token-level corpus: one row per token, with the columns
# "Sentence #", "Word", "POS" and "Tag" that the following cells rely on.
data = pd.read_csv("/kaggle/input/entity-annotated-corpus/ner_dataset.csv", encoding="latin1")
# Forward-fill missing cells from the row above.
# DataFrame.ffill() replaces fillna(method="ffill"), which newer pandas deprecates.
data = data.ffill()
print("Number of sentences: ", data["Sentence #"].nunique())

# Sort the vocabulary: iterating a set of strings has no stable order across
# kernel restarts, which would give every run different word ids and make
# saved checkpoints unusable with a freshly built vocabulary.
words = sorted(set(data["Word"].values))
n_words = len(words)
print("Number of words in the dataset: ", n_words)

# Same reasoning for the label set: sorted so tag ids are reproducible.
tags = sorted(set(data["Tag"].values))
print("Tags:", tags)

n_tags = len(tags)
print("Number of Labels: ", n_tags)

print("What the dataset looks like:")
# Show the first 10 rows
data.head(n=10)

In [None]:
#data preprocessing
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    Every sentence is a list of (word, POS, tag) triples:
    [(Token_1, Part_of_Speech_1, Tag_1), ..., (Token_n, Part_of_Speech_n, Tag_n)]

    Attributes:
        n_sent: number of the sentence that the next get_next() call returns.
        data: the DataFrame passed to the constructor.
        empty: always False; kept only so existing attribute reads keep working.
        grouped: Series indexed by the "Sentence #" value, one triple list per sentence.
        sentences: plain list of the triple lists, in the order of `grouped`.
    """

    def __init__(self, data):
        """Args:
            data: pandas.DataFrame with the columns "Sentence #", "Word",
                "POS" and "Tag".
        """
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_triples(group):
        """Turn the rows of one sentence into a list of (word, POS, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence and advance the counter.

        Returns:
            The triple list stored under "Sentence: <n_sent>", or None when no
            sentence with that label exists (the counter is then left unchanged).
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and must not be silently swallowed.
            return None
        self.n_sent += 1
        return s
        
# Group the corpus into sentences once; `getter.sentences` is reused by later cells.
getter = SentenceGetter(data)
# The getter's counter starts at 1, so this is the entry labelled "Sentence: 1".
sent = getter.get_next()
print('This is what a sentence looks like:')
print(sent)

In [None]:
# Word vocabulary: token -> integer id.
# Ids 0 and 1 are reserved for PAD and UNK, so corpus words start at id 2.
word2idx = {token: index for index, token in enumerate(words, start=2)}
word2idx["UNK"] = 1  # unknown words
word2idx["PAD"] = 0  # padding

# Reverse word vocabulary: integer id -> token.
idx2word = {index: token for token, index in word2idx.items()}

# Tag vocabulary: label -> integer id, with id 0 reserved for PAD.
tag2idx = {label: index for index, label in enumerate(tags, start=1)}
tag2idx["PAD"] = 0

# Reverse tag vocabulary: integer id -> label.
idx2tag = {index: label for label, index in tag2idx.items()}

# Sanity check of both lookups on one known word and one known label.
print("The word Obama is identified by the index: {}".format(word2idx["Obama"]))
print("The labels B-geo(which defines Geopraphical Enitities) is identified by the index: {}".format(tag2idx["B-geo"]))

In [None]:
# Character-level vocabulary, char -> char_index, built from every character
# that occurs in `words`. Indices 0 and 1 are reserved for PAD and UNK.
# NOTE(review): the following cell rebuilds `char2idx` from scratch, so on a
# top-to-bottom run only `chars` and `n_chars` from this cell remain in effect.
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1 # Unknown characters
char2idx["PAD"] = 0 # Padding

In [None]:
# Character-level vocabulary, char -> char_index (PAD=0 and UNK=1 are reserved).
# Ids are handed out in first-seen order while scanning the characters of `words`.
char2idx = {"PAD": 0, "UNK": 1}
for symbol in (ch for token in words for ch in token):
    # setdefault only inserts when the character has no id yet.
    char2idx.setdefault(symbol, len(char2idx))

In [None]:
# All sentences as lists of (word, POS, tag) triples, as stored by SentenceGetter.
sentences = getter.sentences

In [None]:
# Encode the characters of every token as ids: one (MAX_LEN, MAX_CHAR_LEN)
# grid per sentence, padded with the PAD character id.
pad_char = char2idx["PAD"]
unk_char = char2idx["UNK"]

X_char = []
for sentence in sentences:
    # Keep the LAST MAX_LEN tokens of an over-long sentence. The word ids and
    # the labels are cut by pad_sequences, whose default truncating="pre"
    # drops tokens from the front; taking the first MAX_LEN tokens here would
    # leave the characters misaligned with the words and tags.
    tokens = [entry[0] for entry in sentence[-MAX_LEN:]]
    sent_seq = []
    for i in range(MAX_LEN):
        # Positions past the end of the sentence become an all-PAD row.
        token_chars = tokens[i][:MAX_CHAR_LEN] if i < len(tokens) else ""
        # Explicit lookup instead of a bare except: a character missing from
        # the vocabulary maps to UNK rather than being silently turned into PAD.
        word_seq = [char2idx.get(c, unk_char) for c in token_chars]
        word_seq.extend([pad_char] * (MAX_CHAR_LEN - len(word_seq)))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import concatenate

# Replace every (word, POS, tag) triple by the vocabulary id of its word.
X_word = []
for sent in sentences:
    X_word.append([word2idx[triple[0]] for triple in sent])

# Pad the word-id rows at the end with the PAD id so they share length MAX_LEN.
X_word = pad_sequences(X_word, maxlen=MAX_LEN, padding="post", value=word2idx["PAD"])
# Same call for the per-sentence character grids, using the PAD character id.
X_char = pad_sequences(X_char, maxlen=MAX_LEN, padding="post", value=char2idx["PAD"])

# combine word and char sequences into one input array
#X = np.c_[X_word.reshape(len(X_word),-1), X_char.reshape(len(X_char),-1)]
#X = concatenate([X_word,X_char])

In [None]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Convert every (word, POS, tag) triple to the id of its tag.
y = [[tag2idx[w[2]] for w in s] for s in sentences]
# Pad each label sequence to the same length as the inputs, using the PAD tag id.
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])
# One-hot encode; n_tags + 1 classes because PAD has its own id.
y = [to_categorical(i, num_classes=n_tags+1) for i in y]

# Hold out 20% of the sentences for evaluation. The three collections are
# split together so word ids, character ids and labels stay aligned.
# random_state is fixed so the split (and the reported scores) are reproducible.
X_word_tr, X_word_te, X_char_tr, X_char_te, y_tr, y_te = train_test_split(
    X_word, X_char, y, test_size=0.2, random_state=42)

In [None]:
# Parse the pre-trained GloVe text file into a dict: token -> float32 vector.
# Each line is split on whitespace; the first field is the token and the
# remaining fields are its vector components.
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    rows = (raw.split() for raw in glove_file)
    embeddings_index = {row[0]: np.asarray(row[1:], dtype='float32') for row in rows}

In [None]:
# Build the initial weights of the word Embedding layer: row i holds the
# pre-trained vector of the word with id i; words without a pre-trained
# vector keep an all-zero row.
input_dim = n_words + 2  # corpus words + PAD + UNK
# Width comes from EMBEDDING so it always matches the Embedding layer's output_dim.
embedding_matrix = np.zeros((input_dim, EMBEDDING))
for i, word in idx2word.items():
    # Look up the lower-cased token. The method must be CALLED: passing the
    # bound method `word.lower` itself never matches a key, which left the
    # whole matrix at zero.
    embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Conv1D, MaxPooling1D, Flatten
#from tensorflow.keras.layers import 
from keras_contrib.layers import CRF
import keras as k

In [None]:
# Model definition: word embeddings concatenated with per-token character
# features, fed through recurrent layers, a per-token dense layer and a CRF.
# NOTE(review): the name `input` shadows the Python builtin, and `model` is
# reused for intermediate tensors before it is bound to the final Model.

# Word-level input: one word id per token position.
input = Input(shape=(MAX_LEN,))

# Word embedding layer, initialised from `embedding_matrix`.
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  weights=[embedding_matrix],input_length=MAX_LEN)(input)  # default: 100-dim embedding

# Character-level branch: TimeDistributed applies each layer to every token
# position separately (embed characters -> Conv1D -> max-pool -> flatten).
input_char = Input(shape=(MAX_LEN, MAX_CHAR_LEN,))
char_emb = TimeDistributed(Embedding(len(char2idx), CHAR_EMBEDDING))(input_char)
char_emb = TimeDistributed(Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"))(char_emb)
char_emb = TimeDistributed(MaxPooling1D(pool_size=2))(char_emb)
char_emb = TimeDistributed(Flatten())(char_emb)
char_emb = Dropout(0.5)(char_emb)

# Concatenate word and character embeddings
model = concatenate([model, char_emb])

# Bidirectional LSTM followed by a second, unidirectional LSTM; both return
# the full sequence so every token position keeps its own output.
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           dropout=0.5, 
                           recurrent_dropout=0.5,
                          kernel_initializer=k.initializers.he_normal()))(model)  # variational biLSTM
model = LSTM(units=50 * 2, 
             return_sequences=True, 
             dropout=0.5, 
             recurrent_dropout=0.5, 
             kernel_initializer=k.initializers.he_normal())(model)
model = TimeDistributed(Dense(100, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output
# Two-input model: word ids first, character ids second (same order is needed in fit/predict).
model = Model(inputs=[input, input_char], outputs=out)
# Optimiser
adam = k.optimizers.Adam(lr=0.0008, beta_1=0.9, beta_2=0.999)
# Compile with the CRF layer's own loss; both the CRF accuracy and the generic 'accuracy' metric are tracked.
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])
model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint

# Checkpoint callback: the file name embeds the validation accuracy, and a
# file is only written when the monitored 'val_accuracy' improves.
filepath = "ner-bi-lstm-td-model-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

# Train on the word and character inputs; 10% of the training data is used for validation.
history = model.fit(
    [X_word_tr, X_char_tr],
    np.array(y_tr),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
# Select matplotlib's 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')

def plot_history(history):
    """Draw training vs. validation curves side by side.

    Args:
        history: object whose ``history`` attribute is a mapping that holds
            the per-epoch sequences 'accuracy', 'val_accuracy', 'loss' and
            'val_loss'.

    The left panel shows accuracy, the right panel shows loss; training
    curves are blue and validation curves are red.
    """
    logs = history.history
    # Read all four series first so a missing key fails before any figure exists.
    panels = [
        (logs['accuracy'], logs['val_accuracy'],
         'Training acc', 'Validation acc', 'Training and validation accuracy'),
        (logs['loss'], logs['val_loss'],
         'Training loss', 'Validation loss', 'Training and validation loss'),
    ]
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(panels[0][0]) + 1)

    plt.figure(figsize=(12, 5))
    for slot, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(epochs, train_curve, 'b', label=train_label)
        plt.plot(epochs, val_curve, 'r', label=val_label)
        plt.title(heading)
        plt.legend()

# Plot the accuracy and loss curves recorded by model.fit.
plot_history(history)

In [None]:
def pred2label(pred):
    """Convert per-token score vectors into tag strings.

    Args:
        pred: iterable of sentences, each an iterable of per-token score
            vectors (model outputs or one-hot labels).

    Returns:
        A list with one list of tag strings per sentence; every token gets
        the tag whose index has the highest score, looked up in `idx2tag`.
    """
    return [
        [idx2tag[np.argmax(token_scores)] for token_scores in sentence_scores]
        for sentence_scores in pred
    ]
# Predict on the held-out split, then turn both the predictions and the
# one-hot reference labels into tag strings. All MAX_LEN positions are kept,
# so padded positions appear as the "PAD" label in both lists.
test_pred = model.predict([X_word_te,X_char_te], verbose=1)   
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [None]:
! pip install seqeval

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# F1 from seqeval over the tag-string sequences, printed as a percentage with one decimal.
# NOTE(review): both lists still contain the "PAD" label at padded positions.
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

In [None]:
! pip install sklearn_crfsuite

In [None]:
from  sklearn_crfsuite.metrics import flat_classification_report  
# Per-label report computed over the same tag-string sequences.
# NOTE(review): padded positions are included under the "PAD" label.
report = flat_classification_report(y_pred=pred_labels, y_true=test_labels)
print(report)

In [None]:
# Example input for the inference cells below.
sentence = "President Obama became the first sitting American president to visit Hiroshima"

In [None]:
import re
import string

# Matches any single ASCII punctuation character or one of the listed symbols.
re_tok = re.compile(f"([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])")

# Replace punctuation by spaces, split on whitespace, and keep at most MAX_LEN
# tokens. Without the cut, a longer input got a negative repeat count below
# (i.e. no padding) and a sequence longer than the model's input length.
sentence = re_tok.sub(r"  ", str(sentence)).split()[:MAX_LEN]
# Tokens followed by PAD ids up to MAX_LEN entries; the tokens themselves are
# converted to ids in a later cell.
padded_sentence = sentence + [word2idx["PAD"]] * (MAX_LEN - len(sentence))


In [None]:
# Character ids for the example sentence: MAX_CHAR_LEN ids per position.
pad_char = char2idx["PAD"]
unk_char = char2idx["UNK"]

X_char = []
# The loop variable is deliberately not called `sentence`: that name holds the
# token list which is still needed when the predictions are printed.
for token in padded_sentence:
    if isinstance(token, str):
        # Cut over-long tokens and map unseen characters to UNK instead of
        # failing with a KeyError.
        char_ids = [char2idx.get(ch, unk_char) for ch in token[:MAX_CHAR_LEN]]
    else:
        # Padding positions hold the integer PAD id, not a string.
        char_ids = []
    # Extend with a LIST of PAD ids; multiplying the id itself by the count
    # only produced a single 0 and left rows of unequal length.
    char_ids.extend([pad_char] * (MAX_CHAR_LEN - len(char_ids)))
    X_char.append(char_ids)

In [None]:
# Convert tokens to word ids. Unknown words map to UNK (not PAD); entries that
# are already integers (the PAD ids, or ids from a previous run of this cell)
# are kept unchanged, so re-running the cell does not corrupt the input.
padded_sentence = [
    word2idx.get(w, word2idx["UNK"]) if isinstance(w, str) else w
    for w in padded_sentence
]

# The model has two inputs, so predict needs a list of two arrays, each with
# a leading batch dimension of 1: word ids first, character ids second.
pred = model.predict([np.array([padded_sentence]), np.array([X_char])])
pred = np.argmax(pred, axis=-1)

# One "token: tag" line per token; zip stops at the end of the token list, so
# padded positions are not printed.
retval = "".join(
    "{:15}: {:5}".format(w, idx2tag[p]) + "\n"
    for w, p in zip(sentence, pred[0])
)
print(retval)

In [None]:
from keras.models import load_model
# A fresh CRF layer supplies the loss and accuracy objects passed to load_model below.
crf2 = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
# Reload a checkpoint written during training, registering the custom CRF
# layer, loss and metric under the names given in custom_objects.
# NOTE(review): checkpoint names embed the validation accuracy
# ("...-{val_accuracy:.2f}.hdf5"), so this path presumably exists only if a
# checkpoint scored 0.99 and was written to /kaggle/working — confirm the
# file name after training.
model2 = load_model('/kaggle/working/ner-bi-lstm-td-model-0.99.hdf5', custom_objects={'CRF':CRF,'crf_loss':crf2.loss_function, 'crf_viterbi_accuracy':crf2.accuracy})

In [None]:
# Run the reloaded model on the same example. The trained model takes two
# inputs, so the character ids must be passed alongside the word ids; each
# array gets a leading batch dimension of 1.
pred = model2.predict([np.array([padded_sentence]), np.array([X_char])])
pred = np.argmax(pred, axis=-1)

# One "token: tag" line per token of the example sentence.
retval = "".join(
    "{:15}: {:5}".format(w, idx2tag[p]) + "\n"
    for w, p in zip(sentence, pred[0])
)
print(retval)