In [None]:
!pip install comet_ml
!pip install seqeval[gpu]
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
# Download word2vec embedding
# NOTE(review): the URL embeds a `token` query parameter, and the `mv` below
# renames the downloaded file (whose name still carries that query string) to
# `bo_word2vec_v1`. The token looks like a hard-coded access credential —
# TODO confirm it is still valid and acceptable to keep in the notebook.
!wget https://raw.githubusercontent.com/Esukhia/NER/master/ner_tagger/embeddings/bo_word2vec_v1?token=AD3KLUEFS7ORNGDC2I3FAIS5NTO2S
!mv bo_word2vec_v1\?token\=AD3KLUEFS7ORNGDC2I3FAIS5NTO2S bo_word2vec_v1
# List the working directory so the renamed file can be checked by eye.
!ls

In [None]:
# Comet Experiment Setup
from comet_ml import Experiment

In [None]:
# Run identification. `dataset_name` selects the input CSV; `exp_name`
# (model name + version) labels the Comet experiment and the saved model file.
dataset_name = 'citation'
version = 3
model_name = 'Bi-LSTM_CRF_Word2Vec'
exp_name = f'{model_name}_v{version}'

In [None]:
# Supply the Comet API key through the environment instead of storing a
# literal key in the notebook and writing it to a `.env` file on disk.
# SECURITY: the key previously hard-coded in this cell has been exposed in the
# notebook source; it should be revoked and replaced.
import os
from getpass import getpass

if not os.environ.get('COMET_API_KEY'):
    # Prompt only when the runtime has not already provided the key.
    os.environ['COMET_API_KEY'] = getpass('COMET_API_KEY: ')

In [None]:
# Create the Comet experiment for this run and label it with `exp_name`.
# No api_key is passed here — presumably Comet picks it up from its own
# configuration (e.g. a COMET_API_KEY setting); verify.
exp = Experiment(project_name="ner-citation-model", auto_output_logging='simple')
exp.set_name(exp_name)

In [None]:
# Dataset
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors

In [None]:
# Load the pre-trained word vectors from the local `bo_word2vec_v1` file,
# read in the text (binary=False) word2vec format.
WV = KeyedVectors.load_word2vec_format('bo_word2vec_v1', binary=False)

In [None]:
# Load the token-level dataset selected by `dataset_name`.
data = pd.read_csv(f"../input/ner-lists-87_citations-1000_{dataset_name}.csv")
# Forward-fill missing cells. `DataFrame.ffill()` is the direct equivalent of
# `fillna(method="ffill")`, whose `method` argument is deprecated in recent pandas.
data = data.ffill()
data.tail(10)

In [None]:
# Unique surface forms found in the dataset. Sorting gives the same order on
# every kernel restart; a plain `list(set(...))` depends on string hashing,
# which differs between processes. `key=str` keeps the sort safe should a
# non-string value (e.g. a leading NaN left by the forward fill) be present.
words = sorted(set(data["word"].values), key=str)
n_words = len(words)
print("# words:", n_words)

In [None]:
# Unique tag labels plus a trailing 'PAD' label for padded positions.
# The position of a tag in this list becomes its class index (see `tag2idx`),
# so the order must be stable: sort instead of relying on set iteration order,
# which changes between processes and would silently permute the classes.
tags = sorted(set(data["tag"].values), key=str)
tags.append('PAD')
n_tags = len(tags)
print('# tags:', n_tags)

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, tag) pairs.

    The frame must provide the columns "sentence_idx", "word" and "tag".

    Attributes:
        data: the DataFrame passed in.
        grouped: pandas Series indexed by "sentence_idx"; each value is the
            list of (word, tag) tuples of that sentence.
        sentences: the values of `grouped` as a plain list.
        n_sent: "sentence_idx" label the next `get_next()` call looks up
            (starts at 1).
        empty: always False here; kept so existing attribute access still works.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def to_pairs(group):
            # One (word, tag) tuple per row of the sentence, in row order.
            return list(zip(group["word"].values.tolist(), group["tag"].values.tolist()))

        self.grouped = self.data.groupby("sentence_idx").apply(to_pairs)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled `n_sent` and advance, or None if there is none.

        Only lookup failures are treated as "no more sentences"; any other
        error propagates instead of being silently turned into None.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return s

In [None]:
getter = SentenceGetter(data)

In [None]:
sentences = getter.sentences

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [None]:
# Histogram of sentence lengths (number of tokens per sentence), 50 bins;
# the current pyplot figure is also uploaded to the Comet experiment.
sentence_lengths = list(map(len, sentences))
plt.hist(sentence_lengths, bins=50)
plt.xlabel('Sentence Length')
plt.ylabel('No. Sentences')
exp.log_figure('Sentence length distribution', plt)

In [None]:
max_len = 150

# Word indices follow the word2vec vocabulary; two extra indices are appended
# after it: one shared by all out-of-vocabulary words, one for padding.
word2idx = {}
word2idx['<UNK>'] = len(WV.vocab)
word2idx['<PAD>'] = len(WV.vocab) + 1
for w in words:
    entry = WV.vocab.get(w)
    word2idx[w] = entry.index if entry is not None else word2idx['<UNK>']

# Every out-of-vocabulary word maps to the '<UNK>' index, so a plain inversion
# of word2idx would label that index with whichever OOV word happened to be
# inserted last. Pin the index back to '<UNK>' so decoding is well defined.
idx2word = {i: t for t, i in word2idx.items()}
idx2word[word2idx['<UNK>']] = '<UNK>'

# Tag <-> class-index mappings; the index is the tag's position in `tags`.
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {i: t for t, i in tag2idx.items()}

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
# Encode each sentence as a list of word indices (first element of every
# (word, tag) pair), then pad at the end with the '<PAD>' index up to max_len.
# NOTE(review): sentences longer than max_len get truncated by pad_sequences;
# which end is dropped depends on its `truncating` default — confirm.
X = [[word2idx[token[0]] for token in sentence] for sentence in sentences]
X = pad_sequences(sequences=X, maxlen=max_len, padding="post", value=word2idx['<PAD>'])

In [None]:
# Encode each sentence as a list of tag indices (second element of every
# (word, tag) pair) and pad at the end with the 'PAD' tag index up to max_len.
y = [[tag2idx[token[1]] for token in sentence] for sentence in sentences]
y = pad_sequences(sequences=y, maxlen=max_len, padding="post", value=tag2idx["PAD"])
# One-hot encode every padded row over the n_tags classes and stack the rows.
y = np.array([to_categorical(row, num_classes=n_tags) for row in y])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% as the test set, then 10% of the remainder as the validation
# set; both splits use the same seed.
random_state = 45
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)
X_train, X_valid, y_train, y_valid = train_test_split(X_rest, y_rest, test_size=0.1, random_state=random_state)
print('No. Training dataset:', len(X_train))
print('No. Validation dataset:', len(X_valid))
print('No. Test dataset:', len(X_test))

In [None]:
# Model Definition -> Bi-LSTM
from keras import backend as K
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.losses.crf_losses import crf_loss
from keras.callbacks import Callback
from seqeval.metrics import f1_score, classification_report, precision_score, recall_score

In [None]:
# Hyper-parameters of the run; the whole dict is logged to Comet below.
params = {
    # dataset
    'n_train': X_train.shape[0],

    # sizes hps
    'vocab_size': len(WV.vocab) + 2,  # <UNK> and <PAD>
    'max_len': max_len,
    'num_classes': n_tags,
    # Taken from the loaded vectors rather than hard-coded, so the embedding
    # matrix always matches the dimensionality of the word2vec file.
    'embedding_size': WV.vector_size,

    # models hps
    'optimizer': 'adam',
    'model_type': model_name,
    'lstm_layer_1_units': 100,
    'dense_layer_units': 50,
    'dense_layer_activation': 'relu',
    'dropout': 0.1,
    'recurrent_dropout': 0.1,

    # training hps
    'batch_size': 32,
    'epochs': 50
}

exp.log_parameters(params)

In [None]:
# Initialize the embeddings with word2vec.
# Rows start as uniform random values and are overwritten with the pre-trained
# vector for every dataset word that exists in the word2vec vocabulary. Rows
# that are not overwritten (at least '<UNK>' and '<PAD>') keep the random
# values, so draw them from a generator seeded with `random_state` to make
# the matrix identical across runs.
embedding_matrix = np.random.RandomState(random_state).rand(
    params['vocab_size'], params['embedding_size']).astype('float32')
for word, i in word2idx.items():
    if word in WV.vocab:
        embedding_matrix[i] = WV[word]

In [None]:
len(WV.vocab), embedding_matrix.shape, embedding_matrix.dtype

In [None]:
assert np.array_equal(WV['དང་'], embedding_matrix[WV.vocab.get('དང་').index])

In [None]:
# Network: word2vec embedding -> dropout -> bidirectional LSTM ->
# per-timestep dense layer -> CRF output layer.
# NOTE(review): `input` shadows the Python builtin, and `model` holds the
# intermediate layer outputs here before being rebound to the Model object in
# a later cell.
input = Input(shape=(params['max_len'],))
# Embedding weights come from `embedding_matrix` and are frozen (trainable=False);
# mask_zero=False, so no padding mask is produced by this layer.
model = Embedding(input_dim=params['vocab_size'], output_dim=params['embedding_size'], input_length=params['max_len'],
                  mask_zero=False, weights=[embedding_matrix], trainable=False)(input)
model = Dropout(params['dropout'])(model)
# return_sequences=True keeps one output per timestep for the layers that follow.
model = Bidirectional(LSTM(units=params['lstm_layer_1_units'], return_sequences=True, recurrent_dropout=params['recurrent_dropout']))(model)
model = TimeDistributed(Dense(params['dense_layer_units'], activation=params['dense_layer_activation']))(model)
crf = CRF(params['num_classes'])  # CRF layer
out = crf(model)

In [None]:
model = Model(input, out)
model.summary()

In [None]:
model.compile(optimizer=params['optimizer'], loss=crf_loss)

In [None]:
# Training
class F1Metrics(Callback):
    """Keras callback that reports the seqeval F1 score on validation data after each epoch.

    The mode is fixed at construction time:
      * `validation_data` is None: the score is computed from
        `self.validation_data[0]` (inputs) and `self.validation_data[1]` (labels).
        Presumably Keras fills that attribute during `fit` — TODO confirm for
        the Keras version in use.
      * `validation_data` is given: it is iterated as (X, y) batches.

    The score is printed and stored in `logs['valid_f1']`.
    """

    def __init__(self, id2label, pad_value=0, validation_data=None, digits=4):
        """
        Args:
            id2label (dict): id to label mapping.
            (e.g. {1: 'B-LOC', 2: 'I-LOC'})
            pad_value (int): label id of padding; positions whose true label
              equals this id are excluded from scoring.
            validation_data: optional iterable of (X, y) batches.
            digits (int or None): stored on the instance but not used by this
              implementation (no classification report is printed).
        """
        super(F1Metrics, self).__init__()
        self.id2label = id2label
        self.pad_value = pad_value
        self.validation_data = validation_data
        self.digits = digits
        self.is_fit = validation_data is None

    def convert_idx_to_name(self, y, array_indexes):
        """Convert label index to name.
        Args:
            y (np.ndarray): label index 2d array.
            array_indexes (list): list of valid index arrays for each row.
        Returns:
            y: label name list.
        """
        y = [[self.id2label[idx] for idx in row[row_indexes]] for
             row, row_indexes in zip(y, array_indexes)]
        return y

    def predict(self, X, y):
        """Predict sequences.
        Args:
            X (np.ndarray): input data.
            y (np.ndarray): tags.
        Returns:
            y_true: true sequences.
            y_pred: predicted sequences.
        """
        y_pred = self.model.predict_on_batch(X)

        # reduce dimension.
        y_true = np.argmax(y, -1)
        y_pred = np.argmax(y_pred, -1)

        # Positions are selected from the TRUE labels, so predictions are
        # scored on exactly the non-padding positions of each row.
        non_pad_indexes = [np.nonzero(y_true_row != self.pad_value)[0] for y_true_row in y_true]

        y_true = self.convert_idx_to_name(y_true, non_pad_indexes)
        y_pred = self.convert_idx_to_name(y_pred, non_pad_indexes)

        return y_true, y_pred

    def score(self, y_true, y_pred):
        """Calculate f1 score.
        Args:
            y_true (list): true sequences.
            y_pred (list): predicted sequences.
        Returns:
            score: f1 score.
        """
        score = f1_score(y_true, y_pred)
        print(' - valid_f1: {:04.2f}'.format(score * 100))
        return score

    def on_epoch_end(self, epoch, logs=None):
        """Compute the validation F1 and store it in `logs['valid_f1']`."""
        # A fresh dict per call: a shared `{}` default would be mutated below
        # and leak values between calls.
        if logs is None:
            logs = {}
        if self.is_fit:
            self.on_epoch_end_fit(epoch, logs)
        else:
            self.on_epoch_end_fit_generator(epoch, logs)

    def on_epoch_end_fit(self, epoch, logs=None):
        """Score the (inputs, labels) arrays found in `self.validation_data`."""
        if logs is None:
            logs = {}
        if self.validation_data is None:
            # Fail with an actionable message instead of an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                'F1Metrics has no validation data: pass validation_data to fit(), '
                'or give F1Metrics an iterable of (X, y) batches.')
        X = self.validation_data[0]
        y = self.validation_data[1]
        y_true, y_pred = self.predict(X, y)
        score = self.score(y_true, y_pred)
        logs['valid_f1'] = score

    def on_epoch_end_fit_generator(self, epoch, logs=None):
        """Score every (X, y) batch yielded by `self.validation_data` as one set."""
        if logs is None:
            logs = {}
        y_true = []
        y_pred = []
        for X, y in self.validation_data:
            y_true_batch, y_pred_batch = self.predict(X, y)
            y_true.extend(y_true_batch)
            y_pred.extend(y_pred_batch)
        score = self.score(y_true, y_pred)
        logs['valid_f1'] = score

In [None]:
f1_metrics = F1Metrics(idx2tag, tag2idx['PAD'])

In [None]:
# Train inside Comet's `train` context. `f1_metrics` is invoked at the end of
# every epoch and prints/records the validation F1 score.
with exp.train():
    history = model.fit(X_train, y_train, 
                        batch_size=params['batch_size'], 
                        epochs=params['epochs'], 
                        validation_data=(X_valid, y_valid),
                        callbacks=[f1_metrics],
                        verbose=1)

In [None]:
# Evaluate
def to_char(x, y, pred):
    """Print one line per position: the word, its true tag and its predicted tag.

    `x` holds word indices, `y` and `pred` hold tag indices; they are decoded
    with the notebook-level `idx2word` and `idx2tag` mappings.
    """
    tokens = [idx2word[i] for i in x]
    gold_tags = [idx2tag[i] for i in y]
    pred_tags = [idx2tag[i] for i in pred]
    for token, gold_tag, pred_tag in zip(tokens, gold_tags, pred_tags):
        print(token, gold_tag, pred_tag)

def evaluate(X_test, y_test):
    """Score the notebook-level `model` on a labelled set.

    Args:
        X_test: model inputs.
        y_test: one-hot labels (argmax over the last axis gives the tag index).
    Returns:
        (f1, precision, recall, y_pred), where y_pred is the list of predicted
        tag-name sequences with the padded positions removed.
    Side effects:
        Prints the seqeval classification report.
    """
    raw_pred = model.predict(X_test)

    # Collapse the class axis to tag indices.
    true_ids = np.argmax(y_test, -1)
    pred_ids = np.argmax(raw_pred, -1)

    # Keep only positions whose TRUE tag is not 'PAD', for both sequences.
    pad_idx = tag2idx['PAD']
    keep = [np.nonzero(row != pad_idx)[0] for row in true_ids]
    y_true = f1_metrics.convert_idx_to_name(true_ids, keep)
    y_pred = f1_metrics.convert_idx_to_name(pred_ids, keep)

    # Entity-level scores from seqeval.
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(classification_report(y_true, y_pred))
    return f1, precision, recall, y_pred

In [None]:
# Evaluate on the held-out test set inside Comet's `test` context.
with exp.test():
    f1, precision, recall, y_pred = evaluate(X_test, y_test)
    # Log the percentages as numbers (rounded to 2 decimals) rather than as
    # pre-formatted strings, so they are stored as numeric metrics.
    metrics = {
        'f1': round(float(f1) * 100, 2),
        'precision': round(float(precision) * 100, 2),
        'recall': round(float(recall) * 100, 2),
    }
    exp.log_metrics(metrics)

In [None]:
# Save the trained model to `<exp_name>.h5` in the working directory and
# upload that file to the Comet experiment as an asset.
# NOTE(review): the model contains the keras_contrib CRF layer — reloading it
# presumably needs the matching custom objects; confirm before relying on it.
model_fn = f'{exp_name}.h5'
model.save(model_fn)
exp.log_asset(file_data=model_fn, file_name=model_fn)

In [None]:
def show(i):
    """Print word, true tag and predicted tag for test sentence `i`.

    Uses the notebook-level `model`, `X_test`, `y_test`, `idx2word` and
    `idx2tag`; positions whose word decodes to '<PAD>' are skipped.
    """
    word_ids = X_test[i]
    output = model.predict(np.array([word_ids]))  # batch of one sentence
    pred_ids = np.argmax(output, axis=-1)[0]
    true_ids = np.argmax(y_test[i], -1)

    print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
    print(30 * "=")
    for w, t, pred in zip(word_ids, true_ids, pred_ids):
        word = idx2word[w]
        if word == '<PAD>':
            continue
        print("{:15}: {:5} {}".format(word, idx2tag[t], idx2tag[pred]))

In [None]:
show(20)

In [None]:
show(32)

In [None]:
show(50)

In [None]:
exp.end()