In [None]:
import spacy
import numpy as np
import tensorflow as tf
from tensorflow.keras.metrics import Mean
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.data import Dataset
from tensorflow_addons.layers import CRF
from tensorflow_addons.text import crf_log_likelihood
from tensorflow_addons.metrics import F1Score
from tensorflow_addons.optimizers import Lookahead, SGDW
from metrics import f1score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from pandas import read_csv
from ast import literal_eval
from itertools import chain
from utils import post_pad_sequences
from tensorflow.keras.utils import to_categorical

# Small English spaCy pipeline; the data-preparation cells below read `token.pos_` from its output.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Optimisation hyper-parameters.
LEARNING_RATE = 1.5  # NOTE(review): large step size for an SGD-style optimizer — confirm it is intended
BATCH_SIZE = 128
EPOCHS = 128
MAX_LEN = 128  # sequence length used for padding and for the model's Input shape
WEIGHT_DECAY = 1e-3

# Label vocabulary. '<', '>' and '$' are the start / end / pad symbols passed
# to post_pad_sequences when the label sequences are padded.
tag2idx = {tag: index for index, tag in enumerate("BIOES<>$")}

# Part-of-speech vocabulary; any POS outside the four kept classes is mapped to "OTHER".
# NOTE(review): "<START>", "<END>" and "<PAD>" are presumably the default markers
# emitted by post_pad_sequences — confirm against utils.
pos2idx = {
    name: index
    for index, name in enumerate(
        ("NOUN", "PROPN", "VERB", "ADJ", "OTHER", "<START>", "<END>", "<PAD>")
    )
}


def convert_to_one_hot(values):
    """Return one-hot rows (nested lists of ints) for the tag indices in ``values``.

    Each row has ``len(tag2idx)`` entries.
    """
    identity = np.eye(len(tag2idx), dtype=int)
    return identity[values].tolist()

In [None]:
def build_crf_model(tag_size=len(tag2idx), feature_size=None):
    """Build a Keras model made of a single CRF layer.

    Args:
        tag_size: number of units given to the CRF layer (the tag vocabulary size).
        feature_size: width of each per-token input vector. When ``None`` it
            falls back to ``tag_size``, which is the behaviour this function
            always had; pass it explicitly when the input encoding (e.g. the
            POS one-hot) has a different width than the tag vocabulary.

    Returns:
        A ``Model`` whose outputs are the four values returned by the CRF
        layer, in order: decoded sequence, potentials, sequence length, kernel.
    """
    if feature_size is None:
        # Backward-compatible default: input width equals the tag count.
        feature_size = tag_size
    x = Input(shape=(MAX_LEN, feature_size), dtype=tf.int32)
    decoded_sequence, potentials, sequence_length, kernel = CRF(tag_size)(x)
    return Model(inputs=x, outputs=[decoded_sequence, potentials, sequence_length, kernel])

In [None]:
def crf_loss_func(potentials, y, sequence_lengths, kernel):
    """Return the mean negative CRF log-likelihood.

    Args:
        potentials: potentials produced by the CRF layer.
        y: gold tag indices.
        sequence_lengths: lengths forwarded to ``crf_log_likelihood``.
        kernel: CRF transition kernel forwarded to ``crf_log_likelihood``.

    Returns:
        The negated log-likelihood, averaged over all of its elements.
    """
    # crf_log_likelihood returns a pair; only its first element is needed here.
    log_likelihood = crf_log_likelihood(potentials, y, sequence_lengths, kernel)[0]
    return tf.reduce_mean(tf.negative(log_likelihood))

In [None]:
# Training split: space-separated file without a header; every row has three
# columns, of which the first (stringified token list) and the second
# (stringified label list) are used.
data = read_csv("../data/train_290818.txt",
                sep=" ",
                header=None,
                encoding="utf-8").values.tolist()

# Inputs: tokens -> spaCy POS tags -> coarse POS classes -> padded -> one-hot.
text = [literal_eval(words) for (words, _, _) in data]
# NOTE(review): the tokens are joined and re-tokenised by spaCy, so the number
# of POS tags may differ from the number of labels — confirm alignment.
text = [[token.pos_ for token in nlp(' '.join(s))] for s in text]
text = [[p if p in ("NOUN", "PROPN", "VERB", "ADJ") else "OTHER" for p in sent] for sent in text]
text = post_pad_sequences(text, maxlen=MAX_LEN, return_masks=True)
# The one-hot width is the size of the POS vocabulary being encoded (pos2idx),
# not the size of the tag vocabulary.
encoded_input = np.array([
    to_categorical([pos2idx[p] for p in sent], num_classes=len(pos2idx))
    for sent in text["seq"]
])

# Targets: keep only the part of each label before the first '-', pad with the
# start/end/pad symbols and map to integer tag indices.
labels = [[l.split('-')[0] for l in literal_eval(labels)] for (_, labels, _) in data]
labels = post_pad_sequences(labels, maxlen=MAX_LEN, return_masks=False, start='<', end='>', pad='$')
extended_labels = [[tag2idx[l] for l in lbls] for lbls in labels["seq"]]

# Batches of (one-hot POS input, mask, tag indices).
train_dataset = Dataset.from_tensor_slices((encoded_input, text["mask"], extended_labels)).batch(batch_size=BATCH_SIZE)

In [None]:
# Test split: space-separated file without a header; every row has three
# columns, of which the first (stringified token list) and the second
# (stringified label list) are used.
data = read_csv("../data/test_290818.txt",
                sep=" ",
                header=None,
                encoding="utf-8").values.tolist()

# Inputs: tokens -> spaCy POS tags -> coarse POS classes -> padded -> one-hot.
text = [literal_eval(words) for (words, _, _) in data]
# NOTE(review): the tokens are joined and re-tokenised by spaCy, so the number
# of POS tags may differ from the number of labels — confirm alignment.
text = [[token.pos_ for token in nlp(' '.join(s))] for s in text]
text = [[p if p in ("NOUN", "PROPN", "VERB", "ADJ") else "OTHER" for p in sent] for sent in text]
text = post_pad_sequences(text, maxlen=MAX_LEN, return_masks=True)
# The one-hot width is the size of the POS vocabulary being encoded (pos2idx),
# not the size of the tag vocabulary.
encoded_input = np.array([
    to_categorical([pos2idx[p] for p in sent], num_classes=len(pos2idx))
    for sent in text["seq"]
])

# Targets: keep only the part of each label before the first '-', pad with the
# start/end/pad symbols and map to integer tag indices.
labels = [[l.split('-')[0] for l in literal_eval(labels)] for (_, labels, _) in data]
labels = post_pad_sequences(labels, maxlen=MAX_LEN, start='<', end='>', pad='$', return_masks=False)
extended_labels = [[tag2idx[l] for l in lbls] for lbls in labels["seq"]]

# Batches of (one-hot POS input, mask, tag indices).
test_dataset = Dataset.from_tensor_slices((encoded_input, text["mask"], extended_labels)).batch(batch_size=BATCH_SIZE)

In [None]:
# CRF model built with the default tag vocabulary size.
model = build_crf_model()

# SGDW (SGD with weight decay, Nesterov momentum) wrapped in Lookahead.
# The weight decay comes from the WEIGHT_DECAY config constant.
optimizer = Lookahead(SGDW(learning_rate=LEARNING_RATE, momentum=0.9, nesterov=True, weight_decay=WEIGHT_DECAY))

In [None]:
def train_step(x, mask, y):
    """Run one optimisation step on a single batch.

    Args:
        x: input batch passed to the model.
        mask: mask forwarded to the model call.
        y: gold tag indices for the batch.

    Returns:
        Tuple ``(loss, f1scores)``: the batch CRF loss and a one-element list
        holding the batch F1 score.
    """
    # Only the forward pass and the loss need to be recorded for differentiation.
    with tf.GradientTape() as tape:
        yp, potentials, sequence_length, kernel = model(x, mask=mask, training=True)
        loss = crf_loss_func(potentials, y, sequence_length, kernel)

    # NOTE(review): the decoded tags are reversed along the last axis before
    # scoring; the reason is not evident from this file — confirm.
    yp = tf.reverse(yp, axis=[-1])
    f1scores = [f1score(y.numpy(), yp.numpy())]

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss, f1scores

In [None]:
def inference_step(x, mask, y):
    """Evaluate the model on a single batch without updating any weights.

    Args:
        x: input batch passed to the model.
        mask: mask forwarded to the model call.
        y: gold tag indices for the batch.

    Returns:
        Tuple ``(loss, f1scores)``: the batch CRF loss and a one-element list
        holding the batch F1 score.
    """
    yp, potentials, sequence_length, kernel = model(x, mask=mask, training=False)
    # NOTE(review): the decoded tags are reversed along the last axis before
    # scoring; the reason is not evident from this file — confirm.
    yp = tf.reverse(yp, axis=[-1])
    loss = crf_loss_func(potentials, y, sequence_length, kernel)
    f1scores = [f1score(y.numpy(), yp.numpy())]
    return loss, f1scores

In [None]:
# Train for EPOCHS passes over train_dataset, reporting per-epoch averages.
for epoch in range(1, EPOCHS+1):
    losses, f1scores = [], []
    for batch in train_dataset:
        loss, f1 = train_step(*batch)
        # train_step returns the F1 score wrapped in a list; flatten it here.
        f1scores.extend(f1)
        # Store a plain Python float so the epoch mean is taken over all batches.
        losses.append(float(loss))
    # Report the mean over every batch of the epoch, not only the last batch's loss.
    print(f"loss: {np.mean(losses):.5f}, f1score: {np.mean(f1scores):.5f}")