In [1]:
import spacy
import numpy as np
import tensorflow as tf
from time import time
from tensorflow.keras.metrics import Mean
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.data import Dataset
from tensorflow_addons.layers import CRF
from tensorflow_addons.text import crf_log_likelihood
from tensorflow_addons.metrics import F1Score
from tensorflow_addons.optimizers import SGDW, Lookahead, RectifiedAdam
from tensorflow.keras.optimizers import RMSprop
from metrics import f1score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from pandas import read_csv
from ast import literal_eval
from itertools import chain

# Load the spaCy "en_core_web_sm" pipeline once at start-up; the data-preparation
# cells call `nlp(...)` on each sentence and read every token's `pos_` attribute.
nlp = spacy.load("en_core_web_sm")

In [2]:
# --- Training hyper-parameters -------------------------------------------
# NOTE(review): 1.5 is an unusually large step size for RMSprop — confirm it is intentional.
LEARNING_RATE = 1.5
BATCH_SIZE = 128
EPOCHS = 128
MAX_LEN = 128
# NOTE(review): not referenced by any cell visible here — confirm whether it is still needed.
WEIGHT_DECAY = 1e-4

# --- Vocabularies ----------------------------------------------------------
# Tag symbols -> integer ids; '<', '>' and '$' are the start / end / pad symbols
# that the label-preparation cells pass to post_pad_sequences.
tag2idx = {tag: idx for idx, tag in enumerate(['B', 'I', 'O', 'E', 'S', '<', '>', '$'])}
# Coarse part-of-speech classes -> integer ids, plus the default start / end / pad markers.
pos2idx = {
    pos: idx
    for idx, pos in enumerate(
        ["NOUN", "PROPN", "VERB", "ADJ", "OTHER", "<START>", "<END>", "<PAD>"]
    )
}


def convert_to_one_hot(values):
    """Return one-hot rows (as nested Python lists) for integer id(s) in ``values``.

    The width of every row is ``len(tag2idx)``; indexing an identity matrix
    with the ids selects the matching one-hot rows.
    """
    identity = np.eye(len(tag2idx), dtype=int)
    return identity[values].tolist()

In [3]:
def build_crf_model(tag_size=len(tag2idx)):
    """Build a Keras model consisting of a single CRF layer.

    Args:
        tag_size: number of units given to the CRF layer and size of the last
            input axis; defaults to ``len(tag2idx)`` (evaluated at definition time).

    Returns:
        A ``Model`` taking an int32 input of shape ``(MAX_LEN, tag_size)`` per
        example and returning the four CRF outputs in this order:
        decoded sequence, potentials, sequence length, kernel.
    """
    features = Input(shape=(MAX_LEN, tag_size), dtype=tf.int32)
    crf_layer = CRF(tag_size)
    decoded, potentials, seq_len, kernel = crf_layer(features)
    model_outputs = [decoded, potentials, seq_len, kernel]
    return Model(inputs=features, outputs=model_outputs)

In [4]:
def crf_loss_func(potentials, y, sequence_lengths, kernel):
    """Return the mean negative CRF log-likelihood.

    The four arguments are forwarded unchanged, in this order, to
    ``crf_log_likelihood``; its second return value is ignored.
    """
    log_likelihood, _unused = crf_log_likelihood(potentials, y, sequence_lengths, kernel)
    negative_log_likelihood = tf.negative(log_likelihood)
    return tf.reduce_mean(negative_log_likelihood)

In [5]:
def post_pad_sequences(sequences, maxlen=None, start="<START>", end="<END>", pad="<PAD>", return_masks=True):
    """Wrap each sequence in start/end markers and right-pad the batch to one length.

    Args:
        sequences: sized collection of token lists. Each item must be a ``list``
            because it is concatenated with the marker lists.
        maxlen: optional cap on the output length, markers included. It only
            takes effect when ``maxlen - 2`` does not exceed the longest input;
            otherwise the batch is padded to ``longest + 2`` (NOT to ``maxlen``).
        start: marker prepended to every sequence.
        end: marker appended after the (possibly truncated) sequence.
        pad: filler appended after ``end``.
        return_masks: when true, also build 0/1 masks (0 wherever an output
            token equals ``pad``, 1 elsewhere).

    Returns:
        ``{"seq": padded_sequences, "mask": masks_or_None}``. An empty batch
        yields ``{"seq": [], "mask": []}`` (or ``None`` for the mask when
        ``return_masks`` is false) instead of raising.
    """
    if len(sequences) == 0:
        # max() over an empty batch would raise ValueError; nothing to pad.
        return {"seq": [], "mask": [] if return_masks else None}

    longest = max(len(x) for x in sequences)
    if maxlen is None or maxlen - 2 > longest:
        # No truncation needed: pad everything up to the longest sequence.
        padded = [[start] + x + [end] + [pad] * (longest - len(x)) for x in sequences]
    else:
        # Reserve two slots for the markers; slicing is a no-op on short inputs.
        body_len = maxlen - 2
        padded = [
            [start] + x[:body_len] + [end] + [pad] * max(body_len - len(x), 0)
            for x in sequences
        ]

    masks = None
    if return_masks:
        masks = [[0 if token == pad else 1 for token in seq] for seq in padded]
    return {"seq": padded, "mask": masks}

In [6]:
# Training split: each row is unpacked as (words, labels, <ignored third column>).
data = read_csv(
    "../data/train_290818.txt",
    sep=" ",
    header=None,
    encoding="utf-8",
).values.tolist()

# Features: POS tag of every spaCy token, collapsed to NOUN / PROPN / VERB / ADJ / OTHER.
# NOTE(review): the words are joined with spaces and re-tokenized by spaCy, so the
# POS sequence may not line up one-to-one with the label sequence — confirm.
text = [literal_eval(raw_words) for raw_words, _, _ in data]
text = [
    [tok.pos_ if tok.pos_ in ("NOUN", "PROPN", "VERB", "ADJ") else "OTHER" for tok in nlp(' '.join(words))]
    for words in text
]
text = post_pad_sequences(text, maxlen=MAX_LEN)
encoded_input = [
    convert_to_one_hot([pos2idx[pos] for pos in pos_seq])
    for pos_seq in text["seq"]
]

# Labels: keep only the part of each tag before the first '-', then map to ids.
labels = [
    [raw_tag.partition('-')[0] for raw_tag in literal_eval(raw_tags)]
    for _, raw_tags, _ in data
]
labels = post_pad_sequences(labels, maxlen=MAX_LEN, start='<', end='>', pad='$', return_masks=False)
extended_labels = [[tag2idx[tag] for tag in tag_seq] for tag_seq in labels["seq"]]

# Batches of (one-hot POS features, mask, label ids).
train_dataset = Dataset.from_tensor_slices(
    (encoded_input, text["mask"], extended_labels)
).batch(batch_size=BATCH_SIZE)

In [7]:
# Test split: each row is unpacked as (words, labels, <ignored third column>).
data = read_csv(
    "../data/test_290818.txt",
    sep=" ",
    header=None,
    encoding="utf-8",
).values.tolist()

# Features: POS tag of every spaCy token, collapsed to NOUN / PROPN / VERB / ADJ / OTHER.
# NOTE(review): the words are joined with spaces and re-tokenized by spaCy, so the
# POS sequence may not line up one-to-one with the label sequence — confirm.
text = [literal_eval(raw_words) for raw_words, _, _ in data]
text = [
    [tok.pos_ if tok.pos_ in ("NOUN", "PROPN", "VERB", "ADJ") else "OTHER" for tok in nlp(' '.join(words))]
    for words in text
]
text = post_pad_sequences(text, maxlen=MAX_LEN)
encoded_input = [
    convert_to_one_hot([pos2idx[pos] for pos in pos_seq])
    for pos_seq in text["seq"]
]

# Labels: keep only the part of each tag before the first '-', then map to ids.
labels = [
    [raw_tag.partition('-')[0] for raw_tag in literal_eval(raw_tags)]
    for _, raw_tags, _ in data
]
labels = post_pad_sequences(labels, maxlen=MAX_LEN, start='<', end='>', pad='$', return_masks=False)
extended_labels = [[tag2idx[tag] for tag in tag_seq] for tag_seq in labels["seq"]]

# Batches of (one-hot POS features, mask, label ids).
test_dataset = Dataset.from_tensor_slices(
    (encoded_input, text["mask"], extended_labels)
).batch(batch_size=BATCH_SIZE)

In [8]:
# Instantiate the CRF model with its default tag_size (len(tag2idx)).
model = build_crf_model()

# RMSprop running at LEARNING_RATE, wrapped in tensorflow_addons' Lookahead optimizer.
# NOTE(review): WEIGHT_DECAY is not passed to either optimizer here — confirm that is intended.
optimizer = Lookahead(RMSprop(learning_rate=LEARNING_RATE))

In [9]:
def train_step(x, mask, y):
    """Run one optimisation step on a single batch and report its loss and F1.

    Uses the module-level ``model`` and ``optimizer``. Runs eagerly (it calls
    ``.numpy()`` on tensors), so it must not be wrapped in ``tf.function``.

    Args:
        x: batch of model inputs.
        mask: batch mask, forwarded to the model as ``mask=``.
        y: batch of gold tag ids, used for both the CRF loss and the F1 score.

    Returns:
        ``(loss, f1)`` — the batch loss and the batch F1 score, each reduced
        with ``np.mean``.
    """
    with tf.GradientTape() as tape:
        # Only the forward pass and the loss need to be recorded on the tape.
        yp, potentials, sequence_length, kernel = model(x, mask=mask)
        loss = crf_loss_func(potentials, y, sequence_length, kernel)

    # Metric is computed outside the tape: it is NumPy-only and takes no part in the gradient.
    # NOTE(review): predictions are reversed along the last axis before scoring
    # while `y` is not — confirm this matches the decoder's output order.
    yp = tf.reverse(yp, axis=[-1])
    f1 = f1score(y.numpy(), yp.numpy())

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return np.mean(loss), np.mean(f1)

In [None]:
# Train for EPOCHS passes over train_dataset.
for epoch in range(1, EPOCHS+1):
    start_time = time()
    for (ids, masks, lbls) in train_dataset:
        loss, f1 = train_step(ids, masks, lbls)
        # Report the per-batch metrics instead of discarding them; the format
        # matches the "avg_loss: …, avg_f1score: …" lines in the recorded output.
        print(f"avg_loss: {loss:.3f}, avg_f1score: {f1:.3f}")
    # Wall-clock seconds spent on this epoch.
    print(f"{(time() - start_time):.2f}")

avg_loss: 44.475, avg_f1score: 0.006
avg_loss: 37.376, avg_f1score: 0.000
avg_loss: 45.180, avg_f1score: 0.000
avg_loss: 40.039, avg_f1score: 0.000
avg_loss: 29.757, avg_f1score: 0.000
avg_loss: 17.358, avg_f1score: 0.000
avg_loss: 10.158, avg_f1score: 0.000
avg_loss: 22.331, avg_f1score: 0.000
avg_loss: 12.543, avg_f1score: 0.000
avg_loss: 22.184, avg_f1score: 0.000
avg_loss: 8.444, avg_f1score: 0.000
avg_loss: 9.193, avg_f1score: 0.000
avg_loss: 9.440, avg_f1score: 0.000
avg_loss: 30.675, avg_f1score: 0.000
avg_loss: 12.960, avg_f1score: 0.000
avg_loss: 14.116, avg_f1score: 0.000
avg_loss: 38.041, avg_f1score: 0.000
avg_loss: 14.265, avg_f1score: 0.000
12.74
avg_loss: 12.631, avg_f1score: 0.000
avg_loss: 25.858, avg_f1score: 0.000
avg_loss: 26.452, avg_f1score: 0.000
avg_loss: 14.378, avg_f1score: 0.000
avg_loss: 23.974, avg_f1score: 0.000
avg_loss: 19.450, avg_f1score: 0.000
avg_loss: 13.946, avg_f1score: 0.000
avg_loss: 11.737, avg_f1score: 0.000
avg_loss: 17.549, avg_f1score: 0.00

avg_loss: 14.527, avg_f1score: 0.000
avg_loss: 11.338, avg_f1score: 0.000
avg_loss: 30.530, avg_f1score: 0.000
avg_loss: 29.738, avg_f1score: 0.000
avg_loss: 21.770, avg_f1score: 0.000
avg_loss: 12.943, avg_f1score: 0.003
avg_loss: 16.774, avg_f1score: 0.000
avg_loss: 7.436, avg_f1score: 0.000
avg_loss: 9.552, avg_f1score: 0.000
avg_loss: 14.536, avg_f1score: 0.000
avg_loss: 10.507, avg_f1score: 0.030
avg_loss: 16.188, avg_f1score: 0.000
avg_loss: 2.570, avg_f1score: 0.000
11.91
avg_loss: 13.626, avg_f1score: 0.000
avg_loss: 14.280, avg_f1score: 0.000
avg_loss: 28.333, avg_f1score: 0.000
avg_loss: 17.785, avg_f1score: 0.008
avg_loss: 9.030, avg_f1score: 0.000
avg_loss: 10.419, avg_f1score: 0.000
avg_loss: 15.658, avg_f1score: 0.000
avg_loss: 18.309, avg_f1score: 0.000
avg_loss: 27.219, avg_f1score: 0.000
avg_loss: 16.185, avg_f1score: 0.000
avg_loss: 12.580, avg_f1score: 0.010
avg_loss: 18.546, avg_f1score: 0.000
avg_loss: 8.724, avg_f1score: 0.000
avg_loss: 14.929, avg_f1score: 0.000


avg_loss: 23.163, avg_f1score: 0.000
avg_loss: 11.125, avg_f1score: 0.000
avg_loss: 6.563, avg_f1score: 0.000
avg_loss: 18.762, avg_f1score: 0.000
avg_loss: 31.668, avg_f1score: 0.000
avg_loss: 23.418, avg_f1score: 0.000
avg_loss: 15.761, avg_f1score: 0.000
avg_loss: 7.129, avg_f1score: 0.000
11.95
avg_loss: 14.512, avg_f1score: 0.000
avg_loss: 25.198, avg_f1score: 0.000
avg_loss: 27.748, avg_f1score: 0.000
avg_loss: 17.340, avg_f1score: 0.000
avg_loss: 10.652, avg_f1score: 0.000
avg_loss: 15.375, avg_f1score: 0.000
avg_loss: 10.669, avg_f1score: 0.000
avg_loss: 15.575, avg_f1score: 0.000
avg_loss: 21.938, avg_f1score: 0.000
avg_loss: 12.800, avg_f1score: 0.000
avg_loss: 5.709, avg_f1score: 0.000
avg_loss: 6.336, avg_f1score: 0.000
avg_loss: 7.901, avg_f1score: 0.000
avg_loss: 14.987, avg_f1score: 0.000
avg_loss: 23.434, avg_f1score: 0.000
avg_loss: 13.649, avg_f1score: 0.015
avg_loss: 6.310, avg_f1score: 0.000
avg_loss: 2.166, avg_f1score: 0.000
12.17
avg_loss: 9.922, avg_f1score: 0.0