In [None]:
!pip install chars2vec
import csv
import chars2vec
import re
import numpy as np
import keras.backend as K
from tensorflow.keras import datasets, layers, models, losses, callbacks, Model
import tensorflow as tf
from sklearn.model_selection import train_test_split
import statistics
from google.colab import drive

# Mount Google Drive into the Colab file system; the data files and saved models live there.
drive.mount('/content/gdrive')
# Base folder for saved models (prefix used by the model.save calls further down).
# NOTE(review): this is a relative path, so it presumably relies on the working directory being /content — confirm.
root_path = 'gdrive/My Drive/Colab Notebooks/'
from keras.utils.vis_utils import plot_model


Collecting chars2vec
[?25l  Downloading https://files.pythonhosted.org/packages/04/0a/8c327aae23e0532d239ec7b30446aca765eb5d9547b4c4b09cdd82e49797/chars2vec-0.1.7.tar.gz (8.1MB)
[K     |████████████████████████████████| 8.1MB 5.7MB/s 
[?25hBuilding wheels for collected packages: chars2vec
  Building wheel for chars2vec (setup.py) ... [?25l[?25hdone
  Created wheel for chars2vec: filename=chars2vec-0.1.7-cp37-none-any.whl size=8111096 sha256=c629a9b705168d016bfb627a770d8d49277f90363c5013aef2fd4bd705e6b9c1
  Stored in directory: /root/.cache/pip/wheels/97/b6/65/d7e778ef1213ec77d315aea0f536068b96e36cc94c02abbfde
Successfully built chars2vec
Installing collected packages: chars2vec
Successfully installed chars2vec-0.1.7
Mounted at /content/gdrive


In [None]:
def f1(predictions, gold):
    """
    Offset-level F1 (equivalently, the DICE coefficient) between two lists of offsets.
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: predicted offsets (duplicates count once)
    :param gold: ground-truth offsets (duplicates count once)
    :return: a score between 0 and 1; 1 if both lists are empty, 0 if exactly one of them is
    """
    if not len(gold):
        # Nothing to find: the prediction is perfect only when it is empty too.
        return 0 if len(predictions) else 1
    if not len(predictions):
        return 0
    predicted, expected = set(predictions), set(gold)
    overlap = len(predicted & expected)
    return float(2 * overlap) / float(len(predicted) + len(expected))

In [None]:
def read_text_data(filename):
    """Read the ``text`` column of a CSV file.

    The file is opened with ``newline=''`` as the ``csv`` module requires.
    Without it, line breaks inside quoted fields are translated (``\\r\\n``
    becomes ``\\n``), which changes the text length and shifts the character
    offsets that the span annotations refer to.

    :param filename: path to a CSV file whose header row contains ``text``
    :return: list with the ``text`` value of every row, in file order
    :raises KeyError: if a row has no ``text`` column
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, newline='') as csvfile:
        return [row['text'] for row in csv.DictReader(csvfile)]

In [None]:
def read_data_span(filename):
    """Read the ``span`` column of a CSV file.

    The values are returned as the raw strings stored in the file (for
    example ``"[0, 1, 2]"``); parsing them into integers is left to the caller.
    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    rows whose other fields contain quoted line breaks are read correctly.

    :param filename: path to a CSV file whose header row contains ``span``
    :return: list with the ``span`` string of every row, in file order
    :raises KeyError: if a row has no ``span`` column
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, newline='') as csvfile:
        return [row['span'] for row in csv.DictReader(csvfile)]

In [None]:
def f1_loss(y_true, y_pred):
    """Soft F1 loss: ``1 - mean(F1)``.

    True positives, false positives and false negatives are computed as
    products of ``y_true`` and ``y_pred`` summed along axis 0. ``K.epsilon()``
    keeps the divisions finite, and any NaN F1 entry is replaced by 0 before
    the mean is taken.

    NOTE(review): the visible ``model.compile`` call uses
    ``categorical_crossentropy``, so this loss appears to be unused here.

    :param y_true: ground-truth tensor (presumably 0/1 valued — confirm against caller)
    :param y_pred: prediction tensor, multiplied element-wise with ``y_true``
    :return: scalar tensor equal to one minus the mean F1
    """
    # True negatives are not part of precision/recall, so they are not computed.
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [None]:
# Load the train split followed by the trial split into two parallel lists:
# raw comment texts and their (still unparsed) span strings.
texts = []
spans = []
for data_file in ('gdrive/My Drive/Colab Notebooks/Data/tsd_train_readable.csv',
                  'gdrive/My Drive/Colab Notebooks/Data/tsd_trial_readable.csv'):
    texts.extend(read_text_data(data_file))
    spans.extend(read_data_span(data_file))

# Accumulators that the preprocessing cell appends to.
processed_texts, processed_spans = [], []
lengths_match = len(texts) == len(spans)
print(f"Lengths equal: {lengths_match}" + "\n")

Lengths equal: True



In [None]:
# Preprocess data: turn every usable (text, span) pair into per-character one-hot labels.
# Label encoding per character position:
#   [1,0,0] = toxic, [0,1,0] = non-toxic, [0,0,1] = padding after the end of the text.
c2v_model = chars2vec.load_model('eng_50')
word_limit = 1000  # despite the name, this is a limit in characters
# Iterate over every sample, including the last one.
for text, span_str in zip(texts, spans):
    # Skip empty comments and comments that do not fit into the fixed-size window.
    if text == "" or len(text) > word_limit:
        continue
    # Span strings look like "[3, 4, 5]"; "[]" yields an empty offset list instead of a crash.
    span_body = span_str.strip()[1:-1].strip()
    new_spans = [int(j) for j in span_body.split(",")] if span_body else []
    # Skip samples whose annotation points outside the text (negative offsets would wrap around).
    if any(char_offset < 0 or char_offset >= len(text) for char_offset in new_spans):
        continue
    if new_spans:
        toxic_offsets = set(new_spans)
    else:
        # NOTE(review): no annotated offsets -> the whole comment is labelled toxic, which is
        # what the original (unreachable) fallback branch did. Confirm this labelling policy.
        toxic_offsets = set(range(len(text)))
    # Start from all-padding, then label every character of the text, including the last one.
    full_span = [[0,0,1] for _ in range(word_limit)]
    for j in range(len(text)):
        full_span[j] = [1,0,0] if j in toxic_offsets else [0,1,0]
    processed_texts.append(text)
    processed_spans.append(full_span)

In [None]:
# Get the maximum comment size (in no. of chars).
# max() over a generator visits every processed comment, including the last one;
# default=0 keeps the cell working when nothing survived preprocessing.
max_size = max((len(text) for text in processed_texts), default=0)

In [None]:
# Display the longest retained comment length; the training arrays are sized with this value.
max_size

1000

In [None]:
# Allocate the zero-filled training arrays: one row per sample, max_size positions per row,
# 3 label classes for the targets and 50 embedding values for the inputs.
label_shape = (len(processed_spans), max_size, 3)
feature_shape = (len(processed_texts), max_size, 50)
train_Y = np.zeros(label_shape)
train_X = np.zeros(feature_shape)

In [None]:
# The raw inputs are no longer needed once the processed lists exist; drop both names.
del texts, spans

In [None]:
# Build Train_X: embed every character of every comment with chars2vec.
for sample_idx, comment in enumerate(processed_texts):
    for char_idx, character in enumerate(comment):
        # Each character is vectorised on its own, passed as a one-element list.
        embedding = c2v_model.vectorize_words([character])[0]
        train_X[sample_idx, char_idx] = embedding

In [None]:
# Build train_Y: copy each per-character one-hot label into the label array.
for sample_idx, label_rows in enumerate(processed_spans):
    for char_idx, one_hot in enumerate(label_rows):
        train_Y[sample_idx, char_idx] = one_hot

In [None]:
# Build test_X, Test_Y
# Hold out 10% of the samples as a test split; random_state=42 fixes the shuffle.
# NOTE(review): this rebinds train_X/train_Y, so re-running the cell splits the
# already reduced training set again — re-run the array-building cells first.
train_X, test_X, train_Y, test_Y = train_test_split(train_X, train_Y, test_size=0.1, random_state=42)
class High_Score:
    """Mutable holder for a single score value, initialised to 0."""

    def __init__(self):
        self.high_score = 0

    def set_high_score(self, new_score):
        """Overwrite the stored score with ``new_score``; no comparison is made here."""
        self.high_score = new_score

    def get_high_score(self):
        """Return the currently stored score."""
        return self.high_score


# Shared tracker that the prediction callback reads and updates across epochs.
high_score = High_Score()

In [None]:
# Sanity check: print the label array shape first, then the feature array shape.
for split_array in (train_Y, train_X):
    print(split_array.shape)

(7102, 1000, 3)
(7102, 1000, 50)


In [None]:
# The Python lists have been copied into the numpy arrays; drop both names.
del processed_texts, processed_spans

In [None]:
class PredictionCallback(callbacks.Callback):
    """Keras callback that scores the model on the held-out split after every epoch.

    For each comment in ``test_X`` the positions whose arg-max class is 0 are
    compared with the same positions in ``test_Y`` using ``f1``. The mean of
    these scores is printed, and whenever it beats the value stored in
    ``high_score`` the model is saved as a checkpoint under ``root_path``.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, checkpoint on improvement, and print the mean F1.

        :param epoch: index of the epoch that just finished (unused)
        :param logs: metrics dict passed by Keras (unused); defaults to None
            rather than a shared mutable ``{}``
        """
        y_pred = self.model.predict(test_X)
        scores = []
        for pred, gold in zip(y_pred, test_Y):
            # One arg-max call per comment instead of one per character.
            predicted_offsets = [j for j, cls in enumerate(np.argmax(pred, axis=-1)) if cls == 0]
            gold_offsets = [j for j, cls in enumerate(np.argmax(gold, axis=-1)) if cls == 0]
            scores.append(f1(predicted_offsets, gold_offsets))
        score = statistics.mean(scores)
        if score > high_score.get_high_score():
            high_score.set_high_score(score)
            # Save the model this callback is attached to, not whatever the global
            # name ``model`` happens to point at.
            self.model.save(f"{root_path}model_autoencoder_LSTM_checkpoint")
        print(f"F1 score: {score}")

In [None]:
# create architecture
#model = models.Sequential()
# vocabulary size — number of unique words in data
# length of vector with which each word is represented
#model.add(layers.Input(shape = train_X.shape[1:]))
# add an LSTM layer which contains 64 LSTM cells
# True — return whole sequence; False — return single output of the end of the sequence
#model.add(layers.Dropout(0.3))
#model.add(layers.GRU(128, return_sequences=True))
#model.add(layers.RepeatVector(1000))
#model.add(layers.GRU(256, return_sequences=True))
#model.add(layers.Dropout(0.3))
#model.add(layers.TimeDistributed(layers.Dense(3, activation='softmax')))
#compile model
#model.compile(loss      =  'categorical_crossentropy',
#                  optimizer =  'adam',
#                  metrics   =  ['acc'])
# check summary of the model
#model.summary()

In [None]:
# Convolutional encoder -> bidirectional GRU bottleneck -> transposed-convolution decoder.
# Layers are added in exactly this order; the repeated stages are expressed as loops.
model = models.Sequential()
model.add(layers.Input(shape=train_X.shape[1:]))

# Encoder: three conv stages, each followed by a stride-2 max pooling.
for n_filters in (32, 64, 128):
    model.add(layers.Conv1D(filters=n_filters, kernel_size=9, strides=1, padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.ELU())
    model.add(layers.Dropout(0.3))
    model.add(layers.MaxPooling1D(strides=2))

# Last encoder convolution: no dropout and no pooling.
model.add(layers.Conv1D(filters=256, kernel_size=9, strides=1, padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.ELU())

# Bottleneck: two bidirectional GRU layers that return the full sequence.
for _ in range(2):
    model.add(layers.Bidirectional(layers.GRU(units=128, return_sequences=True)))
    model.add(layers.BatchNormalization())
    model.add(layers.ELU())

# First decoder convolution: no dropout and no upsampling.
model.add(layers.Conv1D(filters=256, kernel_size=9, strides=1, padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.ELU())

# Decoder: three transposed-conv stages, each followed by an upsampling step.
for n_filters in (128, 64, 32):
    model.add(layers.Conv1DTranspose(filters=n_filters, kernel_size=9, strides=1, padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.ELU())
    model.add(layers.Dropout(0.3))
    model.add(layers.UpSampling1D())

# Per-position softmax over the 3 label classes.
model.add(layers.Conv1D(filters=3, kernel_size=9, strides=1, padding='same', activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()
history = model.fit(
    train_X,
    train_Y,
    epochs=50,
    batch_size=32,
    callbacks=[PredictionCallback()],
)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_18 (Conv1D)           (None, 1000, 32)          14432     
_________________________________________________________________
batch_normalization_24 (Batc (None, 1000, 32)          128       
_________________________________________________________________
elu_24 (ELU)                 (None, 1000, 32)          0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 1000, 32)          0         
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 500, 32)           0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 500, 64)           18496     
_________________________________________________________________
batch_normalization_25 (Batc (None, 500, 64)          



INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.006034458730994374
Epoch 4/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.009553035793875234
Epoch 5/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.019456867928864433
Epoch 6/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.09873299805105297
Epoch 7/50
F1 score: 0.08910621200998453
Epoch 8/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.14981685866152983
Epoch 9/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.19539076504520045
Epoch 10/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.3014676041275092
Epoch 11/50
F1 score: 0.2932663708256974
Epoch 12/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.3188535414981981
Epoch 13/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.38936335029416913
Epoch 14/50
F1 score: 0.3413555259700044
Epoch 15/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.4293238553532709
Epoch 16/50
F1 score: 0.2996917584637766
Epoch 17/50
F1 score: 0.39550156497790884
Epoch 18/50
F1 score: 0.41654259606516864
Epoch 19/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.4320319610351293
Epoch 20/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.4383081610919049
Epoch 21/50
F1 score: 0.40163812121458176
Epoch 22/50
F1 score: 0.3294835535311839
Epoch 23/50
F1 score: 0.34891268127858543
Epoch 24/50
F1 score: 0.4295851314628016
Epoch 25/50
F1 score: 0.39142255614836524
Epoch 26/50
F1 score: 0.40951163518959793
Epoch 27/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.45637428828421656
Epoch 28/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.46612300881586316
Epoch 29/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.47350949800390874
Epoch 30/50
F1 score: 0.4142939207510077
Epoch 31/50




INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/model_autoencoder_LSTM_checkpoint/assets


F1 score: 0.5096803665217129
Epoch 32/50
F1 score: 0.475586323094409
Epoch 33/50
F1 score: 0.4412608308979224
Epoch 34/50
F1 score: 0.4751487403661602
Epoch 35/50
F1 score: 0.45884260042756636
Epoch 36/50
F1 score: 0.4569140100331053
Epoch 37/50
F1 score: 0.47104639630220396
Epoch 38/50

KeyboardInterrupt: ignored

In [None]:
# Save the final model, then report the mean per-comment F1 on the held-out split.
model.save(f"{root_path}model_autoencoder_LSTM_50_epochs")
pred_Y = model.predict(test_X)

scores = []
for pred, gold in zip(pred_Y, test_Y):
    # Offsets whose arg-max class is 0 (the toxic class in the label encoding).
    y_pred_f1_compatible = [j for j, cls in enumerate(np.argmax(pred, axis=-1)) if cls == 0]
    y_true_f1_compatible = [j for j, cls in enumerate(np.argmax(gold, axis=-1)) if cls == 0]
    # No special case for "empty" gold labels: each gold entry is a 2-D array, so comparing
    # it with [] can never select that case (and raises on recent NumPy). f1() already
    # handles empty offset lists.
    scores.append(f1(y_pred_f1_compatible, y_true_f1_compatible))

print('avg F1 %g' % statistics.mean(scores))


In [None]:
# Print details for the first 101 held-out comments (indices 0 through 100).
for x, pred in enumerate(pred_Y[:101]):
    # Compute each offset list once and reuse it for both the score and the printout.
    predicted_span = [j for j, i in enumerate(pred) if np.argmax(i) == 0]
    gold_span = [j for j, i in enumerate(test_Y[x]) if np.argmax(i) == 0]
    one_hot_classes = [np.argmax(i) for i in pred]
    score = f1(predicted_span, gold_span)
    print(f"F1 score: {score}")
    print(f"Predicted span one_hot: {one_hot_classes}")
    print(f"Predicted span: {predicted_span}")
    print(f"Ground truth span: {gold_span}" + "\n")
    