In [2]:
import csv
import chars2vec
import re
import numpy as np
import keras.backend as K
from tensorflow.keras import datasets, layers, models, losses, callbacks
from sklearn.model_selection import train_test_split
import statistics

In [3]:
def f1(predictions, gold):
    """
    F1 (a.k.a. DICE) operating on two lists of offsets (e.g., character).
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1
    """
    if len(gold) == 0:
        return 1 if len(predictions) == 0 else 0
    if len(predictions) == 0:
        return 0
    predictions_set = set(predictions)
    gold_set = set(gold)
    nom = 2 * len(predictions_set.intersection(gold_set))
    denom = len(predictions_set) + len(gold_set)
    return float(nom)/float(denom)

In [4]:
def read_data_text(filename):
    """Reads csv file with python, text."""
    data = []
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        count = 0
        for row in reader:
            data.append(row['text'])
    csvfile.close()
    return data

In [5]:
def read_data_spans(filename):
    """Reads csv file with python, text."""
    data = []
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        count = 0
        for row in reader:
            if row['spans'] == '[]' or row['spans'] == []:
                data.append([])
            else:
                data.append([int(j) for j in row['spans'][1:-1].split(", ")])
    csvfile.close()
    return data

Read the test text and labels

In [10]:
texts = read_data_text('data/tsd_test_readable.csv')
spans = read_data_spans('data/tsd_test_readable.csv')

Inspect the text

In [11]:
texts

["That's right. They are not normal. And I am starting from the premise that they are ABNORMAL. Proceed wth the typical racist, bigot, sexist rubbish.\nThanks!",
 '"Watch people die from taking away their healthcare"\n\nDING DING DING!  Winner of stupid post of the day award!',
 'tens years ago i contacted the PDR and suggested that the time might be good to work with alaska on building a gas line.. alaska rejected them without even consideration despite china being flush with cash and hungry for gas.. and set up another infamous boondoggle.. the transcanada-exxon rip off that we are still paying for and have yet to receive anything of value.. hundreds of millions of dollars on studies.. and buyouts.. i hope china says f you alaska.. you are nothing but ignorant people..',
 'The parallels between the ANC and the Sicilian Mafia are glaring. The ANC has always been run by a few "families" who treat the state as \'turf\' ; as just one big piggy bank for their self-enrichment. The governme

Inspect the spans

In [12]:
spans

[[84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146],
 [81, 82, 83, 84, 85, 86],
 [],
 [],
 [],
 [129, 130, 131, 132, 133, 134],
 [35, 36, 37, 38, 39, 40, 41, 42, 43],
 [55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129],
 [37, 38, 39, 40, 41, 158, 159, 160, 161, 204, 205, 206, 207, 208],
 [94, 95, 96, 97],
 [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64],
 [201, 202, 203, 204, 205, 206, 207, 208, 209, 210],
 [30,
  31,
  32,
  33,
  34,
  35,
  36,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  52,
  53,
  54,
  55,
  56,
  57,
  58],
 [134, 135, 136, 137, 138, 139],
 [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [],
 [2, 3, 4, 5],
 [],
 [],
 [0, 1, 2, 3, 4],
 [12, 13, 14, 

Load the test data into a numpy array

In [13]:
test_X = np.zeros(shape=(len(texts), 1024, 50))
c2v_model = chars2vec.load_model('eng_50')
for x, string in enumerate(texts):
    for y, char in enumerate(string):
            char_vect = c2v_model.vectorize_words([char])
            test_X[x][y] = [word_vect for word_vect in char_vect[0]]

In [17]:
model = models.load_model(f"DeconvNet_deep_checkpoint_sko")
y_pred = model.predict(test_X)

In [18]:
scores = []
for x, pred in enumerate(y_pred):
    y_pred_f1_compatible = [j for j, i in enumerate(pred) if np.argmax(i) == 0]
    y_true_f1_compatible = spans[x]
    score = f1(y_pred_f1_compatible, y_true_f1_compatible)
    scores.append(score)

print('avg F1 %g' % statistics.mean(scores))

avg F1 0.559889
