In [None]:
import csv
import chars2vec
import re
import numpy as np
import keras.backend as K
from tensorflow.keras import datasets, layers, models, losses, callbacks
from sklearn.model_selection import train_test_split
import statistics

In [None]:
def f1(predictions, gold):
    """
    Offset-level F1 score (equivalently the DICE coefficient) of two offset lists.

    Duplicated offsets are collapsed before scoring, so each offset counts once.

    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714

    :param predictions: a list of predicted offsets (e.g. character positions)
    :param gold: a list of ground-truth offsets
    :return: a score between 0 and 1; when either list is empty the result is
        the integer 1 (both empty) or 0 (exactly one empty)
    """
    gold_is_empty = len(gold) == 0
    predictions_are_empty = len(predictions) == 0

    # Degenerate cases: the ratio below is undefined or trivially zero.
    if gold_is_empty:
        return 1 if predictions_are_empty else 0
    if predictions_are_empty:
        return 0

    predicted_offsets = set(predictions)
    gold_offsets = set(gold)
    overlap = len(predicted_offsets & gold_offsets)
    total = len(predicted_offsets) + len(gold_offsets)
    return float(2 * overlap) / float(total)

In [None]:
def read_data_text(filename):
    """Return the ``text`` column of a CSV file as a list of strings.

    The file must have a header row containing a ``text`` column; one list
    entry is produced per data row, in file order.

    :param filename: path of the CSV file to read
    :return: list with the ``text`` value of every row
    :raises KeyError: if the file has no ``text`` column
    """
    # newline='' is the mode the csv module documents for reading: without it
    # the file object rewrites '\r\n' inside quoted fields to '\n', so the
    # returned text would no longer match the characters stored in the file.
    with open(filename, newline='') as csvfile:
        return [row['text'] for row in csv.DictReader(csvfile)]

In [None]:
def read_data_spans(filename):
    """Return the ``spans`` column of a CSV file as lists of integer offsets.

    Each ``spans`` cell is expected to hold a bracketed, comma-separated list
    of integers such as ``[]``, ``[3, 4, 5]`` or ``[3,4,5]``. Whitespace around
    the brackets and around each number is ignored, and an empty cell is read
    as an empty list.

    :param filename: path of the CSV file to read
    :return: one list of ints per data row, in file order
    :raises KeyError: if the file has no ``spans`` column
    :raises ValueError: if a cell contains a token that is not an integer
    """
    data = []
    # newline='' is the mode the csv module documents for reading files.
    with open(filename, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # Drop the enclosing brackets, then split on bare commas so the
            # parse does not depend on the exact ", " separator. int() itself
            # tolerates surrounding whitespace; blank tokens (from "[]" or
            # "[ ]") are skipped.
            inner = row['spans'].strip()[1:-1]
            data.append([int(token) for token in inner.split(',') if token.strip()])
    return data

Read the test text and labels

In [None]:
# Both columns live in the same CSV file, so its path is spelled out once.
# The file is read twice: first for the texts, then for the span offsets.
TEST_CSV = 'data/tsd_test_readable.csv'
texts, spans = read_data_text(TEST_CSV), read_data_spans(TEST_CSV)

Inspect the text

In [None]:
# Preview only the first few texts; displaying the whole list floods the
# notebook output.
texts[:5]

Inspect the spans

In [None]:
# Preview only the first few span lists; displaying the whole list floods the
# notebook output.
spans[:5]

Encode the test texts as a NumPy array of per-character chars2vec embeddings (1024 character positions × 50 dimensions per text)

In [None]:
# Geometry of the array built below: every text occupies MAX_TEXT_LEN
# character positions, each holding a CHAR_EMBED_DIM-dimensional vector.
# Positions past the end of a text stay all-zero.
MAX_TEXT_LEN = 1024
CHAR_EMBED_DIM = 50

test_X = np.zeros(shape=(len(texts), MAX_TEXT_LEN, CHAR_EMBED_DIM))
c2v_model = chars2vec.load_model('eng_50')

# Embed each distinct character only once and reuse the vector afterwards;
# every character is still vectorised on its own, exactly as before.
char_vectors = {}
for text_idx, string in enumerate(texts):
    # Characters beyond MAX_TEXT_LEN have no slot in test_X (indexing them
    # raised IndexError), so over-long texts are truncated.
    for char_idx, char in enumerate(string[:MAX_TEXT_LEN]):
        if char not in char_vectors:
            char_vectors[char] = c2v_model.vectorize_words([char])[0]
        test_X[text_idx, char_idx] = char_vectors[char]

In [51]:
# Restore the trained network saved under MODEL_PATH and run it on the
# embedded test set. y_pred is later iterated per text and per position
# (presumably one row of class scores per character — TODO confirm shape).
MODEL_PATH = "DeconvNet_deep_model_300_epochs_sko"
model = models.load_model(MODEL_PATH)
y_pred = model.predict(test_X)

In [57]:
def fix_word_boundaries(span, text):
    """Widen a list of character offsets so it covers whole words.

    ``text`` is scanned left to right and cut into words at every space
    character (``' '`` only; tabs and newlines count as part of a word).
    A word is selected when any of its characters, or the space that ends it,
    has its offset in ``span``. The offsets of all characters of every selected
    word are returned in ascending order.

    Offsets of spaces are never returned, even when they are present in
    ``span``:

    >>> fix_word_boundaries([4, 5], "aa bbbb cc")
    [3, 4, 5, 6]
    >>> fix_word_boundaries([5, 6, 7, 8], "aa bbbb cc")
    [3, 4, 5, 6, 8, 9]

    NOTE(review): the example comment originally attached to this function
    listed an in-span space offset in its expected output, which this
    implementation does not produce — confirm which behaviour is intended.

    :param span: iterable of integer character offsets into ``text``
    :param text: the string the offsets refer to
    :return: list of offsets covering every selected word
    """
    # Set membership keeps the per-character check O(1) instead of scanning
    # the whole span list for every character.
    flagged_offsets = set(span)

    widened = []
    word_offsets = []       # offsets of the word currently being scanned
    word_is_flagged = False
    for offset, char in enumerate(text):
        if offset in flagged_offsets:
            word_is_flagged = True
        if char == ' ':
            # A space closes the current word: keep it if flagged, then reset.
            if word_is_flagged:
                widened.extend(word_offsets)
            word_offsets = []
            word_is_flagged = False
        else:
            word_offsets.append(offset)

    # The last word has no closing space, so flush it after the scan.
    if word_is_flagged:
        widened.extend(word_offsets)
    return widened

#fix_word_boundaries([6,7,8,9,10,11,12,13,14,15,16,28,29,30], "You fucking Moron you silly cunt")

In [55]:
# Score every test text separately, then report the mean of the per-text F1.
scores = []
for sample_idx, sample_pred in enumerate(y_pred):
    # A position is predicted as part of a span when class 0 has the highest
    # score (presumably class 0 is the "toxic" class — TODO confirm).
    # NOTE(review): every position of the prediction is considered; positions
    # beyond the end of the text are not filtered out — confirm this is intended.
    predicted_offsets = [
        offset
        for offset, class_scores in enumerate(sample_pred)
        if np.argmax(class_scores) == 0
    ]
    # Optional post-processing, currently disabled:
    # predicted_offsets = fix_word_boundaries(predicted_offsets, texts[sample_idx])
    scores.append(f1(predicted_offsets, spans[sample_idx]))

print('avg F1 %g' % statistics.mean(scores))
# 0.559889 (score noted by the author; which configuration produced it is not recorded here)

avg F1 0.539761
