In [1]:
import torch
import numpy as np
from preprocessing import cyrillize

from get_embeddings import get_noise_dampening_embedding, get_sub_word_tokenization_embedding, get_fast_text_embedding

import json
import random
from math import floor

In [2]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{
        'comment': cyrillize(d['comment']),
        'label': d['label'],
        'examples': [cyrillize(e) for e in d['examples']] if 'examples' in d else None
    } for d in json.load(f) if 'label' in d]

In [3]:
def gamma(model, embedded_s, embedded_pad, window_size):
    with torch.no_grad():
        s = embedded_s
        for _ in range(len(s), window_size):
            s = np.vstack([s, embedded_pad[0]])
        Z = model([s])
        return torch.argmax(Z[0]).item()

def test_model(model, embedding, testing_set, window_size):
    tp, fn, fp, tn = 0, 0, 0, 0
    embedded_pad = embedding('[pad]')
    for comment in testing_set:
        embedded_s = embedding(comment['comment'])
        if len(embedded_s) == 0:
            continue
        if gamma(model, embedded_s, embedded_pad, window_size):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def k_cross_validation(model, supervised_comments, embedding, window_size, k, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = []
        for comment in train_records:
            comment_embedding = embedding(comment['comment'])
            if len(comment_embedding) > 0:
                training_set.append((comment_embedding, 0 if comment['label'] == 'n' else 1))
            if comment['examples'] != None:
                training_set += [(embedding(e), 1) for e in comment['examples'] if len(embedding(e)) > 0]

        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]
        if balanced_classes:
            if len(positive_train) < floor(p_n_rate*len(negative_train)):
                positive_train = random.choices(positive_train, k=floor(p_n_rate*len(negative_train)))
            else:
                negative_train = random.choices(negative_train, k=floor(1/p_n_rate*len(positive_train)))
        print('pos : neg =', len(positive_train), ':', len(negative_train))

        train_sampled_data = positive_train + negative_train

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y)

        tp, fn, fp, tn = test_model(model, embedding, test_records, window_size)
        print(f"tp: {tp}, fn: {fn}, fp: {fp}, tn: {tn}")
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

In [4]:
batch_size = 32
input_size = 100
device = 'cuda'
pad_embedding = np.zeros((input_size))

In [5]:
class ConvolutionClassifier(torch.nn.Module):
    def __init__(self, filterSize, filterCount, classesCount, padTokenEmb):
        super(ConvolutionClassifier, self).__init__()
        self.filterSize = filterSize
        # convolution of the input
        self.paddedTokenEmb = padTokenEmb
        self.convolution = torch.nn.Conv1d(in_channels=input_size, out_channels=filterCount, kernel_size=filterSize)
        self.dropout = torch.nn.Dropout(0.5)
        self.classProjection = torch.nn.Linear(filterCount,classesCount)

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(max(len(s) for s in source), self.filterSize)
        sents_padded = np.array([ np.vstack([s] + (m-len(s))*[self.paddedTokenEmb]) for s in source])
        return torch.tensor(sents_padded, device=device)

    def forward(self, source):
        X = self.preparePaddedBatch(source)
        E = torch.transpose(X,1,2)
        ### Очаква се Е да е тензор с размер (batch_size, embed_size, max_sent_len)

        U,_ = torch.max(self.convolution(E.type(torch.float32)), dim=2)
        Z = self.classProjection(self.dropout(U))
        return Z

    def fit(self, X, Y):
        self.convolution.reset_parameters()
        self.classProjection.reset_parameters()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        self.train()
        for epoch in range(10):
            np.random.shuffle(idx)
            for b in range(0, len(idx), batch_size):
                batch = [ X[i] for i in idx[b:min(b+batch_size, len(idx))] ]
                Y = np.array(Y)
                target = torch.tensor(Y[idx[b:min(b+batch_size, len(idx))]], dtype = torch.long, device = device)

                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
        self.eval()

Sub word embedding after spell checking

In [6]:
sub_word_embedding = get_sub_word_tokenization_embedding(input_size, spell_corection=True)

Loading dictionary...
Processing dictionary...
Copied 165258 words to master dictionary...
Copied 1118365 hashes to master dictionary...


In [7]:
print_test_model(*k_cross_validation(
    ConvolutionClassifier(5, 400, 2, sub_word_embedding('[pad]')).to(device),
    supervised_comments,
    sub_word_embedding,
    window_size=5,
    k=10,
    balanced_classes=False,
    p_n_rate=1
))

pos : neg = 3221 : 2717
tp: 72, fn: 59, fp: 142, tn: 168
pos : neg = 3259 : 2702
tp: 12, fn: 104, fp: 21, tn: 304
pos : neg = 3265 : 2694
tp: 39, fn: 69, fp: 110, tn: 223
pos : neg = 3288 : 2685
tp: 2, fn: 97, fp: 12, tn: 330
pos : neg = 3278 : 2695
tp: 1, fn: 108, fp: 6, tn: 326
pos : neg = 3311 : 2675
tp: 86, fn: 3, fp: 325, tn: 27
pos : neg = 3185 : 2724
tp: 131, fn: 7, fp: 273, tn: 30
pos : neg = 2812 : 2845
tp: 8, fn: 251, fp: 6, tn: 176
pos : neg = 2994 : 2785
tp: 120, fn: 79, fp: 133, tn: 109
pos : neg = 3191 : 2727
tp: 10, fn: 131, fp: 15, tn: 285
pos : neg = 3526 : 3021
tp: 2, fn: 1, fp: 5, tn: 1
Precision: 0.3154800783801437
Recall: 0.3469827586206896
F1-score: 0.3304823811152925
Confusion Matrix:
                Predicted p Predicted n
Actual p          25.421   47.842
Actual n          55.158  104.158


0.3304823811152925

Sub word embedding

In [8]:
sub_word_embedding_no_spell_correction = get_sub_word_tokenization_embedding(input_size, spell_corection=False)

In [9]:
print_test_model(*k_cross_validation(ConvolutionClassifier(5, 400, 2, sub_word_embedding('[pad]')).to(device),
    supervised_comments,
    sub_word_embedding_no_spell_correction,
    window_size=5,
    k=10,
    balanced_classes=False,
    p_n_rate=1
))

pos : neg = 3221 : 2717
tp: 4, fn: 127, fp: 8, tn: 302
pos : neg = 3259 : 2702
tp: 0, fn: 116, fp: 1, tn: 324
pos : neg = 3265 : 2694
tp: 21, fn: 87, fp: 50, tn: 283
pos : neg = 3288 : 2685
tp: 0, fn: 99, fp: 0, tn: 342
pos : neg = 3278 : 2695
tp: 0, fn: 109, fp: 3, tn: 329
pos : neg = 3311 : 2675
tp: 29, fn: 60, fp: 76, tn: 276
pos : neg = 3185 : 2724
tp: 8, fn: 130, fp: 24, tn: 279
pos : neg = 2812 : 2845
tp: 73, fn: 186, fp: 50, tn: 132
pos : neg = 2994 : 2785
tp: 3, fn: 196, fp: 3, tn: 239
pos : neg = 3191 : 2727
tp: 45, fn: 96, fp: 89, tn: 211
pos : neg = 3526 : 3021
tp: 2, fn: 1, fp: 6, tn: 0
Precision: 0.37373737373737376
Recall: 0.1329022988505747
F1-score: 0.196078431372549
Confusion Matrix:
                Predicted p Predicted n
Actual p           9.737   63.526
Actual n          16.316  143.000


0.196078431372549

Noise dampening embedding

In [10]:
noise_dampening_embedding = get_noise_dampening_embedding(input_size, 'cuda')

In [11]:
print_test_model(*k_cross_validation(ConvolutionClassifier(5, 400, 2, noise_dampening_embedding('[pad]')).to(device),
    supervised_comments,
    noise_dampening_embedding,
    window_size=5,
    k=10,
    balanced_classes=False,
    p_n_rate=1
))

pos : neg = 3220 : 2714
tp: 5, fn: 126, fp: 3, tn: 307
pos : neg = 3258 : 2699
tp: 12, fn: 104, fp: 13, tn: 312
pos : neg = 3264 : 2691
tp: 30, fn: 78, fp: 50, tn: 283
pos : neg = 3287 : 2683
tp: 49, fn: 50, fp: 90, tn: 251
pos : neg = 3278 : 2692
tp: 0, fn: 109, fp: 1, tn: 331
pos : neg = 3310 : 2672
tp: 77, fn: 12, fp: 223, tn: 129
pos : neg = 3184 : 2723
tp: 2, fn: 136, fp: 2, tn: 299
pos : neg = 2811 : 2842
tp: 69, fn: 190, fp: 26, tn: 156
pos : neg = 2993 : 2782
tp: 6, fn: 193, fp: 3, tn: 239
pos : neg = 3190 : 2724
tp: 50, fn: 91, fp: 79, tn: 221
pos : neg = 3525 : 3018
tp: 3, fn: 0, fp: 6, tn: 0
Precision: 0.3792240300375469
Recall: 0.21767241379310343
F1-score: 0.27658603377453217
Confusion Matrix:
                Predicted p Predicted n
Actual p          15.947   57.316
Actual n          26.105  133.053


0.27658603377453217

Fast text embedding

In [12]:
fast_text_embedding = get_fast_text_embedding()

In [13]:
print_test_model(*k_cross_validation(ConvolutionClassifier(5, 400, 2, fast_text_embedding('[pad]')).to(device),
    supervised_comments,
    fast_text_embedding,
    window_size=5,
    k=10,
    balanced_classes=False,
    p_n_rate=1
))

pos : neg = 3220 : 2714
tp: 72, fn: 59, fp: 30, tn: 280
pos : neg = 3258 : 2699
tp: 20, fn: 96, fp: 4, tn: 321
pos : neg = 3264 : 2691
tp: 37, fn: 71, fp: 11, tn: 322
pos : neg = 3287 : 2683
tp: 8, fn: 91, fp: 1, tn: 340
pos : neg = 3278 : 2692
tp: 23, fn: 86, fp: 4, tn: 328
pos : neg = 3310 : 2672
tp: 86, fn: 3, fp: 293, tn: 59
pos : neg = 3184 : 2723
tp: 72, fn: 66, fp: 34, tn: 267
pos : neg = 2811 : 2842
tp: 118, fn: 141, fp: 16, tn: 166
pos : neg = 2993 : 2782
tp: 184, fn: 15, fp: 131, tn: 111
pos : neg = 3190 : 2724
tp: 125, fn: 16, fp: 178, tn: 122
pos : neg = 3525 : 3018
tp: 3, fn: 0, fp: 4, tn: 2
Precision: 0.5144429160935351
Recall: 0.5373563218390804
F1-score: 0.5256500351370343
Confusion Matrix:
                Predicted p Predicted n
Actual p          39.368   33.895
Actual n          37.158  122.000


0.5256500351370343