In [1]:
import torch
import numpy as np
from preprocessing import cyrillize

from get_embeddings import get_noise_dampening_embedding, get_sub_word_tokenization_embedding, get_fast_text_embedding

import json
import random
from math import floor

In [2]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{
        'comment': cyrillize(d['comment']),
        'label': d['label'],
        'examples': [cyrillize(e) for e in d['examples']] if 'examples' in d else None
    } for d in json.load(f) if 'label' in d]

In [3]:
def gamma(model, embedded_s, embedded_pad, window_size):
    with torch.no_grad():
        s = embedded_s
        for _ in range(len(s), window_size):
            s = np.vstack([s, embedded_pad[0]])
        Z = model([s])
        return torch.argmax(Z[0]).item()

def test_model(model, embedding, testing_set, window_size):
    tp, fn, fp, tn = 0, 0, 0, 0
    embedded_pad = embedding('[pad]')
    for comment in testing_set:
        embedded_s = embedding(comment['comment'])
        if len(embedded_s) == 0:
            continue
        if gamma(model, embedded_s, embedded_pad, window_size):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def k_cross_validation(model, supervised_comments, embedding, window_size, k, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = []
        for comment in train_records:
            comment_embedding = embedding(comment['comment'])
            if len(comment_embedding) > 0:
                training_set.append((comment_embedding, 0 if comment['label'] == 'n' else 1))
            if comment['examples'] != None:
                training_set += [(embedding(e), 1) for e in comment['examples']]

        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]
        if balanced_classes and len(positive_train) < floor(p_n_rate*len(negative_train)):
            positive_train = random.choices(positive_train, k=floor(p_n_rate*len(negative_train)))
        print(len(positive_train), len(negative_train))

        train_sampled_data = positive_train + negative_train

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y)

        tp, fn, fp, tn = test_model(model, embedding, test_records, window_size)
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

In [4]:
batch_size = 32
input_size = 100
device = 'cuda'
pad_embedding = np.zeros((input_size))

In [5]:
sub_word_embedding = get_sub_word_tokenization_embedding(input_size)
noise_dampening_embedding = get_noise_dampening_embedding(input_size, device)


In [15]:
class ConvolutionClassifier(torch.nn.Module):
    def __init__(self, filterSize, filterCount, classesCount, padTokenEmb):
        super(ConvolutionClassifier, self).__init__()
        self.filterSize = filterSize
        # convolution of the input
        self.paddedTokenEmb = padTokenEmb
        self.convolution = torch.nn.Conv1d(in_channels=input_size, out_channels=filterCount, kernel_size=filterSize)
        self.dropout = torch.nn.Dropout(0.5)
        self.classProjection = torch.nn.Linear(filterCount,classesCount)

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(max(len(s) for s in source), self.filterSize)
        sents_padded = np.array([ np.vstack([s] + (m-len(s))*[self.paddedTokenEmb]) for s in source])
        return torch.tensor(sents_padded, device=device)

    def forward(self, source):
        X = self.preparePaddedBatch(source)
        E = torch.transpose(X,1,2)
        ### Очаква се Е да е тензор с размер (batch_size, embed_size, max_sent_len)

        U,_ = torch.max(self.convolution(E.type(torch.float32)), dim=2)
        Z = self.classProjection(self.dropout(U))
        return Z

    def fit(self, X, Y):
        self.convolution.reset_parameters()
        self.classProjection.reset_parameters()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        self.train()
        for epoch in range(10):
            np.random.shuffle(idx)
            for b in range(0, len(idx), batch_size):
                batch = [ X[i] for i in idx[b:min(b+batch_size, len(idx))] ]
                Y = np.array(Y)
                target = torch.tensor(Y[idx[b:min(b+batch_size, len(idx))]], dtype = torch.long, device = device)

                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
        self.eval()

Sub word embedding after spell checking

In [7]:
sub_word_embedding = get_sub_word_tokenization_embedding(input_size, spell_corection=True)

Loading dictionary...
Processing dictionary...
Copied 165258 words to master dictionary...
Copied 1118365 hashes to master dictionary...


In [16]:
print_test_model(*k_cross_validation(
    ConvolutionClassifier(5, 400, 2, sub_word_embedding('[pad]')).to(device),
    supervised_comments,
    sub_word_embedding,
    window_size=5,
    k=10,
    balanced_classes=True,
    p_n_rate=1
))

3221 2717
3259 2702
3265 2694
3288 2685
3278 2695
3311 2675
3185 2724
2845 2845
2994 2785
3191 2727
3526 3021
Precision: 0.38251366120218583
Recall: 0.15086206896551724
F1-score: 0.21638330757341576
Confusion Matrix:
                Predicted p Predicted n
Actual p          11.053   62.211
Actual n          17.842  141.474


0.21638330757341576

Sub word embedding

In [17]:
sub_word_embedding = get_sub_word_tokenization_embedding(input_size, spell_corection=False)

In [18]:
print_test_model(*k_cross_validation(ConvolutionClassifier(5, 400, 2, sub_word_embedding('[pad]')).to(device),
    supervised_comments,
    sub_word_embedding,
    window_size=5,
    k=10,
    balanced_classes=True,
    p_n_rate=1
))

3221 2717
3259 2702
3265 2694
3288 2685
3278 2695
3311 2675
3185 2724
2845 2845
2994 2785
3191 2727
3526 3021
Precision: 0.34802158273381295
Recall: 0.2780172413793104
F1-score: 0.30910543130990414
Confusion Matrix:
                Predicted p Predicted n
Actual p          20.368   52.895
Actual n          38.158  121.158


0.30910543130990414

Noise dampening embedding

In [19]:
noise_dampening_embedding = get_noise_dampening_embedding(input_size, 'cuda')

In [20]:
print_test_model(*k_cross_validation(ConvolutionClassifier(5, 400, 2, noise_dampening_embedding('[pad]')).to(device),
    supervised_comments,
    sub_word_embedding,
    window_size=5,
    k=10,
    balanced_classes=True,
    p_n_rate=1
))

3221 2717
3259 2702
3265 2694
3288 2685
3278 2695
3311 2675
3185 2724
2845 2845
2994 2785
3191 2727
3526 3021
Precision: 0.3035143769968051
Recall: 0.4094827586206896
F1-score: 0.3486238532110092
Confusion Matrix:
                Predicted p Predicted n
Actual p          30.000   43.263
Actual n          68.842   90.474


0.3486238532110092

Fast text embedding

In [21]:
fast_text_embedding = get_fast_text_embedding()

In [22]:
print_test_model(*k_cross_validation(ConvolutionClassifier(5, 400, 2, fast_text_embedding('[pad]')).to(device),
    supervised_comments,
    sub_word_embedding,
    window_size=5,
    k=10,
    balanced_classes=True,
    p_n_rate=1
))

3221 2717
3259 2702
3265 2694
3288 2685
3278 2695
3311 2675
3185 2724
2845 2845
2994 2785
3191 2727
3526 3021
Precision: 0.30225711481844947
Recall: 0.2212643678160919
F1-score: 0.25549564496059723
Confusion Matrix:
                Predicted p Predicted n
Actual p          16.211   57.053
Actual n          37.421  121.895


0.25549564496059723