In [1]:
import torch
import numpy as np
from preprocessing import cyrillize

from get_embeddings import get_noise_dampening_embedding, get_sub_word_tokenization_embedding

import json
import random
from math import floor

In [2]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{
        'comment': cyrillize(d['comment']),
        'label': d['label'],
        'examples': [cyrillize(e) for e in d['examples']] if 'examples' in d else None
    } for d in json.load(f) if 'label' in d]

In [3]:
def gamma(model, embedded_s, embedded_pad, window_size):
    with torch.no_grad():
        s = embedded_s
        for _ in range(len(s), window_size):
            s = np.vstack([s, embedded_pad[0]])
        Z = model([s])
        return torch.argmax(Z[0]).item()

def test_model(model, embedding, testing_set, window_size):
    tp, fn, fp, tn = 0, 0, 0, 0
    embedded_pad = embedding('[pad]')
    for comment in testing_set:
        embedded_s = embedding(comment['comment'])
        if len(embedded_s) == 0:
            continue
        if gamma(model, embedded_s, embedded_pad, window_size):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def k_cross_validation(model, supervised_comments, embedding, window_size, k, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = []
        for comment in train_records:
            comment_embedding = embedding(comment['comment'])
            if len(comment_embedding) > 0:
                training_set.append((comment_embedding, 0 if comment['label'] == 'n' else 1))
            if comment['examples'] != None:
                training_set += [(embedding(e), 1) for e in comment['examples']]

        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]
        train_sampled_data = positive_train + negative_train
        if balanced_classes:
            train_sampled_data = negative_train
            train_sampled_data += random.choices(positive_train, k=floor(p_n_rate*len(positive_train)))

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y)

        tp, fn, fp, tn = test_model(model, embedding, test_records, window_size)
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

In [4]:
batch_size = 32
input_size = 100
device = 'cuda'
pad_embedding = np.zeros((input_size))

In [6]:
sub_word_embedding = get_sub_word_tokenization_embedding(input_size)
# noise_dampening_embedding = get_noise_dampening_embedding(input_size, device)

In [22]:
class ConvolutionClassifier(torch.nn.Module):
    def __init__(self, filterSize, filterCount, classesCount, padTokenEmb):
        super(ConvolutionClassifier, self).__init__()
        # convolution of the input
        self.paddedTokenEmb = padTokenEmb
        self.convolution = torch.nn.Conv1d(in_channels=input_size, out_channels=filterCount, kernel_size=filterSize)
        self.dropout = torch.nn.Dropout(0.5)
        self.classProjection = torch.nn.Linear(filterCount,classesCount)

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents_padded = np.array([ np.vstack([s] + (m-len(s))*[self.paddedTokenEmb]) for s in source])
        # sents_padded = np.vstack([np.pad(s, (0, m - len(s)), 'constant', constant_values=self.paddedTokenEmb[0]) for s in source])
        return torch.tensor(sents_padded, device=device)

    def forward(self, source):

        X = self.preparePaddedBatch(source)
        E = torch.transpose(X,1,2)
        ### Очаква се Е да е тензор с размер (batch_size, embed_size, max_sent_len)

        U,_ = torch.max(self.convolution(E.type(torch.float32)), dim=2)
        Z = self.classProjection(self.dropout(U))
        return Z

    def fit(self, X, Y):
        self.convolution.reset_parameters()
        self.classProjection.reset_parameters()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        self.train()
        for epoch in range(10):
            np.random.shuffle(idx)
            for b in range(0, len(idx), batch_size):
                batch = [ X[i] for i in idx[b:min(b+batch_size, len(idx))] ]
                Y = np.array(Y)
                target = torch.tensor(Y[idx[b:min(b+batch_size, len(idx))]], dtype = torch.long, device = device)

                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
                # if b % 10 == 0:
                #     print(b, '/', len(idx), H.item())
        self.eval()

In [26]:
classModel = ConvolutionClassifier(7, 400, 2, sub_word_embedding('[pad]')).to(device)

In [27]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, window_size=7, k=10, balanced_classes=True, p_n_rate=1))

Precision: 0.32
Recall: 0.18045112781954886
F1-score: 0.23076923076923075
Confusion Matrix:
                Predicted p Predicted n
Actual p          16.000   72.667
Actual n          34.000  146.333


0.23076923076923075

Noise dampening embedding

In [None]:
classModel = ConvolutionClassifier(5, 400, 2, noise_dampening_embedding('[PAD]')).to(device)

In [None]:
print_test_model(*k_cross_validation(classModel, supervised_comments, noise_dampening_embedding, window_size=5, k=10, balanced_classes=True, p_n_rate=1))