In [101]:
import torch
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize
from sklearn.decomposition import TruncatedSVD

import json
import random
from math import floor

In [119]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{'comment': cyrillize(d['comment']), 'label': d['label']} for d in json.load(f) if 'label' in d]

In [6]:
def get_sub_word_tokenization_embedding(dim=100):
    tokenizer = Tokenizer.from_file("data/tokenizer_comments.json")
    token2ind = tokenizer.get_vocab()
    ind2token = lambda x: tokenizer.id_to_token(x)

    with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
        unsupervised_comments = json.load(f)

    tokenized_unsupervised_comments = [tokenizer.encode(c).tokens for c in unsupervised_comments]

    n_words = tokenizer.get_vocab_size()
    X=np.zeros((n_words,n_words))
    for s in ["[UNK]", "[PAD]", "[STR]", "[END]"]:
        X[token2ind[s], token2ind[s]] = 1
    for comment in tokenized_unsupervised_comments:
        for wi in range(len(comment)):
            if comment[wi] not in token2ind: continue
            i=token2ind[comment[wi]]
            for k in range(1,4+1):
                if wi-k>=0 and comment[wi-k] in token2ind:
                    j=token2ind[comment[wi-k]]
                    X[i,j] += 1
                if wi+k<len(comment) and comment[wi+k] in token2ind:
                    j=token2ind[comment[wi+k]]
                    X[i,j] += 1

    svd = TruncatedSVD(n_components=dim, n_iter=10)
    svd.fit(X)
    X_reduced = svd.transform(X)

    return lambda comment: np.stack([X_reduced[token2ind[token]] for token in tokenizer.encode(comment).tokens])

In [95]:
def gamma(model, embedding, s, window_size):
    with torch.no_grad():
        s = embedding(s)
        for _ in range(len(s), window_size):
            s = np.vstack([s, embedding('[PAD]')[0]])
        Z = model([s])
        return torch.argmax(Z[0]).item()

def test_model(model, embedding, testing_set, window_size):
    tp, fn, fp, tn = 0, 0, 0, 0
    for comment in testing_set:
        if gamma(model, embedding, comment['comment'], window_size):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def k_cross_validation(model, supervised_comments, embedding, window_size, k, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = [
            (embedding(comment['comment']), 0 if comment['label'] == 'n' else 1)
            for comment in train_records
        ]
        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]
        train_sampled_data = positive_train + negative_train
        if balanced_classes:
            train_sampled_data = negative_train
            train_sampled_data += random.choices(positive_train, k=floor(p_n_rate*len(positive_train)))

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y)

        tp, fn, fp, tn = test_model(model, embedding, test_records, window_size)
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

In [7]:
batch_size = 32
input_size = 100
device = 'cuda'

embedding = get_sub_word_tokenization_embedding(input_size)

In [104]:
class ConvolutionClassifier(torch.nn.Module):
    def __init__(self, filterSize, filterCount, classesCount, padTokenEmb):
        super(ConvolutionClassifier, self).__init__()
        # convolution of the input
        self.paddedTokenEmb = padTokenEmb
        self.convolution = torch.nn.Conv1d(in_channels=input_size, out_channels=filterCount, kernel_size=filterSize)
        self.dropout = torch.nn.Dropout(0.5)
        self.classProjection = torch.nn.Linear(filterCount,classesCount)

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents_padded = [ np.vstack([s] + (m-len(s))*[self.paddedTokenEmb]) for s in source ]
        return torch.tensor(sents_padded, dtype=torch.long, device=device)

    def forward(self, source):
        X = self.preparePaddedBatch(source)
        E = torch.transpose(X,1,2)
        ### Очаква се Е да е тензор с размер (batch_size, embed_size, max_sent_len)

        U,_ = torch.max(self.convolution(E.type(torch.float32)), dim=2)
        Z = self.classProjection(self.dropout(U))
        return Z

    def fit(self, X, Y):
        self.convolution.reset_parameters()
        self.classProjection.reset_parameters()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        self.train()
        for epoch in range(10):
            np.random.shuffle(idx)
            for b in range(0, len(idx), batch_size):
                batch = [ X[i] for i in idx[b:min(b+batch_size, len(idx))] ]
                Y = np.array(Y)
                target = torch.tensor(Y[idx[b:min(b+batch_size, len(idx))]], dtype = torch.long, device = device)

                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
                # if b % 10 == 0:
                #     print(b, '/', len(idx), H.item())
        self.eval()

In [105]:
classModel = ConvolutionClassifier(5, 400, 2, embedding('[PAD]')).to(device)
print_test_model(*k_cross_validation(classModel, supervised_comments, embedding, window_size=5, k=10, balanced_classes=True, p_n_rate=1))

Precision: 0.25726744186046513
Recall: 0.2161172161172161
F1-score: 0.2349037823490378
Confusion Matrix:
                Predicted p Predicted n
Actual p          10.412   37.765
Actual n          30.059  102.765


0.2349037823490378

In [124]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    examples = [{'comment': cyrillize(example), 'label': 'p'}
                for comment in json.load(f)
                if 'examples' in comment
                for example in comment['examples']
    ]

supervised_comments_and_examples = supervised_comments + examples

print_test_model(*k_cross_validation(classModel, supervised_comments_and_examples, embedding, window_size=5, k=10, balanced_classes=False))

Precision: 0.4868055555555555
Recall: 0.35137844611528823
F1-score: 0.40815138282387187
Confusion Matrix:
                Predicted p Predicted n
Actual p          53.923   99.538
Actual n          56.846  116.846


0.40815138282387187