In [2]:
import torch
import torch.nn as nn
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize
from sklearn.decomposition import TruncatedSVD

import json
import random
import math

In [3]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [
        {
            'comment': cyrillize(d['comment']),
            'label': d['label'],
            'examples': d['examples'] if 'examples' in d else None
        }
        for d in json.load(f) if 'label' in d
    ]

n_supervised_p = len([s for s in supervised_comments if s['label'] == 'p'])
n_supervised_n = len([s for s in supervised_comments if s['label'] == 'p'])

Sub word tokenization embedding

In [4]:
def get_sub_word_tokenization_embedding(dim=100):
    tokenizer = Tokenizer.from_file("data/tokenizer_comments.json")
    token2ind = tokenizer.get_vocab()
    ind2token = lambda x: tokenizer.id_to_token(x)

    with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
        unsupervised_comments = json.load(f)

    tokenized_unsupervised_comments = [tokenizer.encode(c).tokens for c in unsupervised_comments]

    n_words = tokenizer.get_vocab_size()
    X=np.zeros((n_words,n_words))
    for s in ["[UNK]", "[PAD]", "[STR]", "[END]"]:
        X[token2ind[s], token2ind[s]] = 1
    for comment in tokenized_unsupervised_comments:
        for wi in range(len(comment)):
            if comment[wi] not in token2ind: continue
            i=token2ind[comment[wi]]
            for k in range(1,4+1):
                if wi-k>=0 and comment[wi-k] in token2ind:
                    j=token2ind[comment[wi-k]]
                    X[i,j] += 1
                if wi+k<len(comment) and comment[wi+k] in token2ind:
                    j=token2ind[comment[wi+k]]
                    X[i,j] += 1

    svd = TruncatedSVD(n_components=dim, n_iter=10)
    svd.fit(X)
    X_reduced = svd.transform(X)

    return lambda comment: np.stack([X_reduced[token2ind[token]] for token in tokenizer.encode(comment).tokens])

Noise dampening embedding

Sliding window classification

In [36]:
class FastTextClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, window_size):
        super().__init__()
        self.weight = nn.Linear(input_size, hidden_size)
        # self.relu = nn.ReLU()
        self.hidden = nn.Linear(window_size*hidden_size, 2)
        self.dist = nn.Softmax()

    def forward(self, x):
        weights = [self.weight(torch.tensor(n_gram, dtype=torch.float32)) for n_gram in x]
        weights = torch.cat(weights)
        hidden = self.hidden(weights)
        return self.dist(hidden)

    def predict(self, x):
        with torch.no_grad():
            outputs = self(x)
            return np.argmax(outputs.numpy(), axis=0)

In [37]:
def train_model(n_grams, input_size, hidden_size, window_size, epochs=1):
    model = FastTextClassifier(input_size, hidden_size, window_size)
    loss_function = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    current_loss = 0.0
    for e in range(epochs):
        for i, (x, y) in enumerate(n_grams):
            optimizer.zero_grad()

            prediction = model(x)
            print(prediction, torch.tensor(y))
            loss = loss_function(prediction, torch.tensor(y))
            loss.backward()

            optimizer.step()
            current_loss += loss.item()

            if i%100 == 0:
                print(f'Average loss after batch %d: %.3f'%((i/100)+1, current_loss/100))
                current_loss = 0.0
        print(f'Epoch {e+1} finished')

    return model

In [7]:
def classify_n_gram(n_gram_vector, examples_vectors) -> bool:
    for example_vector in examples_vectors:
        # if all items of the example_vector are in n_gram_vector
        if all(any(np.array_equal(item, example) for example in n_gram_vector) for item in example_vector):
            return True
    return False

def split_n_gram(comment, embedding, pad_vector, window_size=5):
    comment_vector = embedding(comment)
    m = len(comment_vector)
    if m < window_size:
        for _ in range(m, window_size):
            comment_vector = np.vstack([comment_vector, pad_vector])
        m = window_size

    i = 0
    n_grams = []
    while i + window_size <= m:
        n_gram = comment_vector[i:i+window_size]
        n_grams.append(n_gram)
        i += 1
    return n_grams

def split_and_classify_n_grams(comment_record, embedding, pad_vector, window_size=5):
    examples_vectors = [embedding(example) for example in comment_record['examples']] if comment_record['examples'] != None else []

    return [{
            'n_gram': n_gram,
            'label': 'p' if classify_n_gram(n_gram, examples_vectors) else 'n'
        }
        for n_gram in split_n_gram(comment_record['comment'], embedding, pad_vector, window_size)
    ]

In [8]:
def split_test_train(comments, test_fraction = 0.1):
    random.seed(42)
    random.shuffle(comments)
    test_count = int(len(comments) * test_fraction)
    test_comments = comments[:test_count]
    train_comments = comments[test_count:]
    return test_comments, train_comments

Training FastText with sub word embedding

In [11]:
testing_set, training_set = split_test_train(supervised_comments)

input_size = 100
embedding = get_sub_word_tokenization_embedding(input_size)

In [None]:
window_size = 5

training_n_gram_set = [
        (n['n_gram'], 0 if n['label'] == 'n' else 1)
        for comment in training_set
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
    ]

test_n_gram_set = [
        (n['n_gram'], 0 if n['label'] == 'n' else 1)
        for comment in testing_set
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
]

model = train_model(training_n_gram_set, input_size, input_size, window_size, epochs=5)

In [42]:
window_size = 5

training_set = [{
            'comment': 'тъпанари',
            'examples': None
        },{
            'comment': 'круши, круши, круши',
            'examples': ['круши']
        }]

training_n_gram_set = [
        (n['n_gram'], 0 if n['label'] == 'n' else 1)
        for comment in training_set
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
    ]

test_n_gram_set = [
        (n['n_gram'], 0 if n['label'] == 'n' else 1)
        for comment in testing_set
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
]

model = train_model(training_n_gram_set, input_size, input_size, window_size, epochs=50)

tensor([0.2798, 0.7202], grad_fn=<SoftmaxBackward0>) tensor(0)
Average loss after batch 1: -0.003
tensor([0.8113, 0.1887], grad_fn=<SoftmaxBackward0>) tensor(0)
tensor([1.0000e+00, 3.4538e-06], grad_fn=<SoftmaxBackward0>) tensor(1)
Epoch 1 finished
tensor([0.7790, 0.2210], grad_fn=<SoftmaxBackward0>) tensor(0)
Average loss after batch 1: -0.016
tensor([9.9963e-01, 3.7247e-04], grad_fn=<SoftmaxBackward0>) tensor(0)
tensor([1.0000e+00, 6.9452e-09], grad_fn=<SoftmaxBackward0>) tensor(1)
Epoch 2 finished
tensor([0.9494, 0.0506], grad_fn=<SoftmaxBackward0>) tensor(0)
Average loss after batch 1: -0.019
tensor([9.9999e-01, 6.5225e-06], grad_fn=<SoftmaxBackward0>) tensor(0)
tensor([1.0000e+00, 1.1897e-10], grad_fn=<SoftmaxBackward0>) tensor(1)
Epoch 3 finished
tensor([0.9857, 0.0143], grad_fn=<SoftmaxBackward0>) tensor(0)
Average loss after batch 1: -0.020
tensor([1.0000e+00, 3.6712e-07], grad_fn=<SoftmaxBackward0>) tensor(0)
tensor([1.0000e+00, 6.4892e-12], grad_fn=<SoftmaxBackward0>) tensor(

Testing FastText with sub word embedding

In [56]:
def predict(model, comment):
    n_grams = split_n_gram(comment, embedding, embedding('[PAD]'))
    return any(model.predict(n_gram) for n_gram in n_grams)

def test_model(model, testing_set):
    p_class = [c['comment'] for c in testing_set if c['label'] == 'p']
    n_class = [c['comment'] for c in testing_set if c['label'] == 'n']
    test_classes = [p_class, n_class]

    confusionMatrix = [[0, 0], [0, 0]]
    for c in range(2):
        for comment in test_classes[c]:
            c_MAP = predict(model, comment)
            confusionMatrix[c][c_MAP] += 1

    sum_positive = sum(confusionMatrix[x][0] for x in range(2))
    precision = confusionMatrix[0][0] / sum_positive
    recall = confusionMatrix[0][0] / len(p_class)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('=================================================================')
    print('Confusion matrix: ')
    for row in confusionMatrix:
        for val in row:
            print('{:4}'.format(val), end = '')
        print()
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))

In [57]:
test_model(model, testing_set)

Confusion matrix: 
  76   0
 204   0
Precision: 0.2714285714285714
Recall: 1.0
F1-score: 0.42696629213483145
