In [1]:
import torch
import torch.nn as nn
import numpy as np
from tokenizers import Tokenizer
from nltk.tokenize import regexp_tokenize
from preprocessing import cyrillize, pattern
from sklearn.decomposition import TruncatedSVD

import json
import random
from math import floor

In [2]:
def classify_n_gram(n_gram_vector, examples_vectors) -> bool:
    for example_vector in examples_vectors:
        # if all items of the example_vector are in n_gram_vector
        if all(any(np.array_equal(row, example) for example in n_gram_vector) for row in example_vector):
            return True
    return False

def split_n_gram(comment, embedding, pad_vector, window_size=5):
    comment_vector = embedding(comment)
    m = len(comment_vector)
    if m == 0:
        return []
    if m < window_size:
        for _ in range(m, window_size):
            comment_vector = np.vstack([comment_vector, pad_vector])
        m = window_size

    i = 0
    n_grams = []
    while i + window_size <= m:
        n_gram = comment_vector[i:i+window_size]
        n_grams.append(n_gram)
        i += 1
    return n_grams

def split_and_classify_n_grams(comment_record, embedding, pad_vector, window_size=5):
    examples_vectors = [embedding(example) for example in comment_record['examples']] if comment_record['examples'] != None else []

    return [{
            'n_gram': n_gram,
            'label': 'p' if classify_n_gram(n_gram, examples_vectors) else 'n'
        }
        for n_gram in split_n_gram(comment_record['comment'], embedding, pad_vector, window_size)
    ]

def predict(model, embedding, comment):
    n_grams = split_n_gram(comment, embedding, embedding('[PAD]'))
    return any(model.predict(np.concatenate(n_gram).reshape(1, -1)).item() == 1 for n_gram in n_grams)

def test_model(model, embedding, testing_set):
    tp, fn, fp, tn = 0, 0, 0, 0
    for comment in testing_set:
        if predict(model, embedding, comment['comment']):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp) if tp + fp > 0 else 0
    recall = tp/(tp + fn) if tp + fn > 0 else 0
    Fscore = (2.0 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def train_model(model, train_records, embedding, window_size, balanced_classes, p_n_rate=1.0):
    training_n_gram_set = [
        (np.concatenate(n['n_gram']), 0 if n['label'] == 'n' else 1)
        for comment in train_records
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
        if len(n['n_gram']) > 0
    ]
    positive_train = [a for a in training_n_gram_set if a[1] == 1]
    negative_train = [a for a in training_n_gram_set if a[1] == 0]
    train_sampled_data = positive_train + negative_train
    if balanced_classes:
        train_sampled_data = negative_train
        train_sampled_data += random.choices(positive_train, k=floor(p_n_rate*len(positive_train)))

    train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
    model.fit(train_x, train_y)

def k_cross_validation(model, supervised_comments, embedding, window_size, k, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]

        train_model(model, train_records, embedding, window_size, balanced_classes, p_n_rate)

        tp, fn, fp, tn = test_model(model, embedding, test_records)
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

Loading data

In [3]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [
        {
            'comment': cyrillize(d['comment']),
            'label': d['label'],
            'examples': d['examples'] if 'examples' in d else None
        }
        for d in json.load(f) if 'label' in d
    ]

In [4]:
def normalize(arr):
    norm = np.linalg.norm(arr)
    return arr/norm if norm != 0 else arr

Sub word tokenization embedding

In [5]:
def get_sub_word_tokenization_embedding(dim=100, norm=True):
    tokenizer = Tokenizer.from_file("data/tokenizer_comments.json")
    token2ind = tokenizer.get_vocab()
    ind2token = lambda x: tokenizer.id_to_token(x)

    with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
        unsupervised_comments = json.load(f)

    tokenized_unsupervised_comments = [tokenizer.encode(c).tokens for c in unsupervised_comments]

    n_words = tokenizer.get_vocab_size()
    X=np.zeros((n_words,n_words))
    for s in ["[UNK]", "[PAD]", "[STR]", "[END]"]:
        X[token2ind[s], token2ind[s]] = 1
    for comment in tokenized_unsupervised_comments:
        for wi in range(len(comment)):
            if comment[wi] not in token2ind: continue
            i=token2ind[comment[wi]]
            for k in range(1,4+1):
                if wi-k>=0 and comment[wi-k] in token2ind:
                    j=token2ind[comment[wi-k]]
                    X[i,j] += 1
                if wi+k<len(comment) and comment[wi+k] in token2ind:
                    j=token2ind[comment[wi+k]]
                    X[i,j] += 1

    svd = TruncatedSVD(n_components=dim, n_iter=10)
    svd.fit(X)
    X_reduced = svd.transform(X)

    return lambda comment: np.stack([normalize(X_reduced[token2ind[token]]) for token in tokenizer.encode(comment).tokens])

Noise dampening embedding

In [6]:
def get_noise_dampening_embedding(dim, device):
    class EncoderRNN(nn.Module):
        def __init__(self, input_size, hidden_size, dropout_p=0.1):
            super(EncoderRNN, self).__init__()
            self.hidden_size = hidden_size

            self.embedding = nn.Embedding(input_size, hidden_size)
            self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
            self.dropout = nn.Dropout(dropout_p)

        def forward(self, input):
            embedded = self.dropout(self.embedding(input))
            output, hidden = self.gru(embedded)
            return output, hidden

        def save(self, filepath):
            torch.save(self.state_dict(), filepath)

        @classmethod
        def load(cls, filepath, input_size, hidden_size, dropout_p=0.1):
            model = cls(input_size, hidden_size, dropout_p)
            model.load_state_dict(torch.load(filepath))
            return model

    SOW_token = ''
    EOW_token = ''
    UNK_token = '�'

    alphabet_for_generation = 'абвгдежзийклмнопрстуфхцчшщьъюяabcdefghijklmnopqrstuvwxyz!@#$%^&*()-_=+[]\';.,/`~"<>|1234567890'
    alphabet = alphabet_for_generation
    alphabet += SOW_token
    alphabet += EOW_token
    alphabet += UNK_token

    char2ind = {}
    for i, c in enumerate(alphabet):
        char2ind[c] = i

    def indexesFromWord(word):
        return [(char2ind[c] if c in char2ind else char2ind[UNK_token]) for c in word]

    def tensorFromWord(word):
        indexes = indexesFromWord(word)
        indexes.append(char2ind[EOW_token])
        return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

    encoder = EncoderRNN.load("data/embedding_encoder_100_000_smaller_alphabet.pth", 96, 128)
    encoder.to(device)
    encoder.eval()

    def embedding(word):
        if word == '[PAD]':
            return torch.zeros(1, 1, 128)
        with torch.no_grad():
            input_tensor = tensorFromWord(word)
            _, encoder_hidden = encoder(input_tensor)
        return encoder_hidden

    # reducing dims with svd on the unsupervised comments
    with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
        unsupervised_comments = json.load(f)
    vocabulary = set([t for comment in unsupervised_comments for t in regexp_tokenize(comment, pattern)])
    X = np.vstack([embedding(t).cpu().flatten() for t in vocabulary])
    svd = TruncatedSVD(n_components=dim, n_iter=10)
    svd.fit(X)

    def comment_embedding(comment):
        tokens = [
            normalize(svd.transform(embedding(word).cpu().flatten(end_dim=1)).flatten())
            for word in regexp_tokenize(comment, pattern)
        ]
        if len(tokens) != 0:
            return np.vstack(tokens)
        return []

    return comment_embedding

In [7]:
sub_word_embedding = get_sub_word_tokenization_embedding(dim=100)

In [8]:
noise_dampening_embedding = get_noise_dampening_embedding(dim=100, device='cuda')

Gradient Boosting Classifier

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [10]:
print("Gradient boosted stumps with sub word embedding")
print_test_model(*k_cross_validation(clf, supervised_comments, sub_word_embedding, window_size=5, k=10, balanced_classes=False))

Gradient boosted stumps with sub word embedding


In [None]:
print("Gradient boosted stumps with ND embedding")
print_test_model(*k_cross_validation(clf, supervised_comments, noise_dampening_embedding, window_size=5, k=10, balanced_classes=False))

Gradient boosted stumps with ND embedding
Precision: 0.5535714285714286
Recall: 0.037851037851037855
F1-score: 0.07085714285714287
Confusion Matrix:
                Predicted p Predicted n
Actual p           1.824   46.353
Actual n           1.471  131.353


0.07085714285714287

K-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
best_i, best_f = 1, 0
for i in range(1, 8, 2):
    j = len(supervised_comments)//10
    test_records = supervised_comments[0:j]
    train_records = supervised_comments[j:]
    model = KNeighborsClassifier(i)
    train_model(model, train_records, sub_word_embedding, window_size=5, balanced_classes=False)

    f = print_test_model(*test_model(model, sub_word_embedding, test_records))
    if f > best_f:
        best_f = f
        best_i = i

Precision: 0.5135135135135135
Recall: 0.336283185840708
F1-score: 0.40641711229946526
Confusion Matrix:
                Predicted p Predicted n
Actual p          38.000   75.000
Actual n          36.000  158.000
Precision: 0.5925925925925926
Recall: 0.2831858407079646
F1-score: 0.38323353293413176
Confusion Matrix:
                Predicted p Predicted n
Actual p          32.000   81.000
Actual n          22.000  172.000
Precision: 0.44
Recall: 0.19469026548672566
F1-score: 0.26993865030674846
Confusion Matrix:
                Predicted p Predicted n
Actual p          22.000   91.000
Actual n          28.000  166.000
Precision: 0.6666666666666666
Recall: 0.10619469026548672
F1-score: 0.183206106870229
Confusion Matrix:
                Predicted p Predicted n
Actual p          12.000  101.000
Actual n           6.000  188.000


In [None]:
print("K-NN with sub word embedding")
print_test_model(*k_cross_validation(KNeighborsClassifier(best_i), supervised_comments, sub_word_embedding, window_size=5, k=10, balanced_classes=False))

K-NN with sub word embedding
Precision: 0.391820580474934
Recall: 0.3626373626373626
F1-score: 0.3766645529486366
Confusion Matrix:
                Predicted p Predicted n
Actual p          17.471   30.706
Actual n          27.118  105.706


0.3766645529486366

In [None]:
best_i, best_f = 1, 0.0
for i in range(1, 8, 2):
    j = len(supervised_comments)//10
    test_records = supervised_comments[0:j]
    train_records = supervised_comments[j:]
    model = KNeighborsClassifier(i)
    train_model(model, train_records, noise_dampening_embedding, window_size=5, balanced_classes=False)

    f = print_test_model(*test_model(model, noise_dampening_embedding, test_records))
    if f > best_f:
        best_f = f
        best_i = i

Precision: 0.45454545454545453
Recall: 0.35398230088495575
F1-score: 0.39800995024875624
Confusion Matrix:
                Predicted p Predicted n
Actual p          40.000   73.000
Actual n          48.000  146.000
Precision: 0.4691358024691358
Recall: 0.336283185840708
F1-score: 0.3917525773195876
Confusion Matrix:
                Predicted p Predicted n
Actual p          38.000   75.000
Actual n          43.000  151.000
Precision: 0.36507936507936506
Recall: 0.20353982300884957
F1-score: 0.26136363636363635
Confusion Matrix:
                Predicted p Predicted n
Actual p          23.000   90.000
Actual n          40.000  154.000
Precision: 0.4838709677419355
Recall: 0.13274336283185842
F1-score: 0.20833333333333334
Confusion Matrix:
                Predicted p Predicted n
Actual p          15.000   98.000
Actual n          16.000  178.000


In [None]:
print("K-NN with ND embedding")
print_test_model(*k_cross_validation(KNeighborsClassifier(best_i), supervised_comments, noise_dampening_embedding, window_size=5, k=10, balanced_classes=False))

K-NN with ND embedding
Precision: 0.3282520325203252
Recall: 0.3943833943833944
F1-score: 0.35829173599556297
Confusion Matrix:
                Predicted p Predicted n
Actual p          19.000   29.176
Actual n          38.882   93.941


0.35829173599556297

Multi layer perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
print("MLP with sub word embedding")
print_test_model(*k_cross_validation(MLPClassifier((500, 500, 2), activation='relu'), supervised_comments, sub_word_embedding, window_size=5, k=10, balanced_classes=True, p_n_rate=1))

In [None]:
print("MLP with ND embedding")
print_test_model(*k_cross_validation(MLPClassifier((500, 500, 2), activation='relu'), supervised_comments, noise_dampening_embedding, window_size=5, k=10, balanced_classes=True, p_n_rate=1))

SVM

In [9]:
from sklearn import svm

In [None]:
print("SVC with sub word embedding")
print_test_model(*k_cross_validation(svm.SVC(), supervised_comments, sub_word_embedding, window_size=5, k=10, balanced_classes=True, p_n_rate=1))

In [None]:
print("SVC with ND embedding")
print_test_model(*k_cross_validation(svm.SVC(), supervised_comments, noise_dampening_embedding, window_size=5, k=10, balanced_classes=True, p_n_rate=1))