In [1]:
import torch
import torch.nn as nn
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize
from sklearn.decomposition import TruncatedSVD

import json
import random
import math

In [2]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [
        {
            'comment': cyrillize(d['comment']),
            'label': d['label'],
            'examples': d['examples'] if 'examples' in d else None
        }
        for d in json.load(f) if 'label' in d
    ]

n_supervised_p = len([s for s in supervised_comments if s['label'] == 'p'])
n_supervised_n = len([s for s in supervised_comments if s['label'] == 'p'])

Sub word tokenization embedding

In [3]:
def get_sub_word_tokenization_embedding(dim=100):
    tokenizer = Tokenizer.from_file("data/tokenizer_comments.json")
    token2ind = tokenizer.get_vocab()
    ind2token = lambda x: tokenizer.id_to_token(x)

    with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
        unsupervised_comments = json.load(f)

    tokenized_unsupervised_comments = [tokenizer.encode(c).tokens for c in unsupervised_comments]

    n_words = tokenizer.get_vocab_size()
    X=np.zeros((n_words,n_words))
    for s in ["[UNK]", "[PAD]", "[STR]", "[END]"]:
        X[token2ind[s], token2ind[s]] = 1
    for comment in tokenized_unsupervised_comments:
        for wi in range(len(comment)):
            if comment[wi] not in token2ind: continue
            i=token2ind[comment[wi]]
            for k in range(1,4+1):
                if wi-k>=0 and comment[wi-k] in token2ind:
                    j=token2ind[comment[wi-k]]
                    X[i,j] += 1
                if wi+k<len(comment) and comment[wi+k] in token2ind:
                    j=token2ind[comment[wi+k]]
                    X[i,j] += 1

    svd = TruncatedSVD(n_components=dim, n_iter=10)
    svd.fit(X)
    X_reduced = svd.transform(X)

    return lambda comment: np.stack([X_reduced[token2ind[token]] for token in tokenizer.encode(comment).tokens])

In [35]:
def classify_n_gram(n_gram_vector, examples_vectors) -> bool:
    for example_vector in examples_vectors:
        # if all items of the example_vector are in n_gram_vector
        if all(any(np.array_equal(row, example) for example in n_gram_vector) for row in example_vector):
            return True
    return False

def split_n_gram(comment, embedding, pad_vector, window_size=5):
    comment_vector = embedding(comment)
    m = len(comment_vector)
    if m < window_size:
        for _ in range(m, window_size):
            comment_vector = np.vstack([comment_vector, pad_vector])
        m = window_size

    i = 0
    n_grams = []
    while i + window_size <= m:
        n_gram = comment_vector[i:i+window_size]
        n_grams.append(n_gram)
        i += 1
    return n_grams

def split_and_classify_n_grams(comment_record, embedding, pad_vector, window_size=5):
    examples_vectors = [embedding(example) for example in comment_record['examples']] if comment_record['examples'] != None else []

    return [{
            'n_gram': n_gram,
            'label': 'p' if classify_n_gram(n_gram, examples_vectors) else 'n'
        }
        for n_gram in split_n_gram(comment_record['comment'], embedding, pad_vector, window_size)
    ]

In [36]:
def split_test_train(comments, test_fraction = 0.1):
    random.seed(42)
    random.shuffle(comments)
    test_count = int(len(comments) * test_fraction)
    test_comments = comments[:test_count]
    train_comments = comments[test_count:]
    return test_comments, train_comments

### Training with sub word embedding

In [8]:
testing_set, training_set = split_test_train(supervised_comments)

input_size = 100
embedding = get_sub_word_tokenization_embedding(input_size)

In [37]:
window_size = 5

training_n_gram_set = [
        (np.concatenate(n['n_gram']), 0 if n['label'] == 'n' else 1)
        for comment in training_set
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
    ]

test_n_gram_set = [
        (np.concatenate(n['n_gram']), 0 if n['label'] == 'n' else 1)
        for comment in testing_set
        for n in split_and_classify_n_grams(comment, embedding, embedding('[PAD]'), window_size)
]

positive_train = [a for a in training_n_gram_set if a[1] == 1]
negative_train = [a for a in training_n_gram_set if a[1] == 0]

n_positive = len(positive_train)
n_negative = len(negative_train)

In [56]:
from math import floor


def predict(model, comment):
    n_grams = split_n_gram(comment, embedding, embedding('[PAD]'))
    return any(model.predict(np.concatenate(n_gram).reshape(1, -1)).item() == 1 for n_gram in n_grams)

def test_model(model, testing_set):
    tp, fn, fp, tn = 0, 0, 0, 0
    for comment in testing_set:
        if predict(model, comment['comment']):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8} {:>8}'.format('Actual p', tp, fn))
    print('{:15} {:>8} {:>8}'.format('Actual n', fp, tn))
    return Fscore

def fit_and_test_model(model, balanced_classes: bool = False, p_n_rate = 1.0):
    train_sampled_data = positive_train + negative_train
    if balanced_classes:
        train_sampled_data = negative_train
        train_sampled_data += random.choices(positive_train, k=floor(p_n_rate*n_positive))

    train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
    # test_x, test_y = [a[0] for a in test_n_gram_set], [a[1] for a in test_n_gram_set]

    model.fit(train_x, train_y)
    return test_model(model, testing_set)

Gradient Boosting Classifier

In [58]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [59]:
fit_and_test_model(clf, balanced_classes=False)

Precision: 0.5833333333333334
Recall: 0.3465346534653465
F1-score: 0.43478260869565216
Confusion Matrix:
                Predicted p Predicted n
Actual p              35       66
Actual n              25      168


0.43478260869565216

In [None]:
fit_and_test_model(clf, balanced_classes=True)

K-NN

In [61]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
best_i, best_f = 1, 0.41
for i in range(3, 15, 2):
    f = fit_and_test_model(KNeighborsClassifier(i), balanced_classes=False)
    if f > best_f:
        best_f = f
        best_i = i

In [70]:
fit_and_test_model(KNeighborsClassifier(best_i), balanced_classes=False)

Precision: 0.3681592039800995
Recall: 0.7326732673267327
F1-score: 0.4900662251655628
Confusion Matrix:
                Predicted p Predicted n
Actual p              74       27
Actual n             127       66


0.4900662251655628

In [46]:
fit_and_test_model(clf, balanced_classes=True, p_n_rate=p_n_rate)

Confusion matrix: 
  91  10
 191   2
Precision: 0.32269503546099293
Recall: 0.900990099009901
F1-score: 0.4751958224543082


0.4751958224543082

Multi layer perceptron

In [47]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier((500, 500, 2), activation='relu')

In [None]:
fit_and_test_model(mlp, balanced_classes=True, p_n_rate=1)

In [71]:
test_model(mlp, testing_set)

Precision: 0.46601941747572817
Recall: 0.4752475247524752
F1-score: 0.47058823529411764
Confusion Matrix:
                Predicted p Predicted n
Actual p              48       53
Actual n              55      138


0.47058823529411764

## Noise Dampening Embedding