In [1]:
from tokenizers import Tokenizer
from nltk.tokenize import regexp_tokenize
from preprocessing import cyrillize
from sklearn.decomposition import TruncatedSVD

from sklearn import svm

import json
import numpy as np

import random
import math
from math import floor
from get_embeddings import get_sub_word_tokenization_embedding

In [2]:
def classify_n_gram(n_gram_vector, examples_vectors) -> bool:
    """Tell whether an n-gram fully contains at least one example.

    An example is "contained" when every one of its rows is equal
    (``np.array_equal``) to some row of ``n_gram_vector``. Row order and
    repetition are not taken into account.

    Args:
        n_gram_vector: iterable of token vectors forming the n-gram.
        examples_vectors: iterable of examples, each an iterable of token vectors.

    Returns:
        True if any example is contained in the n-gram, otherwise False
        (also False when ``examples_vectors`` is empty).
    """
    def occurs_in_n_gram(token_vector):
        # Linear scan over the n-gram rows looking for an exact match.
        for candidate in n_gram_vector:
            if np.array_equal(token_vector, candidate):
                return True
        return False

    def is_covered(example_vector):
        # An example is covered only if none of its rows is missing.
        for token_vector in example_vector:
            if not occurs_in_n_gram(token_vector):
                return False
        return True

    return any(is_covered(example_vector) for example_vector in examples_vectors)

def split_n_gram(comment, embedding, pad_vector, window_size=5):
    """Cut an embedded comment into sliding windows of ``window_size`` rows.

    The comment is passed through ``cyrillize`` and then ``embedding``. If the
    result has no rows, an empty list is returned. If it has fewer rows than
    ``window_size``, ``pad_vector`` is stacked below it once per missing
    position before windowing.

    Args:
        comment: raw comment text.
        embedding: callable mapping text to a sequence of token vectors.
        pad_vector: vector(s) appended to short comments.
        window_size: number of rows per window.

    Returns:
        List of consecutive slices (stride 1) of the embedded comment.
    """
    token_vectors = embedding(cyrillize(comment))
    length = len(token_vectors)
    if length == 0:
        return []

    missing = window_size - length
    if missing > 0:
        # One stack call appends pad_vector `missing` times.
        token_vectors = np.vstack([token_vectors] + [pad_vector] * missing)
        length = window_size

    return [
        token_vectors[start:start + window_size]
        for start in range(length - window_size + 1)
    ]

def split_and_classify_n_grams(comment_record, embedding, pad_vector, window_size=5):
    """Split a comment record into n-grams and label each one.

    Each n-gram produced by ``split_n_gram`` is labeled ``'p'`` when
    ``classify_n_gram`` reports that it contains one of the record's examples,
    and ``'n'`` otherwise.

    Args:
        comment_record: dict with a ``'comment'`` string and, optionally, an
            ``'examples'`` list of strings. A missing key, ``None`` or an
            empty list all mean "no examples".
        embedding: callable mapping text to a sequence of token vectors.
        pad_vector: padding vector forwarded to ``split_n_gram``.
        window_size: n-gram length forwarded to ``split_n_gram``.

    Returns:
        List of ``{'n_gram': ..., 'label': 'p' | 'n'}`` dicts, one per n-gram.
    """
    # The previous test, `'examples' in comment_record != None`, is a chained
    # comparison: it compared the *record* with None, never the stored value,
    # so a record holding `"examples": null` crashed while being iterated.
    examples = comment_record.get('examples') or []

    # NOTE(review): the comment text goes through cyrillize() inside
    # split_n_gram, but examples are embedded as-is - confirm that examples
    # never need the same transformation to match.
    examples_vectors = [embedding(example) for example in examples]

    return [
        {
            'n_gram': n_gram,
            'label': 'p' if classify_n_gram(n_gram, examples_vectors) else 'n'
        }
        for n_gram in split_n_gram(comment_record['comment'], embedding, pad_vector, window_size)
    ]

def predict(model, embedding, comment, window_size=5):
    """Return True if the model flags at least one n-gram of the comment.

    Args:
        model: fitted classifier exposing ``predict`` on a 2-D feature array.
        embedding: callable mapping text to a sequence of token vectors; it is
            also called with ``'[PAD]'`` to obtain the padding vector.
        comment: raw comment text.
        window_size: n-gram length; must match the value the model was
            trained with, otherwise the feature length will not match.

    Returns:
        bool: True when any n-gram is predicted as class 1. False when the
        comment yields no n-grams.
    """
    n_grams = split_n_gram(comment, embedding, embedding('[PAD]'), window_size)
    if len(n_grams) == 0:
        # Nothing to classify; also avoids calling the model on an empty batch.
        return False

    # Flatten each window into one feature row and classify them in one call.
    features = np.stack([np.concatenate(n_gram) for n_gram in n_grams])
    predictions = np.asarray(model.predict(features))
    return bool(np.any(predictions == 1))

In [3]:
def train_model(model, train_records, embedding, window_size, balanced_classes, p_n_rate=1.0, seed=None):
    """Fit ``model`` on labeled n-grams extracted from ``train_records``.

    Every record is split into labeled n-grams; each n-gram is flattened into
    one feature vector with target 1 (label ``'p'``) or 0 (label ``'n'``).

    Args:
        model: estimator exposing ``fit(X, y)``.
        train_records: iterable of comment records accepted by
            ``split_and_classify_n_grams``.
        embedding: callable mapping text to a sequence of token vectors; it is
            also called with ``'[PAD]'`` to obtain the padding vector.
        window_size: n-gram length.
        balanced_classes: when True, positives are re-sampled with replacement
            so that their count is ``floor(p_n_rate * number_of_negatives)``.
        p_n_rate: target positive-to-negative ratio used when balancing.
        seed: optional seed for the re-sampling; when None the global
            ``random`` state is used.

    Returns:
        None. ``model`` is fitted in place.
    """
    # The padding vector does not depend on the record, so compute it once.
    pad_vector = embedding('[PAD]')

    training_n_gram_set = [
        (np.concatenate(n['n_gram']), 0 if n['label'] == 'n' else 1)
        for comment in train_records
        for n in split_and_classify_n_grams(comment, embedding, pad_vector, window_size)
        if len(n['n_gram']) > 0
    ]
    positive_train = [a for a in training_n_gram_set if a[1] == 1]
    negative_train = [a for a in training_n_gram_set if a[1] == 0]

    if not balanced_classes:
        train_sampled_data = positive_train + negative_train
    elif positive_train:
        # The sample size is derived from the NEGATIVE count. It used to be
        # derived from the positive count, which only bootstrapped the
        # positives and left the classes as imbalanced as before.
        rng = random.Random(seed) if seed is not None else random
        n_positive = floor(p_n_rate * len(negative_train))
        # Build a new list; do not extend negative_train through an alias.
        train_sampled_data = negative_train + rng.choices(positive_train, k=n_positive)
    else:
        # No positives to sample from: keep the negatives only.
        train_sampled_data = list(negative_train)

    train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
    model.fit(train_x, train_y)

In [4]:
def get_sub_word_tokenization_embedding(dim=100, norm=True,
                                        tokenizer_path="data/tokenizer_comments.json",
                                        corpus_path='data/unsupervised_comments.json',
                                        context_window=4,
                                        random_state=None):
    """Build a sub-word embedding function from token co-occurrence counts.

    A vocabulary-by-vocabulary co-occurrence matrix is counted over the
    tokenized corpus and reduced to ``dim`` columns with ``TruncatedSVD``.

    NOTE(review): a function with this name is also imported from
    ``get_embeddings`` at the top of the notebook; this definition replaces it
    once the cell runs - confirm which one is intended.

    Args:
        dim: number of SVD components, i.e. the length of each token vector.
        norm: currently unused. NOTE(review): presumably meant to toggle row
            normalization of the reduced matrix - confirm before relying on it.
        tokenizer_path: path of the serialized ``tokenizers.Tokenizer``.
        corpus_path: path of a JSON file holding a list of comment strings.
        context_window: how many positions to the left and to the right of a
            token count as its context.
        random_state: forwarded to ``TruncatedSVD`` for reproducible results.

    Returns:
        Callable mapping a comment string to an array with one row per token
        and ``dim`` columns; a comment with no tokens gives shape ``(0, dim)``.
    """
    tokenizer = Tokenizer.from_file(tokenizer_path)
    token2ind = tokenizer.get_vocab()

    with open(corpus_path, 'r', encoding="utf-8") as f:
        unsupervised_comments = json.load(f)

    tokenized_unsupervised_comments = [tokenizer.encode(c.lower()).tokens for c in unsupervised_comments]

    n_words = tokenizer.get_vocab_size()
    X = np.zeros((n_words, n_words))
    # Special tokens get a 1 on the diagonal, presumably so their rows are not
    # all-zero before the decomposition.
    for s in ["[UNK]", "[PAD]", "[STR]", "[END]"]:
        X[token2ind[s], token2ind[s]] = 1

    for comment in tokenized_unsupervised_comments:
        # Look every token up once; None marks a token outside the vocabulary.
        ids = [token2ind.get(token) for token in comment]
        for wi, i in enumerate(ids):
            if i is None:
                continue
            first = max(0, wi - context_window)
            last = min(len(ids), wi + context_window + 1)
            for wj in range(first, last):
                j = ids[wj]
                if wj != wi and j is not None:
                    X[i, j] += 1

    svd = TruncatedSVD(n_components=dim, n_iter=10, random_state=random_state)
    svd.fit(X)
    X_reduced = svd.transform(X)

    def embed(comment):
        """Return the stacked vectors of the comment's tokens (one row each)."""
        tokens = tokenizer.encode(comment.lower()).tokens
        if len(tokens) == 0:
            # np.stack raises on an empty list; return an empty matrix so
            # callers can test len(...) == 0.
            return np.empty((0, X_reduced.shape[1]))
        return np.stack([X_reduced[token2ind[token]] for token in tokens])

    return embed

In [5]:
# Read every scraped comment record from disk.
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    comments = json.load(f)

# Records carrying a 'label' key are the annotated ones; the rest are unlabeled.
supervised_comments = list(filter(lambda d: 'label' in d, comments))
unsupervised_comments = list(filter(lambda d: 'label' not in d, comments))

In [6]:
# Build the embedding function with 100-dimensional token vectors.
sub_word_embedding = get_sub_word_tokenization_embedding(dim=100)

In [7]:
# Fix the seeds so that Restart & Run All reproduces the same model:
# train_model re-samples positives through the `random` module, and SVC uses
# its own random state for the probability estimates when probability=True.
SEED = 42
random.seed(SEED)

svc = svm.SVC(probability=True, random_state=SEED)

train_model(svc, supervised_comments, sub_word_embedding, window_size=5, balanced_classes=True)

In [8]:
def max_entropy_in_comment(comment):
    """Return the highest prediction entropy over the n-grams of a comment.

    The comment is split into 5-token n-grams with the notebook-level
    ``sub_word_embedding``; the notebook-level ``svc`` gives class
    probabilities for each n-gram, and the Shannon entropy (natural log) of
    each probability row is computed.

    Args:
        comment: record dict with a ``'comment'`` string.

    Returns:
        float: the largest entropy among the n-grams, or 0.0 when the comment
        produces no n-grams.
    """
    pad_vector = sub_word_embedding('[PAD]')
    n_grams = split_n_gram(comment['comment'], sub_word_embedding, pad_vector, 5)
    if len(n_grams) == 0:
        # No n-grams: predict_proba and max() would both fail on empty input.
        return 0.0

    features = np.array([np.concatenate(n) for n in n_grams])
    probabilities = np.asarray(svc.predict_proba(features), dtype=float)

    # Use the convention 0 * log(0) = 0: zero probabilities are replaced by 1
    # inside the logarithm, so their term is exactly 0 instead of raising a
    # math domain error.
    safe = np.where(probabilities > 0, probabilities, 1.0)
    entropies = -np.sum(probabilities * np.log(safe), axis=1)
    return float(entropies.max())

In [9]:
# Pair every unlabeled comment with its maximum n-gram entropy.
unsupervised_comments_tuple = list(zip(
    unsupervised_comments,
    map(max_entropy_in_comment, unsupervised_comments),
))

In [10]:
# Order the unlabeled comments by ascending entropy and drop the scores.
sorted_unsupervised_comments = [
    comment
    for comment, _entropy in sorted(unsupervised_comments_tuple, key=lambda pair: pair[1])
]

In [None]:
# Spot-check a single entry (index 44) of the entropy-ordered comment list.
sorted_unsupervised_comments[44]

In [22]:
# Spot-check the entry 100 positions from the end of the entropy-ordered list.
# NOTE(review): this cell's execution count (22) is later than that of the
# cell below that reverses the list (13), so the saved output may come from
# the reversed order - confirm by re-running top to bottom.
sorted_unsupervised_comments[-100]

{'comment': 'Праз. Е и какво от това?', 'author': 'Голям'}

In [13]:
# Rebuild the descending-entropy order from the (comment, entropy) pairs
# instead of reversing `sorted_unsupervised_comments` in place: the in-place
# version flipped the order again every time this cell was re-run.
sorted_unsupervised_comments = [
    comment
    for comment, _entropy in sorted(unsupervised_comments_tuple, key=lambda pair: pair[1])
][::-1]

# Labeled comments first, then unlabeled ones from most to least uncertain.
sorted_comments = supervised_comments + sorted_unsupervised_comments

import codecs  # no longer used in this cell; kept in case other cells rely on it

# Serialize before opening the file so a serialization error cannot leave a
# truncated output file behind.
json_object = json.dumps(sorted_comments, indent=4, ensure_ascii=False)
# Builtin open() replaces codecs.open(); newline="\n" keeps the written bytes
# free of platform newline translation, as codecs.open() did.
with open("data/blitz_comments_sorted.json", "w", encoding="utf-8", newline="\n") as outfile:
    outfile.write(json_object)