In [110]:
import torch
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize

import json
import random
import math

In [111]:
with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    unsupervised_comments = json.load(f)

In [114]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{'comment': cyrillize(d['comment']), 'label': d['label']} for d in json.load(f) if 'label' in d]

In [115]:
with open('data/bgjargon.json', 'r', encoding="utf-8") as f:
    bgjargon = json.load(f)

with open('data/bad_words_2.json', 'r', encoding="utf-8") as f:
    bad_words = set(json.load(f))

for w in bad_words:
    if w in bgjargon:
        for meaning in bgjargon[w]['meanings']:
            if len(meaning['example']) > 0:
                supervised_comments.append({'comment': meaning['example'], 'label': 'p'})

In [116]:
for w in bad_words:
    if w in bgjargon:
        for meaning in bgjargon[w]['meanings']:
            if len(meaning['example']) > 0:
                supervised_comments.append({'comment': meaning['example'], 'label': 'p'})

Tokenizing the comments

In [117]:
tokenizer = Tokenizer.from_file("data/tokenizer_blitz_comments.json")
token2ind = tokenizer.get_vocab()
ind2token = tokenizer.id_to_token

In [120]:
unsupervised_comments = [tokenizer.encode(c).tokens for c in unsupervised_comments]
supervised_comments = [{'comment': tokenizer.encode(c['comment']).tokens, 'label': c['label']} for c in supervised_comments]

In [121]:
def split_comments(supervised_comments, test_fraction = 0.1):
    random.seed(42)
    random.shuffle(supervised_comments)
    test_count = int(len(supervised_comments) * test_fraction)
    test_comments = supervised_comments[:test_count]
    train_comments = supervised_comments[test_count:]
    return test_comments, train_comments

Pre training BiLSTM on the unsuppervised comments

In [122]:
start_token = '[STR]'
end_token = '[END]'
unk_token = '[UNK]'
pad_token = '[PAD]'

batchSize = 32
emb_size = 50
hid_size = 100

device = torch.device("cuda:0")
test_comments, train_comments  = split_comments(supervised_comments, test_fraction = 0.05)

In [123]:
class BiLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, token2ind, unk_token, pad_token, end_token):
        super(BiLSTMLanguageModelPack, self).__init__()
        self.token2ind = token2ind
        self.unk_token_idx = token2ind[unk_token]
        self.pad_token_idx = token2ind[pad_token]
        self.end_token_idx = token2ind[end_token]
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.embed = torch.nn.Embedding(len(token2ind), embed_size)
        self.projection = torch.nn.Linear(2*hidden_size,len(token2ind))

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        comments = [[self.token2ind.get(w,self.unk_token_idx) for w in s] for s in source]
        comments_padded = [ s+(m-len(s))*[self.pad_token_idx] for s in comments]
        return torch.t(torch.tensor(comments_padded, dtype=torch.long, device=device))

    def forward(self, source):
        batch_size = len(source)
        X = self.preparePaddedBatch(source)
        E = self.embed(X)

        source_lengths = [len(s) for s in source]
        m = X.shape[0]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))

        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        output = output.view(m, batch_size, 2, self.hidden_size)
        # left to right and right to left in shift the prediction to i + 1 and i - 1 for the right to left
        # we shift them to make them predict a single position
        t = torch.cat((output[:-2,:,0,:], output[2:,:,1,:]),2)
        Z = self.projection(t.flatten(0,1))

        Y_bar = X[1:-1].flatten(0,1)
        Y_bar[Y_bar==self.end_token_idx] = self.pad_token_idx
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.pad_token_idx)
        return H

In [124]:
blm = BiLSTMLanguageModelPack(emb_size, hid_size, token2ind, unk_token, pad_token, end_token).to(device)
optimizer = torch.optim.Adam(blm.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(1):
    for b in range(0, len(idx), batchSize):
        batch = [unsupervised_comments[i] for i in idx[b:min(b+batchSize, len(idx))]]
        H = blm(batch)
        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 100 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 8454 10.309561729431152
0 : 800 / 8454 8.209003448486328
0 : 1600 / 8454 8.022171020507812
0 : 2400 / 8454 7.693704605102539
0 : 3200 / 8454 7.9185285568237305
0 : 4000 / 8454 7.800654411315918
0 : 4800 / 8454 7.256013870239258
0 : 5600 / 8454 7.4847869873046875
0 : 6400 / 8454 6.6455397605896
0 : 7200 / 8454 6.992987632751465
0 : 8000 / 8454 7.698992729187012


In [100]:
def perplexity(lm, test_comments, batchSize):
    H = 0.
    c = 0
    for b in range(0,len(test_comments),batchSize):
        batch = test_comments[b:min(b+batchSize, len(test_comments))]
        l = sum(len(s)-1 for s in batch)
        c += l
        with torch.no_grad():
            H += l * lm(batch)
    return math.exp(H/c)

In [None]:
perplexity(blm, [c['comment'] for c in supervised_comments], batchSize)

Fine tuning on the supervised comments

In [125]:
class BiLSTMClassifier(torch.nn.Module):
    def __init__(self, langModel):
        super(BiLSTMClassifier, self).__init__()
        self.langModel = langModel
        self.classProjection = torch.nn.Linear(2*langModel.hidden_size, 2)

    def forward(self, source):
        batch_size = len(source)
        X = self.langModel.preparePaddedBatch(source)
        E = self.langModel.embed(X)
        source_lengths = [len(s) for s in source]
        _, (h,c) = self.langModel.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))
        h = h.view(2,batch_size,self.langModel.hidden_size)

        Z = self.classProjection(torch.cat([h[0],h[1]],1))
        return Z

In [126]:
train_y = np.array([(0 if c['label'] == 'n' else 1) for c in train_comments])
test_y = np.array([(0 if c['label'] == 'n' else 1) for c in test_comments])

idx = np.arange(len(train_comments), dtype='int32')

classModel = BiLSTMClassifier(blm).to(device)
optimizer = torch.optim.Adam(classModel.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(2):
    for b in range(0, len(idx), batchSize):
        batch = [ train_comments[i] for i in idx[b:min(b+batchSize, len(idx))] ]
        target = torch.tensor(train_y[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)

        Z = classModel(batch)
        H = torch.nn.functional.cross_entropy(Z,target)

        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 10 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 8454 0.6200783848762512
0 : 160 / 8454 0.4433554410934448
0 : 320 / 8454 0.3966430127620697
0 : 480 / 8454 0.5960900783538818
0 : 640 / 8454 0.6220064759254456
0 : 800 / 8454 0.49255114793777466
0 : 960 / 8454 0.5432082414627075
0 : 1120 / 8454 0.4825776219367981
0 : 1280 / 8454 0.4742678701877594
0 : 1440 / 8454 0.7330568432807922
0 : 1600 / 8454 0.6405234336853027
0 : 1760 / 8454 0.4925477206707001
0 : 1920 / 8454 0.7005985379219055
0 : 2080 / 8454 0.626510500907898
0 : 2240 / 8454 0.6506465673446655
0 : 2400 / 8454 0.38680770993232727
0 : 2560 / 8454 0.6068079471588135
0 : 2720 / 8454 0.5670986771583557
0 : 2880 / 8454 0.47013649344444275
0 : 3040 / 8454 0.44266870617866516
0 : 3200 / 8454 0.5278813242912292
0 : 3360 / 8454 0.5282416939735413
0 : 3520 / 8454 0.5254775285720825
0 : 3680 / 8454 0.37906309962272644
0 : 3840 / 8454 0.5271828770637512
0 : 4000 / 8454 0.6645156741142273
0 : 4160 / 8454 0.5254589319229126
0 : 4320 / 8454 0.5253879427909851
0 : 4480 / 8454 0.7327404

In [130]:
p_class = [c['comment'] for c in test_comments if c['label'] == 'p']
n_class = [c['comment'] for c in test_comments if c['label'] == 'n']
test_classes = [p_class, n_class]

def predict(s, class_model):
    with torch.no_grad():
        Z = class_model([s])
        return torch.argmax(Z[0]).item()

def testClassifier(class_model):
    confusionMatrix = [[0, 0], [0, 0]]
    for c in range(2):
        for text in test_classes[c]:
            c_MAP = predict(text, class_model)
            confusionMatrix[c][c_MAP] += 1

    for row in confusionMatrix:
        for val in row:
            print('{:4}'.format(val), end = '')
        print()

    sum_positive = sum(confusionMatrix[x][0] for x in range(2))
    precision = confusionMatrix[0][0] / sum_positive
    recall = confusionMatrix[0][0] / len(p_class)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('=================================================================')
    print('Confusion matrix: ')
    for row in confusionMatrix:
        for val in row:
            print('{:4}'.format(val), end = '')
        print()
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))

In [131]:
testClassifier(classModel)

   0 333
   0 111


In [129]:
for c in supervised_comments:
    if predict(c['comment'], classModel) == 1:
        print(c['label'], c['comment'])

p ['[STR]', '1', ')', 'глей', 'го', 'бай', 'ван', '–', 'с', 'кле', 'цом', 'оби', 'ла', 'ги', 'мина', 'всичките', 'шунди', 'в', 'квартала', '!', ';', '[END]']
p ['[STR]', "к'во", 'си', 'я', 'олаби', 'я', 'тая', 'лан', 'га,', 'бе!', '[END]']
n ['[STR]', 'тази', 'година', 'грим', 'йор', 'ите', 'се', 'справят', 'перфектно', '.', 'много', 'по', 'добре', 'от', 'минали', 'години.', 'ако', 'се', 'дължи', 'на', 'нея-', 'браво.', '-', '[END]']
p ['[STR]', 'не', 'му', 'обръщай', 'внимание', 'на', 'сашо.', 'от', 'него', 'по-голям', 'дуду', 'к', 'чия', 'няма.', '[END]']
p ['[STR]', 'тоя', 'е', 'едно', 'голямо', 'пу', 'фи', '.', '[END]']
n ['[STR]', 'пък', 'аз', 'се', 'надява', 'х,', 'че', 'се', 'е', 'съ', 'бляк', 'ъл', 'гол', '...', ';)', '[END]']
n ['[STR]', 'сми', 'лете', 'се', 'над', 'деси!', 'и', 'тя', 'сигурно', 'не', 'понася', 'тоя', 'разврат', 'ен', 'тип!', 'бог', 'да', 'и', 'е', 'на', 'помощ', '!', '[END]']
p ['[STR]', 'щ', 'та', 'нак', 'ва', 'ца', 'м', 'кат', 'на', 'ганьо', 'турци', 'с

KeyboardInterrupt: 