In [1]:
import torch
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize

import json
import random
import math

In [2]:
def get_sentences(f) -> list[str]:
    all = []
    for a in json.load(f):
        sentences = a['content'].lower().split('.')
        all.append(a['name'].lower())
        all += sentences

    return all

In [3]:
with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    unsupervised_comments = json.load(f)

with open('data/blitz_articles.json', 'r', encoding="utf-8") as f:
    sentences = get_sentences(f)

with open('data/dnes_bg_articles.json', 'r', encoding="utf-8") as f:
    sentences += get_sentences(f)

with open('data/pik_articles.json', 'r', encoding="utf-8") as f:
    sentences += get_sentences(f)


unsupervised_corpus = unsupervised_comments + sentences

Creating the supervised corpus

In [4]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{'comment': cyrillize(d['comment']), 'label': d['label']} for d in json.load(f) if 'label' in d]

supervised_p = len([s for s in supervised_comments if s['label'] == 'p'])
supervised_n = len([s for s in supervised_comments if s['label'] == 'p'])

In [5]:
with open('data/bgjargon.json', 'r', encoding="utf-8") as f:
    bgjargon = json.load(f)

with open('data/bad_words_2.json', 'r', encoding="utf-8") as f:
    bad_words = set(json.load(f))

for w in bad_words:
    if w in bgjargon:
        for meaning in bgjargon[w]['meanings']:
            if len(meaning['example']) > 0 and supervised_p < supervised_n:
                supervised_p += 1
                supervised_comments.append({'comment': meaning['example'], 'label': 'p'})

Tokenizing the comments

In [6]:
tokenizer = Tokenizer.from_file("data/tokenizer_comments_bgjargon_articles_end_start.json")
token2ind = tokenizer.get_vocab()
ind2token = tokenizer.id_to_token

In [7]:
unsupervised_corpus = [tokenizer.encode(c.lower()).tokens for c in unsupervised_corpus]
supervised_comments = [{'comment': tokenizer.encode(c['comment'].lower()).tokens, 'label': c['label']} for c in supervised_comments]

In [8]:
def split_comments(supervised_comments, test_fraction = 0.1):
    random.seed(42)
    random.shuffle(supervised_comments)
    test_count = int(len(supervised_comments) * test_fraction)
    test_comments = supervised_comments[:test_count]
    train_comments = supervised_comments[test_count:]
    return test_comments, train_comments

Pre training BiLSTM on the unsuppervised corpus

In [9]:
start_token = '[STR]'
end_token = '[END]'
unk_token = '[UNK]'
pad_token = '[PAD]'

batchSize = 32
emb_size = 50
hid_size = 100

device = torch.device("cuda:0")
test_comments, train_comments  = split_comments(supervised_comments, test_fraction = 0.05)

In [10]:
class BiLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, token2ind, unk_token, pad_token, end_token):
        super(BiLSTMLanguageModelPack, self).__init__()
        self.token2ind = token2ind
        self.unk_token_idx = token2ind[unk_token]
        self.pad_token_idx = token2ind[pad_token]
        self.end_token_idx = token2ind[end_token]
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.embed = torch.nn.Embedding(len(token2ind), embed_size)
        self.projection = torch.nn.Linear(2*hidden_size,len(token2ind))

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        comments = [[self.token2ind.get(w,self.unk_token_idx) for w in s] for s in source]
        comments_padded = [ s+(m-len(s))*[self.pad_token_idx] for s in comments]
        return torch.t(torch.tensor(comments_padded, dtype=torch.long, device=device))

    def forward(self, source):
        batch_size = len(source)
        X = self.preparePaddedBatch(source)
        E = self.embed(X)

        source_lengths = [len(s) for s in source]
        m = X.shape[0]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))

        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        output = output.view(m, batch_size, 2, self.hidden_size)
        # left to right and right to left in shift the prediction to i + 1 and i - 1 for the right to left
        # we shift them to make them predict a single position
        t = torch.cat((output[:-2,:,0,:], output[2:,:,1,:]),2)
        Z = self.projection(t.flatten(0,1))

        Y_bar = X[1:-1].flatten(0,1)
        Y_bar[Y_bar==self.end_token_idx] = self.pad_token_idx
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.pad_token_idx)
        return H

In [11]:
blm = BiLSTMLanguageModelPack(emb_size, hid_size, token2ind, unk_token, pad_token, end_token).to(device)
optimizer = torch.optim.Adam(blm.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(5):
    for b in range(0, len(idx), batchSize):
        batch = [unsupervised_corpus[i] for i in idx[b:min(b+batchSize, len(idx))]]
        H = blm(batch)
        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 100 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 3834 10.310538291931152
0 : 800 / 3834 8.16960334777832
0 : 1600 / 3834 7.5089850425720215
0 : 2400 / 3834 6.998119831085205
0 : 3200 / 3834 7.3599162101745605
1 : 0 / 3834 6.544349670410156
1 : 800 / 3834 6.728677749633789
1 : 1600 / 3834 5.192967414855957
1 : 2400 / 3834 5.2113165855407715
1 : 3200 / 3834 5.614673614501953
2 : 0 / 3834 5.205222129821777
2 : 800 / 3834 5.428623676300049
2 : 1600 / 3834 3.627659559249878
2 : 2400 / 3834 3.7056515216827393
2 : 3200 / 3834 3.785240411758423
3 : 0 / 3834 3.791712522506714
3 : 800 / 3834 4.237063884735107
3 : 1600 / 3834 2.621879816055298
3 : 2400 / 3834 2.8304057121276855
3 : 3200 / 3834 2.746354341506958
4 : 0 / 3834 2.838104009628296
4 : 800 / 3834 3.262881278991699
4 : 1600 / 3834 2.0043952465057373
4 : 2400 / 3834 2.146798610687256
4 : 3200 / 3834 2.059828996658325


In [12]:
def perplexity(lm, test_comments, batchSize):
    H = 0.
    c = 0
    for b in range(0,len(test_comments),batchSize):
        batch = test_comments[b:min(b+batchSize, len(test_comments))]
        l = sum(len(s)-1 for s in batch)
        c += l
        with torch.no_grad():
            H += l * lm(batch)
    return math.exp(H/c)

In [13]:
perplexity(blm, [c['comment'] for c in supervised_comments], batchSize)

3425.637464189045

Fine tuning on the supervised comments

In [14]:
class BiLSTMClassifier(torch.nn.Module):
    def __init__(self, langModel):
        super(BiLSTMClassifier, self).__init__()
        self.langModel = langModel
        self.classProjection = torch.nn.Linear(2*langModel.hidden_size, 2)

    def forward(self, source):
        batch_size = len(source)
        X = self.langModel.preparePaddedBatch(source)
        E = self.langModel.embed(X)
        source_lengths = [len(s) for s in source]
        _, (h,c) = self.langModel.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))
        h = h.view(2,batch_size,self.langModel.hidden_size)

        Z = self.classProjection(torch.cat([h[0],h[1]],1))
        return Z

In [15]:
train_y = np.array([(0 if c['label'] == 'n' else 1) for c in train_comments])
test_y = np.array([(0 if c['label'] == 'n' else 1) for c in test_comments])

idx = np.arange(len(train_comments), dtype='int32')

classModel = BiLSTMClassifier(blm).to(device)
optimizer = torch.optim.Adam(classModel.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(10):
    for b in range(0, len(idx), batchSize):
        batch = [ train_comments[i] for i in idx[b:min(b+batchSize, len(idx))] ]
        target = torch.tensor(train_y[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)

        Z = classModel(batch)
        H = torch.nn.functional.cross_entropy(Z,target)

        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 10 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 3834 0.6981102228164673
0 : 160 / 3834 0.9466759562492371
0 : 320 / 3834 0.434436172246933
0 : 480 / 3834 0.620633602142334
0 : 640 / 3834 0.648489236831665
0 : 800 / 3834 0.6455889344215393
0 : 960 / 3834 0.663154661655426
0 : 1120 / 3834 0.737922191619873
0 : 1280 / 3834 0.5722798109054565
0 : 1440 / 3834 0.6227002143859863
0 : 1600 / 3834 0.5432533025741577
0 : 1760 / 3834 0.6211169958114624
0 : 1920 / 3834 0.6690309047698975
0 : 2080 / 3834 0.6981790065765381
0 : 2240 / 3834 0.6512628793716431
0 : 2400 / 3834 0.6554851531982422
0 : 2560 / 3834 0.6771443486213684
0 : 2720 / 3834 0.664113461971283
0 : 2880 / 3834 0.5335915684700012
0 : 3040 / 3834 0.6502116322517395
0 : 3200 / 3834 0.7770610451698303
0 : 3360 / 3834 0.7043849229812622
0 : 3520 / 3834 0.5553061962127686
0 : 3680 / 3834 0.5256373882293701
1 : 0 / 3834 0.6793232560157776
1 : 160 / 3834 0.5923578143119812
1 : 320 / 3834 0.4622243344783783
1 : 480 / 3834 0.5648067593574524
1 : 640 / 3834 0.643519937992096
1 : 800 

In [16]:
def predict(s, class_model):
    with torch.no_grad():
        Z = class_model([s])
        return torch.argmax(Z[0]).item()

def test_model(model):
    tp, fn, fp, tn = 0, 0, 0, 0
    for comment in test_comments:
        if predict(comment['comment'], model):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    precision = tp/(tp + fp) if (tp + fp) != 0 else 0
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8} {:>8}'.format('Actual p', tp, fn))
    print('{:15} {:>8} {:>8}'.format('Actual n', fp, tn))
    return Fscore

In [17]:
test_model(classModel)

Precision: 0.1111111111111111
Recall: 0.015384615384615385
F1-score: 0.02702702702702703
Confusion Matrix:
                Predicted p Predicted n
Actual p               1       64
Actual n               8      128


0.02702702702702703

In [None]:
for c in supervised_comments:
    if predict(c['comment'], classModel) == 1:
        print(c['label'], c['comment'])