In [1]:
import torch
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize

import json
import random
import math

In [2]:
def get_sentences(f) -> list[str]:
    all = []
    for a in json.load(f):
        sentences = a['content'].lower().split('.')
        all.append(a['name'].lower())
        all += sentences

    return all

In [3]:
with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    unsupervised_comments = json.load(f)

with open('data/blitz_articles.json', 'r', encoding="utf-8") as f:
    sentences = get_sentences(f)

with open('data/dnes_bg_articles.json', 'r', encoding="utf-8") as f:
    sentences += get_sentences(f)

with open('data/pik_articles.json', 'r', encoding="utf-8") as f:
    sentences += get_sentences(f)


unsupervised_corpus = unsupervised_comments + sentences

Creating the supervised corpus

In [4]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{'comment': cyrillize(d['comment']), 'label': d['label']} for d in json.load(f) if 'label' in d]

supervised_p = len([s for s in supervised_comments if s['label'] == 'p'])
supervised_n = len([s for s in supervised_comments if s['label'] == 'p'])

In [5]:
with open('data/bgjargon.json', 'r', encoding="utf-8") as f:
    bgjargon = json.load(f)

with open('data/bad_words_2.json', 'r', encoding="utf-8") as f:
    bad_words = set(json.load(f))

for w in bad_words:
    if w in bgjargon:
        for meaning in bgjargon[w]['meanings']:
            if len(meaning['example']) > 0 and supervised_p < supervised_n:
                supervised_p += 1
                supervised_comments.append({'comment': meaning['example'], 'label': 'p'})

Tokenizing the comments

In [6]:
tokenizer = Tokenizer.from_file("data/tokenizer_comments_bgjargon_articles_end_start.json")
token2ind = tokenizer.get_vocab()
ind2token = tokenizer.id_to_token

In [7]:
unsupervised_corpus = [tokenizer.encode(c.lower()).tokens for c in unsupervised_corpus]
supervised_comments = [{'comment': tokenizer.encode(c['comment'].lower()).tokens, 'label': c['label']} for c in supervised_comments]

In [8]:
def split_comments(supervised_comments, test_fraction = 0.1):
    random.seed(42)
    random.shuffle(supervised_comments)
    test_count = int(len(supervised_comments) * test_fraction)
    test_comments = supervised_comments[:test_count]
    train_comments = supervised_comments[test_count:]
    return test_comments, train_comments

Pre training BiLSTM on the unsuppervised corpus

In [9]:
start_token = '[STR]'
end_token = '[END]'
unk_token = '[UNK]'
pad_token = '[PAD]'

batchSize = 32
emb_size = 50
hid_size = 100

device = torch.device("cuda:0")
test_comments, train_comments  = split_comments(supervised_comments, test_fraction = 0.05)

In [10]:
class BiLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, token2ind, unk_token, pad_token, end_token):
        super(BiLSTMLanguageModelPack, self).__init__()
        self.token2ind = token2ind
        self.unk_token_idx = token2ind[unk_token]
        self.pad_token_idx = token2ind[pad_token]
        self.end_token_idx = token2ind[end_token]
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.embed = torch.nn.Embedding(len(token2ind), embed_size)
        self.projection = torch.nn.Linear(2*hidden_size,len(token2ind))

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        comments = [[self.token2ind.get(w,self.unk_token_idx) for w in s] for s in source]
        comments_padded = [ s+(m-len(s))*[self.pad_token_idx] for s in comments]
        return torch.t(torch.tensor(comments_padded, dtype=torch.long, device=device))

    def forward(self, source):
        batch_size = len(source)
        X = self.preparePaddedBatch(source)
        E = self.embed(X)

        source_lengths = [len(s) for s in source]
        m = X.shape[0]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))

        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        output = output.view(m, batch_size, 2, self.hidden_size)
        # left to right and right to left in shift the prediction to i + 1 and i - 1 for the right to left
        # we shift them to make them predict a single position
        t = torch.cat((output[:-2,:,0,:], output[2:,:,1,:]),2)
        Z = self.projection(t.flatten(0,1))

        Y_bar = X[1:-1].flatten(0,1)
        Y_bar[Y_bar==self.end_token_idx] = self.pad_token_idx
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.pad_token_idx)
        return H

In [11]:
blm = BiLSTMLanguageModelPack(emb_size, hid_size, token2ind, unk_token, pad_token, end_token).to(device)
optimizer = torch.optim.Adam(blm.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(5):
    for b in range(0, len(idx), batchSize):
        batch = [unsupervised_corpus[i] for i in idx[b:min(b+batchSize, len(idx))]]
        H = blm(batch)
        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 100 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 3454 10.315394401550293
0 : 800 / 3454 7.925639629364014
0 : 1600 / 3454 7.846607208251953
0 : 2400 / 3454 7.593648433685303
0 : 3200 / 3454 7.320993900299072
1 : 0 / 3454 6.6537322998046875
1 : 800 / 3454 6.3167009353637695
1 : 1600 / 3454 6.385562896728516
1 : 2400 / 3454 6.08720064163208
1 : 3200 / 3454 5.793009281158447
2 : 0 / 3454 5.1672682762146
2 : 800 / 3454 4.7709832191467285
2 : 1600 / 3454 4.687723636627197
2 : 2400 / 3454 4.256646633148193
2 : 3200 / 3454 4.119178295135498
3 : 0 / 3454 3.334012031555176
3 : 800 / 3454 3.4363796710968018
3 : 1600 / 3454 3.4834465980529785
3 : 2400 / 3454 3.114901542663574
3 : 3200 / 3454 3.0795207023620605
4 : 0 / 3454 2.29461407661438
4 : 800 / 3454 2.4076406955718994
4 : 1600 / 3454 2.697399377822876
4 : 2400 / 3454 2.3358359336853027
4 : 3200 / 3454 2.3799140453338623


In [12]:
def perplexity(lm, test_comments, batchSize):
    H = 0.
    c = 0
    for b in range(0,len(test_comments),batchSize):
        batch = test_comments[b:min(b+batchSize, len(test_comments))]
        l = sum(len(s)-1 for s in batch)
        c += l
        with torch.no_grad():
            H += l * lm(batch)
    return math.exp(H/c)

In [13]:
perplexity(blm, [c['comment'] for c in supervised_comments], batchSize)

3062.1473010399154

Fine tuning on the supervised comments

In [14]:
class BiLSTMClassifier(torch.nn.Module):
    def __init__(self, langModel):
        super(BiLSTMClassifier, self).__init__()
        self.langModel = langModel
        self.classProjection = torch.nn.Linear(2*langModel.hidden_size, 2)

    def forward(self, source):
        batch_size = len(source)
        X = self.langModel.preparePaddedBatch(source)
        E = self.langModel.embed(X)
        source_lengths = [len(s) for s in source]
        _, (h,c) = self.langModel.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))
        h = h.view(2,batch_size,self.langModel.hidden_size)

        Z = self.classProjection(torch.cat([h[0],h[1]],1))
        return Z

In [15]:
train_y = np.array([(0 if c['label'] == 'n' else 1) for c in train_comments])
test_y = np.array([(0 if c['label'] == 'n' else 1) for c in test_comments])

idx = np.arange(len(train_comments), dtype='int32')

classModel = BiLSTMClassifier(blm).to(device)
optimizer = torch.optim.Adam(classModel.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(10):
    for b in range(0, len(idx), batchSize):
        batch = [ train_comments[i] for i in idx[b:min(b+batchSize, len(idx))] ]
        target = torch.tensor(train_y[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)

        Z = classModel(batch)
        H = torch.nn.functional.cross_entropy(Z,target)

        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 10 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 3454 0.6660460829734802
0 : 160 / 3454 0.6398159861564636
0 : 320 / 3454 0.6253937482833862
0 : 480 / 3454 0.6303751468658447
0 : 640 / 3454 0.8149018287658691
0 : 800 / 3454 0.6577979326248169
0 : 960 / 3454 0.6757951378822327
0 : 1120 / 3454 0.7374864816665649
0 : 1280 / 3454 0.5801318883895874
0 : 1440 / 3454 0.7486447095870972
0 : 1600 / 3454 0.5848121643066406
0 : 1760 / 3454 0.6211056113243103
0 : 1920 / 3454 0.8195732831954956
0 : 2080 / 3454 0.6773115992546082
0 : 2240 / 3454 0.5846858620643616
0 : 2400 / 3454 0.6723805665969849
0 : 2560 / 3454 0.6226780414581299
0 : 2720 / 3454 0.6829150915145874
0 : 2880 / 3454 0.6158624887466431
0 : 3040 / 3454 0.6295149922370911
0 : 3200 / 3454 0.5630736947059631
0 : 3360 / 3454 0.6908990740776062
1 : 0 / 3454 0.6782326102256775
1 : 160 / 3454 0.621475875377655
1 : 320 / 3454 0.5641404986381531
1 : 480 / 3454 0.6225164532661438
1 : 640 / 3454 0.7401736974716187
1 : 800 / 3454 0.5941594839096069
1 : 960 / 3454 0.6754659414291382
1 : 

In [16]:
def predict(s, class_model):
    with torch.no_grad():
        Z = class_model([s])
        return torch.argmax(Z[0]).item()

def test_model(model):
    tp, fn, fp, tn = 0, 0, 0, 0
    for comment in test_comments:
        if predict(comment['comment'], model):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    precision = tp/(tp + fp) if (tp + fp) != 0 else 0
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8} {:>8}'.format('Actual p', tp, fn))
    print('{:15} {:>8} {:>8}'.format('Actual n', fp, tn))
    return Fscore

In [17]:
test_model(classModel)

Precision: 0.3684210526315789
Recall: 0.45161290322580644
F1-score: 0.40579710144927533
Confusion Matrix:
                Predicted p Predicted n
Actual p              28       34
Actual n              48       71


0.40579710144927533

In [None]:
for c in supervised_comments:
    if predict(c['comment'], classModel) == 1:
        print(c['label'], c['comment'])