In [1]:
import torch
import numpy as np
from tokenizers import Tokenizer
from preprocessing import cyrillize

import json
import random
import math

In [14]:
def get_sentences(f) -> list[str]:
    all = []
    for a in json.load(f):
        sentences = a['content'].split('.')
        all.append(a['name'])
        all += sentences

    return all

In [15]:
with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    unsupervised_comments = json.load(f)

with open('data/blitz_articles.json', 'r', encoding="utf-8") as f:
    sentences = get_sentences(f)

with open('data/dnes_bg_articles.json', 'r', encoding="utf-8") as f:
    sentences += get_sentences(f)

with open('data/pik_articles.json', 'r', encoding="utf-8") as f:
    sentences += get_sentences(f)


unsupervised_corpus = unsupervised_comments + sentences

Creating the supervised corpus

In [17]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{'comment': cyrillize(d['comment']), 'label': d['label']} for d in json.load(f) if 'label' in d]

supervised_p = len([s for s in supervised_comments if s['label'] == 'p'])
supervised_n = len([s for s in supervised_comments if s['label'] == 'p'])

In [18]:
with open('data/bgjargon.json', 'r', encoding="utf-8") as f:
    bgjargon = json.load(f)

with open('data/bad_words_2.json', 'r', encoding="utf-8") as f:
    bad_words = set(json.load(f))

for w in bad_words:
    if w in bgjargon:
        for meaning in bgjargon[w]['meanings']:
            if len(meaning['example']) > 0 and supervised_p < supervised_n:
                supervised_p += 1
                supervised_comments.append({'comment': meaning['example'], 'label': 'p'})

Tokenizing the comments

In [19]:
tokenizer = Tokenizer.from_file("data/tokenizer_comments_bgjargon_articles_end_start.json")
token2ind = tokenizer.get_vocab()
ind2token = tokenizer.id_to_token

In [20]:
unsupervised_corpus = [tokenizer.encode(c).tokens for c in unsupervised_corpus]
supervised_comments = [{'comment': tokenizer.encode(c['comment']).tokens, 'label': c['label']} for c in supervised_comments]

In [21]:
def split_comments(supervised_comments, test_fraction = 0.1):
    random.seed(42)
    random.shuffle(supervised_comments)
    test_count = int(len(supervised_comments) * test_fraction)
    test_comments = supervised_comments[:test_count]
    train_comments = supervised_comments[test_count:]
    return test_comments, train_comments

Pre training BiLSTM on the unsuppervised corpus

In [22]:
start_token = '[STR]'
end_token = '[END]'
unk_token = '[UNK]'
pad_token = '[PAD]'

batchSize = 32
emb_size = 50
hid_size = 100

device = torch.device("cuda:0")
test_comments, train_comments  = split_comments(supervised_comments, test_fraction = 0.05)

In [23]:
class BiLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, token2ind, unk_token, pad_token, end_token):
        super(BiLSTMLanguageModelPack, self).__init__()
        self.token2ind = token2ind
        self.unk_token_idx = token2ind[unk_token]
        self.pad_token_idx = token2ind[pad_token]
        self.end_token_idx = token2ind[end_token]
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.embed = torch.nn.Embedding(len(token2ind), embed_size)
        self.projection = torch.nn.Linear(2*hidden_size,len(token2ind))

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        comments = [[self.token2ind.get(w,self.unk_token_idx) for w in s] for s in source]
        comments_padded = [ s+(m-len(s))*[self.pad_token_idx] for s in comments]
        return torch.t(torch.tensor(comments_padded, dtype=torch.long, device=device))

    def forward(self, source):
        batch_size = len(source)
        X = self.preparePaddedBatch(source)
        E = self.embed(X)

        source_lengths = [len(s) for s in source]
        m = X.shape[0]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))

        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        output = output.view(m, batch_size, 2, self.hidden_size)
        # left to right and right to left in shift the prediction to i + 1 and i - 1 for the right to left
        # we shift them to make them predict a single position
        t = torch.cat((output[:-2,:,0,:], output[2:,:,1,:]),2)
        Z = self.projection(t.flatten(0,1))

        Y_bar = X[1:-1].flatten(0,1)
        Y_bar[Y_bar==self.end_token_idx] = self.pad_token_idx
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.pad_token_idx)
        return H

In [24]:
blm = BiLSTMLanguageModelPack(emb_size, hid_size, token2ind, unk_token, pad_token, end_token).to(device)
optimizer = torch.optim.Adam(blm.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(1):
    for b in range(0, len(idx), batchSize):
        batch = [unsupervised_corpus[i] for i in idx[b:min(b+batchSize, len(idx))]]
        H = blm(batch)
        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 100 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 2663 10.315420150756836
0 : 800 / 2663 8.048338890075684
0 : 1600 / 2663 7.726192951202393
0 : 2400 / 2663 7.463230133056641


In [25]:
def perplexity(lm, test_comments, batchSize):
    H = 0.
    c = 0
    for b in range(0,len(test_comments),batchSize):
        batch = test_comments[b:min(b+batchSize, len(test_comments))]
        l = sum(len(s)-1 for s in batch)
        c += l
        with torch.no_grad():
            H += l * lm(batch)
    return math.exp(H/c)

In [26]:
perplexity(blm, [c['comment'] for c in supervised_comments], batchSize)

1885.7760400400466

Fine tuning on the supervised comments

In [27]:
class BiLSTMClassifier(torch.nn.Module):
    def __init__(self, langModel):
        super(BiLSTMClassifier, self).__init__()
        self.langModel = langModel
        self.classProjection = torch.nn.Linear(2*langModel.hidden_size, 2)

    def forward(self, source):
        batch_size = len(source)
        X = self.langModel.preparePaddedBatch(source)
        E = self.langModel.embed(X)
        source_lengths = [len(s) for s in source]
        _, (h,c) = self.langModel.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False))
        h = h.view(2,batch_size,self.langModel.hidden_size)

        Z = self.classProjection(torch.cat([h[0],h[1]],1))
        return Z

In [28]:
train_y = np.array([(0 if c['label'] == 'n' else 1) for c in train_comments])
test_y = np.array([(0 if c['label'] == 'n' else 1) for c in test_comments])

idx = np.arange(len(train_comments), dtype='int32')

classModel = BiLSTMClassifier(blm).to(device)
optimizer = torch.optim.Adam(classModel.parameters(), lr=0.01)

idx = np.arange(len(train_comments), dtype='int32')
np.random.shuffle(idx)

for e in range(1):
    for b in range(0, len(idx), batchSize):
        batch = [ train_comments[i] for i in idx[b:min(b+batchSize, len(idx))] ]
        target = torch.tensor(train_y[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)

        Z = classModel(batch)
        H = torch.nn.functional.cross_entropy(Z,target)

        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 10 == 0:
            print(e, ':', b, '/', len(idx), H.item())

0 : 0 / 2663 0.7045396566390991
0 : 160 / 2663 0.6022505164146423
0 : 320 / 2663 0.599667489528656
0 : 480 / 2663 0.48963385820388794
0 : 640 / 2663 0.48910659551620483
0 : 800 / 2663 0.6002494096755981
0 : 960 / 2663 0.5976040959358215
0 : 1120 / 2663 0.7173306345939636
0 : 1280 / 2663 0.4525021016597748
0 : 1440 / 2663 0.5641239285469055
0 : 1600 / 2663 0.5953755974769592
0 : 1760 / 2663 0.4976104199886322
0 : 1920 / 2663 0.505456805229187
0 : 2080 / 2663 0.40512987971305847
0 : 2240 / 2663 0.5284277200698853
0 : 2400 / 2663 0.4939476549625397
0 : 2560 / 2663 0.6218875050544739


In [29]:
p_class = [c['comment'] for c in test_comments if c['label'] == 'p']
n_class = [c['comment'] for c in test_comments if c['label'] == 'n']
test_classes = [p_class, n_class]

def predict(s, class_model):
    with torch.no_grad():
        Z = class_model([s])
        return torch.argmax(Z[0]).item()

def testClassifier(class_model):
    confusionMatrix = [[0, 0], [0, 0]]
    for c in range(2):
        for text in test_classes[c]:
            c_MAP = predict(text, class_model)
            confusionMatrix[c][c_MAP] += 1

    sum_positive = sum(confusionMatrix[x][0] for x in range(2))
    precision = confusionMatrix[0][0] / sum_positive
    recall = confusionMatrix[0][0] / len(p_class)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('=================================================================')
    print('Confusion matrix: ')
    for row in confusionMatrix:
        for val in row:
            print('{:4}'.format(val), end = '')
        print()
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))

In [30]:
testClassifier(classModel)

Confusion matrix: 
  35   0
 105   0
Precision: 0.25
Recall: 1.0
F1-score: 0.4


In [None]:
for c in supervised_comments:
    if predict(c['comment'], classModel) == 1:
        print(c['label'], c['comment'])

n ['[STR]', 'на', 'път', 'от', 'село', 'е', 'минал', 'покрай', 'шо', 'н', 'пен.', '..', '[END]']
p ['[STR]', 'ей,', 'хомосексуали', 'ст!', '[END]']
p ['[STR]', 'она', 'е', 'сго', 'дна', 'за', 'ска', 'чка', '[END]']
n ['[STR]', 'да', 'слага', 'край,', 'на', 'пеенето', '.', 'няма', 'деца,', 'като', 'стояна', ',да', 'ги', 'хранту', 'ти.', '[END]']
p ['[STR]', 'виж', 'го', 'тоя', "к'ъв", 'е', 'шмай', 'зер', '.', '[END]']
p ['[STR]', 'мер', 'си', 'ама', 'аз', 'си', 'имам', 'негър', 'с', 'голям', 'чеп', '.', '[END]']
p ['[STR]', 'той', 'е', 'такъв', 'кю', 'лия,', 'че', 'ме', 'отвра', 'щава', 'с', 'поведението', 'си.', '[END]']
n ['[STR]', 'много', 'коп', 'чета', 'и', 'пръстен', 'и?', '!', 'и', 'тези', 'къ', 'дри', 'ци', 'на', 'фолка', 'джий', 'йка', 'май', 'нев', 'ър', 'вой', '?!', '[END]']
n ['[STR]', 'малката', ',', 'мислиш', 'ли,', 'че', 'тези', 'джуки', 'могат', 'да', 'привличат', 'мъжете.', 'да,', 'ама', 'не.', 'те', 'търсят', 'жени', 'с', 'неж', 'ни', 'и', 'чув', 'ствени', 'ус