In [1]:
import torch
import torch.nn as nn
import numpy as np
from preprocessing import cyrillize

from get_embeddings import get_noise_dampening_embedding, get_sub_word_tokenization_embedding, get_fast_text_embedding

import json
import random
from math import floor

# RNN classifier
Comment is read and classified after every 'step' steps. <br>
Training is done on segments with about 'step' length. 

In [2]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{
        'comment': cyrillize(d['comment']),
        'label': d['label'],
        'examples': [cyrillize(e) for e in d['examples']] if 'examples' in d else []
    } for d in json.load(f) if 'label' in d]

In [3]:
class RNNClassifier(torch.nn.Module):
    def __init__(self, hidden_size, classes_count, device, dropout_p=0.1, batch_size=32):
        super(RNNClassifier, self).__init__()
        self.batch_size = batch_size
        self.device = device
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout_p)
        self.lstmCell = nn.LSTMCell(hidden_size, hidden_size)
        self.feedForward = torch.nn.Linear(hidden_size, hidden_size)
        self.classProjection = torch.nn.Linear(hidden_size, classes_count)

    def forward(self, seq):
        batch_size = len(seq)
        seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)
        h = torch.zeros(batch_size, self.hidden_size, device = self.device)
        c = torch.zeros(batch_size, self.hidden_size, device = self.device)
        for t in seq:
            h, c = self.lstmCell(t, (h, c))
        h = self.feedForward(self.dropout(h))
        z = self.classProjection(torch.nn.functional.relu(h))
        return z

    def classify(self, seq, step):
        self.eval()
        with torch.no_grad():
            h = torch.zeros(self.hidden_size, device = self.device)
            c = torch.zeros(self.hidden_size, device = self.device)
            seq = torch.tensor(seq, dtype=torch.float).to(self.device)
            for i, t in enumerate(seq):
                h, c = self.lstmCell(t, (h, c))
                if i+1 % step == 0:
                    h = self.feedForward(self.dropout(h))
                    z = self.classProjection(torch.nn.functional.relu(h))
                    if torch.argmax(z).item() == 1:
                        return True
            h = self.feedForward(self.dropout(h))
            z = self.classProjection(torch.nn.functional.relu(h))
            if torch.argmax(z).item() == 1:
                return True
        return False

    def reset_parameters(self):
        self.lstmCell.reset_parameters()
        self.feedForward.reset_parameters()
        self.classProjection.reset_parameters()

    def groupByLength(self, X, idx):
        from collections import defaultdict

        grouped_dict = defaultdict(lambda: [])
        for id in idx:
            length = len(X[id])
            grouped_dict[length].append(id)
        return grouped_dict

    def fit(self, X, Y, epochs=5):
        ''' X - embeddings of sentences.
            All profane sentences must have their profanity at the end.
            Y - classification of the sentences
        '''
        self.reset_parameters()
        self.train()

        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        Y = np.array(Y)
        for _ in range(epochs):
            groups = list(self.groupByLength(X, idx).values())
            random.shuffle(groups)
            batches = [(
                        [X[i] for i in group[b:min(b+self.batch_size, len(group))]],
                        [Y[i] for i in group[b:min(b+self.batch_size, len(group))]]
                    ) for group in groups
                      for b in range(0, len(group), self.batch_size)]
            random.shuffle(batches)
            for x, y in batches:
                batch = x
                target = torch.tensor(y, dtype = torch.long, device = self.device)
                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
        self.eval()

In [4]:
def gamma(model, embedded_s, step):
    return model.classify(embedded_s, step)

def test_model(model, embedding, testing_set, step):
    tp, fn, fp, tn = 0, 0, 0, 0

    for comment in testing_set:
        embedded_s = embedding(comment['comment'])
        if len(embedded_s) == 0:
            continue
        if gamma(model, embedded_s, step):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def get_profanity_mask(comment: str, profanities: list[str]):
    profanity_mask = [False for _ in comment]
    for profanity in profanities:
        index = comment.find(profanity)
        if index == -1: continue
        for i in range(index, index+len(profanity)):
            profanity_mask[i] = True
    return profanity_mask

def get_non_whitespace_mask(comment: str):
    non_whitespace_mask = [False for _ in comment]
    for i, c in enumerate(comment):
        if not c.isspace():
            non_whitespace_mask[i] = True
    return non_whitespace_mask

def generate_sub_comments(comment: str, step: int, profanities: list[str]):
    profanity_mask = get_profanity_mask(comment, profanities)
    non_whitespace_mask = get_non_whitespace_mask(comment)
    segments_mask = [a or b for (a, b) in zip(profanity_mask, non_whitespace_mask)]
    parts = []
    segments = []
    current_segment = ''
    segment_count = 0
    current_part_profanity = 0
    for char, is_segment, is_profanity in zip(comment, segments_mask, profanity_mask):
        if is_segment:
            current_segment += char
            if is_profanity:
                current_part_profanity = 1
        else:
            if current_segment:
                segments.append(current_segment)
                current_segment = ''
                segment_count += 1

        if segment_count == step:
            parts.append((" ".join(segments), current_part_profanity))
            segments = []
            current_segment = ''
            current_part_profanity = 0
            segment_count = 0

    if current_segment:  # Add the remaining segment
        segments.append(current_segment)
        parts.append((" ".join(segments), current_part_profanity))

    return parts

def k_cross_validation(model, supervised_comments, embedding, k, step, epochs, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    random.shuffle(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = []
        for comment in train_records:
            training_set += [(embedding(c), p_flag) for c, p_flag in generate_sub_comments(comment['comment'], step, comment['examples'])]

        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]

        if balanced_classes and len(positive_train) < floor(p_n_rate*len(negative_train)):
            positive_train = random.choices(positive_train, k=floor(p_n_rate*len(negative_train)))
        print('pos : neg =', len(positive_train), ':', len(negative_train))

        train_sampled_data = positive_train + negative_train

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y, epochs)

        tp, fn, fp, tn = test_model(model, embedding, test_records, step)
        print(f"tp: {tp}, fn: {fn}, fp: {fp}, tn: {tn}")
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

## Experiments

In [5]:
batch_size = 32
input_size = 100
device = 'cuda'

Finding hyperparameters

In [9]:
classModel = RNNClassifier(100, 2, device).to(device)

Sub words with context embedding after spell checking

In [7]:
sub_word_embedding_spell_checked = get_sub_word_tokenization_embedding(input_size, spell_corection=True)

Loading dictionary...
Processing dictionary...
Copied 165258 words to master dictionary...
Copied 1118365 hashes to master dictionary...


In [68]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding_spell_checked, k=10, step=10, epochs=10, balanced_classes=True, p_n_rate=0.5))

3555 7111
tp: 41, fn: 103, fp: 17, tn: 274
3550 7100
tp: 99, fn: 33, fp: 100, tn: 203
3569 7138
tp: 93, fn: 47, fp: 62, tn: 233
3557 7115
tp: 21, fn: 124, fp: 10, tn: 280
3545 7090
tp: 63, fn: 68, fp: 47, tn: 257
3575 7150
tp: 60, fn: 79, fp: 32, tn: 264
3568 7137
tp: 105, fn: 49, fp: 85, tn: 196
3538 7076
tp: 91, fn: 36, fp: 101, tn: 207
3551 7102
tp: 93, fn: 38, fp: 64, tn: 240
3563 7126
tp: 75, fn: 54, fp: 63, tn: 243
Precision: 0.5605143721633888
Recall: 0.5400874635568513
F1-score: 0.5501113585746101
Confusion Matrix:
                Predicted p Predicted n
Actual p          74.100   63.100
Actual n          58.100  239.700


0.5501113585746101

Best step size

In [9]:
best_step = 1
best_f1 = 0
for step in [1, 2, 3, 5, 7, 10, 15, 20]:
    f1 = print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding_spell_checked, k=10, step=step, epochs=10, balanced_classes=True, p_n_rate=0.5))
    if f1 > best_f1:
        best_step = step
        best_f1 = f1

33651 67303


  seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)


tp: 139, fn: 2, fp: 280, tn: 20
33587 67174
tp: 149, fn: 1, fp: 264, tn: 27
34224 68449
tp: 139, fn: 1, fp: 261, tn: 40
33874 67749
tp: 132, fn: 0, fp: 289, tn: 20
33821 67642
tp: 140, fn: 2, fp: 277, tn: 22
33442 66885
tp: 143, fn: 0, fp: 288, tn: 10
33849 67698
tp: 115, fn: 2, fp: 298, tn: 26
33858 67717
tp: 133, fn: 0, fp: 298, tn: 10
33741 67482
tp: 129, fn: 0, fp: 294, tn: 18
33850 67701
tp: 157, fn: 2, fp: 265, tn: 17
37490 74980
tp: 6, fn: 0, fp: 3, tn: 0
Precision: 0.3291259823767564
Recall: 0.992816091954023
F1-score: 0.49436594526918265
Confusion Matrix:
                Predicted p Predicted n
Actual p          72.737    0.526
Actual n         148.263   11.053
16953 33906
tp: 98, fn: 29, fp: 160, tn: 154
17041 34082
tp: 130, fn: 16, fp: 171, tn: 124
16902 33804
tp: 129, fn: 9, fp: 207, tn: 96
16837 33675
tp: 130, fn: 5, fp: 212, tn: 94
17030 34060
tp: 130, fn: 10, fp: 230, tn: 71
16825 33650
tp: 122, fn: 13, fp: 224, tn: 82
16974 33949
tp: 103, fn: 45, fp: 96, tn: 197
16905 3

In [10]:
print(best_step)

7


Best p:n rate

In [11]:
best_p_n_rate = 0.3
best_f1 = 0
for p_n_rate in [0.3, 0.4, 0.5, 0.6, 0.7, 1, 1.5]:
    f1 = print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding_spell_checked, k=10, step=best_step, epochs=10, balanced_classes=True, p_n_rate=p_n_rate))
    if f1 > best_f1:
        best_p_n_rate = p_n_rate
        best_f1 = f1

3019 10065
tp: 45, fn: 105, fp: 19, tn: 272
3010 10034
tp: 38, fn: 99, fp: 22, tn: 282
3011 10038
tp: 20, fn: 117, fp: 7, tn: 297
3006 10023
tp: 23, fn: 110, fp: 9, tn: 299
2999 9999
tp: 9, fn: 131, fp: 7, tn: 294
3012 10043
tp: 66, fn: 69, fp: 59, tn: 247
3020 10069
tp: 81, fn: 61, fp: 71, tn: 228
3013 10044
tp: 11, fn: 116, fp: 7, tn: 307
3019 10066
tp: 58, fn: 82, fp: 54, tn: 247
3046 10155
tp: 42, fn: 107, fp: 19, tn: 273
3343 11144
tp: 1, fn: 1, fp: 0, tn: 7
Precision: 0.5898203592814372
Recall: 0.2830459770114942
F1-score: 0.38252427184466015
Confusion Matrix:
                Predicted p Predicted n
Actual p          20.737   52.526
Actual n          14.421  144.895
4017 10044
tp: 68, fn: 68, fp: 70, tn: 235
4004 10010
tp: 45, fn: 69, fp: 18, tn: 309
4022 10056
tp: 65, fn: 84, fp: 41, tn: 251
4021 10054
tp: 82, fn: 58, fp: 54, tn: 247
4026 10065
tp: 100, fn: 39, fp: 87, tn: 215
4031 10079
tp: 51, fn: 86, fp: 33, tn: 271
4025 10064
tp: 92, fn: 52, fp: 72, tn: 225
3995 9988
tp: 29,

In [12]:
print(best_p_n_rate)

0.7


In [11]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding_spell_checked, k=10, step=7, epochs=10, balanced_classes=True, p_n_rate=0.6))

6005 10009
tp: 107, fn: 21, fp: 129, tn: 184
6054 10090
tp: 102, fn: 51, fp: 70, tn: 218
6048 10080
tp: 79, fn: 65, fp: 69, tn: 228
6051 10086
tp: 74, fn: 63, fp: 60, tn: 244
6024 10041
tp: 107, fn: 37, fp: 104, tn: 193
6025 10043
tp: 127, fn: 13, fp: 167, tn: 134
6038 10064
tp: 67, fn: 74, fp: 40, tn: 260
6015 10026
tp: 95, fn: 46, fp: 88, tn: 212
6010 10018
tp: 64, fn: 62, fp: 34, tn: 281
6047 10079
tp: 71, fn: 66, fp: 50, tn: 254
6686 11144
tp: 0, fn: 1, fp: 4, tn: 4
Precision: 0.5228337236533958
Recall: 0.6415229885057471
F1-score: 0.5761290322580644
Confusion Matrix:
                Predicted p Predicted n
Actual p          47.000   26.263
Actual n          42.895  116.421


0.5761290322580644

Smaller number of epochs

In [12]:
print_test_model(*k_cross_validation(RNNClassifier(100, 2, device).to(device), supervised_comments, sub_word_embedding_spell_checked, k=10, step=7, epochs=5, balanced_classes=True, p_n_rate=0.6))

6048 10081
tp: 112, fn: 36, fp: 95, tn: 198
6039 10066
tp: 94, fn: 39, fp: 99, tn: 209
6004 10007
tp: 78, fn: 53, fp: 68, tn: 242
6042 10071
tp: 108, fn: 34, fp: 120, tn: 179
6028 10048
tp: 68, fn: 74, fp: 50, tn: 249
6008 10014
tp: 102, fn: 22, fp: 124, tn: 193
6059 10099
tp: 51, fn: 97, fp: 25, tn: 268
6030 10051
tp: 104, fn: 32, fp: 96, tn: 209
6039 10065
tp: 46, fn: 92, fp: 32, tn: 271
6022 10037
tp: 109, fn: 37, fp: 84, tn: 211
6684 11141
tp: 2, fn: 2, fp: 0, tn: 5
Precision: 0.5242951409718056
Recall: 0.6278735632183907
F1-score: 0.5714285714285713
Confusion Matrix:
                Predicted p Predicted n
Actual p          46.000   27.263
Actual n          41.737  117.579


0.5714285714285713

Bigger dropout

In [13]:
print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device), supervised_comments, sub_word_embedding_spell_checked, k=10, step=7, epochs=10, balanced_classes=True, p_n_rate=0.6))

5970 9951
tp: 64, fn: 76, fp: 62, tn: 239
5998 9998
tp: 77, fn: 65, fp: 56, tn: 243
6023 10039
tp: 100, fn: 37, fp: 74, tn: 230
6090 10150
tp: 101, fn: 52, fp: 82, tn: 206
6010 10018
tp: 82, fn: 45, fp: 88, tn: 226
6051 10085
tp: 110, fn: 25, fp: 128, tn: 178
6048 10081
tp: 133, fn: 20, fp: 152, tn: 136
6046 10077
tp: 54, fn: 84, fp: 35, tn: 268
6032 10054
tp: 88, fn: 45, fp: 56, tn: 252
6048 10081
tp: 108, fn: 24, fp: 128, tn: 181
6687 11146
tp: 2, fn: 0, fp: 2, tn: 5
Precision: 0.5157126823793491
Recall: 0.6602011494252873
F1-score: 0.579080025204789
Confusion Matrix:
                Predicted p Predicted n
Actual p          48.368   24.895
Actual n          45.421  113.895


0.579080025204789

Finding the best embedding

Sub word embedding

In [6]:
sub_word_embedding = get_sub_word_tokenization_embedding()

In [8]:
print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device),
                                    supervised_comments,
                                    sub_word_embedding,
                                    k=10,
                                    step=7,
                                    epochs=10,
                                    balanced_classes=True,
                                    p_n_rate=0.6))

6037 10063


  seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)


tp: 94, fn: 39, fp: 89, tn: 219
6012 10020
tp: 131, fn: 19, fp: 157, tn: 134
6022 10037
tp: 51, fn: 77, fp: 15, tn: 298
6009 10015
tp: 114, fn: 28, fp: 106, tn: 193
6027 10046
tp: 94, fn: 38, fp: 74, tn: 235
6062 10104
tp: 37, fn: 104, fp: 19, tn: 281
6027 10046
tp: 40, fn: 93, fp: 22, tn: 286
6025 10043
tp: 106, fn: 33, fp: 135, tn: 167
6051 10085
tp: 72, fn: 70, fp: 54, tn: 245
6046 10077
tp: 89, fn: 58, fp: 79, tn: 215
6686 11144
tp: 2, fn: 3, fp: 0, tn: 4
Precision: 0.5253164556962024
Recall: 0.5962643678160919
F1-score: 0.5585464333781964
Confusion Matrix:
                Predicted p Predicted n
Actual p          43.684   29.579
Actual n          39.474  119.842


0.5585464333781964

Noise dampening embedding

In [9]:
noise_dampening_embedding = get_noise_dampening_embedding(input_size, device)

In [10]:
print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device),
                                    supervised_comments,
                                    noise_dampening_embedding,
                                    k=10,
                                    step=7,
                                    epochs=10,
                                    balanced_classes=True,
                                    p_n_rate=0.6))

6030 10050
tp: 80, fn: 46, fp: 68, tn: 247
5968 9948
tp: 77, fn: 85, fp: 31, tn: 248
6115 10192
tp: 0, fn: 146, fp: 0, tn: 295
6010 10018
tp: 106, fn: 33, fp: 132, tn: 170
6001 10003
tp: 65, fn: 68, fp: 28, tn: 280
6043 10073
tp: 98, fn: 41, fp: 109, tn: 193
6062 10104
tp: 67, fn: 72, fp: 31, tn: 270
6025 10043
tp: 81, fn: 63, fp: 30, tn: 267
6033 10055
tp: 48, fn: 63, fp: 20, tn: 309
6025 10042
tp: 97, fn: 55, fp: 71, tn: 217
6691 11152
tp: 0, fn: 1, fp: 1, tn: 7
Precision: 0.5798387096774194
Recall: 0.5165229885057471
F1-score: 0.5463525835866262
Confusion Matrix:
                Predicted p Predicted n
Actual p          37.842   35.421
Actual n          27.421  131.737


0.5463525835866262

Fast text embedding

In [6]:
fast_text_embedding = get_fast_text_embedding()

In [12]:
print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device),
                                    supervised_comments,
                                    fast_text_embedding,
                                    k=10,
                                    step=7,
                                    epochs=10,
                                    balanced_classes=True,
                                    p_n_rate=0.6))

6018 10030
tp: 114, fn: 30, fp: 63, tn: 234
6007 10012
tp: 105, fn: 27, fp: 65, tn: 244
5994 9990
tp: 106, fn: 44, fp: 27, tn: 264
6065 10109
tp: 102, fn: 36, fp: 61, tn: 242
6065 10109
tp: 106, fn: 39, fp: 50, tn: 246
6022 10037
tp: 103, fn: 38, fp: 41, tn: 259
6035 10059
tp: 82, fn: 47, fp: 30, tn: 282
6037 10062
tp: 91, fn: 45, fp: 34, tn: 271
6018 10031
tp: 124, fn: 17, fp: 69, tn: 230
6057 10096
tp: 97, fn: 35, fp: 43, tn: 264
6687 11145
tp: 4, fn: 0, fp: 2, tn: 3
Precision: 0.6807109940750493
Recall: 0.742816091954023
F1-score: 0.7104087942287873
Confusion Matrix:
                Predicted p Predicted n
Actual p          54.421   18.842
Actual n          25.526  133.632


0.7104087942287873

Tuning step size for fast text

In [8]:
best_step = 1
best_f1 = 0
for step in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    f1 = print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device),
                                    supervised_comments,
                                    fast_text_embedding,
                                    k=10,
                                    step=step,
                                    epochs=10,
                                    balanced_classes=True,
                                    p_n_rate=0.6))
    if f1 > best_f1:
        best_step = step
        best_f1 = f1

40527 67546


  seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)


tp: 144, fn: 1, fp: 276, tn: 20
40513 67523
tp: 145, fn: 2, fp: 269, tn: 25
40539 67566
tp: 138, fn: 2, fp: 282, tn: 19
40391 67319
tp: 128, fn: 0, fp: 296, tn: 16
40558 67597
tp: 123, fn: 2, fp: 288, tn: 27
40717 67863
tp: 136, fn: 11, fp: 204, tn: 90
40558 67597
tp: 146, fn: 0, fp: 282, tn: 13
40479 67465
tp: 131, fn: 2, fp: 289, tn: 19
40620 67701
tp: 140, fn: 0, fp: 273, tn: 27
40600 67667
tp: 135, fn: 3, fp: 288, tn: 15
44961 74936
tp: 3, fn: 0, fp: 6, tn: 0
Precision: 0.3321203299369238
Recall: 0.9834770114942528
F1-score: 0.4965542256075444
Confusion Matrix:
                Predicted p Predicted n
Actual p          72.053    1.211
Actual n         144.895   14.263
20308 33848
tp: 135, fn: 3, fp: 250, tn: 53
20367 33946
tp: 133, fn: 1, fp: 252, tn: 54
20375 33959
tp: 97, fn: 31, fp: 108, tn: 205
20196 33660
tp: 136, fn: 4, fp: 242, tn: 59
20330 33884
tp: 131, fn: 8, fp: 219, tn: 83
20328 33881
tp: 133, fn: 6, fp: 236, tn: 66
20234 33724
tp: 134, fn: 2, fp: 273, tn: 32
20464 34107

Grid search for step size and p_n_rate

In [8]:
best_step_p_n_rate = (7, 0.7)
best_f1 = 0
for step in [5, 7, 10, 12, 15]:
    for p_n_rate in [0.5, 0.7, 1, 1.2]:
        f1 = print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device),
                                        supervised_comments,
                                        fast_text_embedding,
                                        k=10,
                                        step=step,
                                        epochs=10,
                                        balanced_classes=True,
                                        p_n_rate=p_n_rate))
        if f1 > best_f1:
            best_step_p_n_rate = (step, p_n_rate)
            best_f1 = f1

pos : neg = 6855 : 13711
tp: 113, fn: 34, fp: 38, tn: 256
pos : neg = 6910 : 13821
tp: 118, fn: 19, fp: 115, tn: 189
pos : neg = 6894 : 13788
tp: 111, fn: 19, fp: 104, tn: 206
pos : neg = 6971 : 13943
tp: 126, fn: 28, fp: 113, tn: 174
pos : neg = 6846 : 13693
tp: 90, fn: 36, fp: 76, tn: 239
pos : neg = 6865 : 13731
tp: 115, fn: 14, fp: 106, tn: 205
pos : neg = 6883 : 13766
tp: 126, fn: 11, fp: 129, tn: 175
pos : neg = 6873 : 13747
tp: 127, fn: 24, fp: 83, tn: 207
pos : neg = 6923 : 13847
tp: 115, fn: 27, fp: 48, tn: 250
pos : neg = 6869 : 13738
tp: 91, fn: 44, fp: 46, tn: 260
pos : neg = 7642 : 15285
tp: 2, fn: 2, fp: 1, tn: 4
Precision: 0.5689914701455093
Recall: 0.8146551724137931
F1-score: 0.6700147710487445
Confusion Matrix:
                Predicted p Predicted n
Actual p          59.684   13.579
Actual n          45.211  113.947
pos : neg = 9566 : 13667
tp: 103, fn: 19, fp: 100, tn: 218
pos : neg = 9643 : 13777
tp: 84, fn: 42, fp: 50, tn: 265
pos : neg = 9681 : 13831
tp: 125, fn:

In [9]:
print(best_step_p_n_rate)

(15, 0.7)


In [11]:
best_f1 = 0
for step in [14, 15, 17, 20, 25]:
    for p_n_rate in [0.6, 0.7, 0.8]:
        f1 = print_test_model(*k_cross_validation(RNNClassifier(100, 2, device, dropout_p=0.3).to(device),
                                        supervised_comments,
                                        fast_text_embedding,
                                        k=10,
                                        step=step,
                                        epochs=10,
                                        balanced_classes=True,
                                        p_n_rate=p_n_rate))
        if f1 > best_f1:
            best_step_p_n_rate = (step, p_n_rate)
            best_f1 = f1

pos : neg = 3291 : 5486
tp: 115, fn: 23, fp: 47, tn: 256
pos : neg = 3300 : 5500
tp: 100, fn: 37, fp: 40, tn: 264
pos : neg = 3288 : 5481
tp: 113, fn: 20, fp: 69, tn: 238
pos : neg = 3284 : 5474
tp: 106, fn: 33, fp: 52, tn: 249
pos : neg = 3282 : 5470
tp: 105, fn: 18, fp: 84, tn: 234
pos : neg = 3307 : 5512
tp: 81, fn: 60, fp: 29, tn: 271
pos : neg = 3307 : 5512
tp: 115, fn: 35, fp: 46, tn: 245
pos : neg = 3293 : 5489
tp: 94, fn: 42, fp: 38, tn: 267
pos : neg = 3298 : 5498
tp: 103, fn: 41, fp: 31, tn: 266
pos : neg = 3288 : 5480
tp: 113, fn: 35, fp: 50, tn: 242
pos : neg = 3652 : 6088
tp: 0, fn: 3, fp: 0, tn: 6
Precision: 0.6825604180274331
Recall: 0.7507183908045977
F1-score: 0.715018816284639
Confusion Matrix:
                Predicted p Predicted n
Actual p          55.000   18.263
Actual n          25.579  133.579
pos : neg = 3859 : 5514
tp: 90, fn: 43, fp: 27, tn: 280
pos : neg = 3831 : 5473
tp: 96, fn: 49, fp: 29, tn: 267
pos : neg = 3854 : 5507
tp: 109, fn: 31, fp: 73, tn: 228
p

In [13]:
print(best_step_p_n_rate)
print(best_f1)

(14, 0.8)
0.718228279386712


Demonstration

In [7]:
model = RNNClassifier(100, 2, device, dropout_p=0.3).to(device)
step = 14
p_n_rate = 0.8

training_set = []
for comment in supervised_comments:
    training_set += [(fast_text_embedding(c), p_flag) for c, p_flag in generate_sub_comments(comment['comment'], step, comment['examples'])]

positive_train = [a for a in training_set if a[1] == 1]
negative_train = [a for a in training_set if a[1] == 0]

positive_train = random.choices(positive_train, k=floor(p_n_rate*len(negative_train)))
print('pos : neg =', len(positive_train), ':', len(negative_train))

train_sampled_data = positive_train + negative_train

train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
model.fit(train_x, train_y, 10)

pos : neg = 4879 : 6099


  seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)


In [19]:
model.classify(fast_text_embedding('Аз искам да видя ... '), 14)

False