In [1]:
import torch
import torch.nn as nn
import numpy as np
from preprocessing import cyrillize

from get_embeddings import get_noise_dampening_embedding, get_sub_word_tokenization_embedding

import json
import random
from math import floor

# RNN classifier
Comment is marked as profane by classifying all prefixes of the comment. <br>
Training is done on prefixes ending with profanity, or whole sentences with no profanity. 

In [2]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{
        'comment': cyrillize(d['comment']),
        'label': d['label'],
        'examples': [cyrillize(e) for e in d['examples']] if 'examples' in d else None
    } for d in json.load(f) if 'label' in d]

In [46]:
class RNNClassifier(torch.nn.Module):
    def __init__(self, input_size, hidden_size, classesCount, device, dropout_p=0.1, batch_size=32):
        super(RNNClassifier, self).__init__()
        self.batch_size = batch_size
        self.device = device
        self.hidden_size = hidden_size
        # self.lstmCell = nn.LSTMCell(input_size, hidden_size)
        # self.gruCell = nn.GRUCell(input_size, hidden_size)
        self.rnnCell = nn.RNNCell(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.classProjection = torch.nn.Linear(hidden_size, classesCount)

    def forward(self, seq):
        batch_size = len(seq)
        seq = torch.transpose(torch.tensor(seq), 0, 1)
        h = torch.zeros(batch_size, self.hidden_size, device = self.device)
        # c = torch.zeros(batch_size, self.hidden_size, device = self.device)
        seq = torch.tensor(seq, dtype=torch.float).to(self.device)
        for t in seq:
            # h, c = self.lstmCell(t, (h, c))
            h = self.rnnCell(t, h)
        z = self.classProjection(self.dropout(h))
        return z

    def classify(self, seq, k=3):
        self.eval()
        with torch.no_grad():
            h = torch.zeros(self.hidden_size, device = self.device)
            # c = torch.zeros(self.hidden_size, device = self.device)
            seq = torch.tensor(seq, dtype=torch.float).to(self.device)
            for i, t in enumerate(seq):
                # h, c = self.lstmCell(t, (h, c))
                h = self.rnnCell(t, h)
                if i >= k:
                    z = self.classProjection(self.dropout(h))
                    if torch.argmax(z).item() == 1:
                        return True
            z = self.classProjection(self.dropout(h))
            if torch.argmax(z).item() == 1:
                return True
        return False

    def groupByLength(self, X, idx):
        from collections import defaultdict

        grouped_dict = defaultdict(lambda: [])
        for id in idx:
            length = len(X[id])
            grouped_dict[length].append(id)
        return grouped_dict

    def fit(self, X, Y, epochs=5):
        ''' X - embeddings of sentences.
            All profane sentences must have their profanity at the end.
            Y - classification of the sentences
        '''
        self.rnnCell.reset_parameters()
        self.classProjection.reset_parameters()
        self.train()

        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        Y = np.array(Y)
        for _ in range(epochs):
            groups = list(self.groupByLength(X, idx).values())
            random.shuffle(groups)
            batches = [(
                        [X[i] for i in group[b:min(b+self.batch_size, len(group))]],
                        [Y[i] for i in group[b:min(b+self.batch_size, len(group))]]
                    ) for group in groups
                      for b in range(0, len(group), self.batch_size)]
            random.shuffle(batches)
            for x, y in batches:
                batch = x
                target = torch.tensor(y, dtype = torch.long, device = self.device)
                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
        self.eval()

In [4]:
def gamma(model, embedded_s):
    return model.classify(embedded_s)

def test_model(model, embedding, testing_set):
    tp, fn, fp, tn = 0, 0, 0, 0

    for comment in testing_set:
        embedded_s = embedding(comment['comment'])
        if len(embedded_s) == 0:
            continue
        if gamma(model, embedded_s):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def generate_sub_comments(comment: str, profanities):
    result = []
    for profanity in profanities:
        index = comment.find(profanity)
        if index == -1:
            print(comment, profanity)
        assert(index != -1)
        result.append(comment[:index + len(profanity)])

    return result

def k_cross_validation(model, supervised_comments, embedding, k, balanced_classes: bool = False, p_n_rate = 1.0):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = []
        for comment in train_records:
            if comment['label'] == 'p':
                training_set += [(embedding(c), 1) for c in generate_sub_comments(comment['comment'], comment['examples'])]
            else:
                training_set.append((embedding(comment['comment']), 0))

        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]

        train_sampled_data = positive_train + negative_train
        if balanced_classes:
            train_sampled_data = negative_train
            train_sampled_data += random.choices(positive_train, k=floor(p_n_rate*len(positive_train)))

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y)

        tp, fn, fp, tn = test_model(model, embedding, test_records)
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

In [5]:
batch_size = 32
input_size = 100
device = 'cuda'

sub_word_embedding = get_sub_word_tokenization_embedding(input_size)
noise_dampening_embedding = get_noise_dampening_embedding(input_size, device)

In [48]:
classModel = RNNClassifier(100, 100, 2, device).to(device)

lstm

In [55]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, balanced_classes=False))

1619 2238


  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


1665 2204
1653 2218
1652 2206
1667 2197
1676 2202
1667 2193
1635 2221
1539 2269
1341 2356
1786 2476
Precision: 0.3492852703542573
Recall: 0.9714779602420052
F1-score: 0.5138285714285715
Confusion Matrix:
                Predicted p Predicted n
Actual p          74.933    2.200
Actual n         139.600   25.600


0.5138285714285715

gru

In [63]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, balanced_classes=False))

  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


Precision: 0.34713076199435555
Recall: 0.9567847882454624
F1-score: 0.5094339622641508
Confusion Matrix:
                Predicted p Predicted n
Actual p          73.800    3.333
Actual n         138.800   26.400


0.5094339622641508

rnn

In [66]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, balanced_classes=False))

  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


Precision: 0.3400770712909441
Recall: 0.9152981849611064
F1-score: 0.49590259892296884
Confusion Matrix:
                Predicted p Predicted n
Actual p          70.600    6.533
Actual n         137.000   28.200


0.49590259892296884

rnn without dropout on test

In [7]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, balanced_classes=False))

Precision: 0.3374681933842239
Recall: 0.9170267934312878
F1-score: 0.4933736340385956
Confusion Matrix:
                Predicted p Predicted n
Actual p          70.733    6.400
Actual n         138.867   26.333


0.4933736340385956

In [57]:
classModel = RNNClassifier(100, 100, 2, device).to(device)
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, balanced_classes=False))

  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


Precision: 0.33688186813186816
Recall: 0.8478824546240277
F1-score: 0.4821823543868273
Confusion Matrix:
                Predicted p Predicted n
Actual p          65.400   11.733
Actual n         128.733   36.467


0.4821823543868273

Shuffled groups

In [None]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, balanced_classes=False))

In [49]:
train_records = supervised_comments
training_set = []
for comment in train_records:
    if comment['label'] == 'p':
        training_set += [(sub_word_embedding(c), 1) for c in generate_sub_comments(comment['comment'], comment['examples'])]
    else:
        training_set.append((sub_word_embedding(comment['comment']), 0))

train_x, train_y = [a[0] for a in training_set], [a[1] for a in training_set]
classModel.fit(train_x, train_y)

  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


In [45]:
classModel.eval()
classModel.forward([sub_word_embedding('Дърти мангали проститутки')])

  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


tensor([[-1.3101,  1.4221]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [56]:
classModel.classify(sub_word_embedding('Какво става мила, много си хубава днес, да ти еба майката.'))

True

Noise dampening embedding

In [None]:
print_test_model(*k_cross_validation(classModel, supervised_comments, noise_dampening_embedding, k=10, balanced_classes=False))