In [1]:
import torch
import torch.nn as nn
import numpy as np
from preprocessing import cyrillize

from get_embeddings import get_noise_dampening_embedding, get_sub_word_tokenization_embedding

import json
import random
from math import floor

In [2]:
import json
from symspell import SymSpell
from preprocessing import preprocess

with open('data/unsupervised_comments.json', 'r', encoding="utf-8") as f:
    unsupervised_comments = json.load(f)

ss = SymSpell(max_dictionary_edit_distance=2)
for comment in unsupervised_comments:
    for token in preprocess(comment):
        ss._create_dictionary_entry(token, 1)

ss.save_complete_model_as_json('data/symspell_model_unsupervised.json')

Saving dictionary...
Saved dictionary...


# RNN classifier
Comment is marked as profane by classifying all prefixes of the comment. <br>
Training is done on prefixes before profanity, prefixes ending with profanity, or whole sentences with no profanity. 

In [3]:
with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
    supervised_comments = [{
        'comment': cyrillize(d['comment']),
        'label': d['label'],
        'examples': [cyrillize(e) for e in d['examples']] if 'examples' in d else None
    } for d in json.load(f) if 'label' in d]

In [4]:
class RNNClassifier(torch.nn.Module):
    def __init__(self, hidden_size, classesCount, device, learnableEmbedding=False, vocab_size=0, dropout_p=0.1, batch_size=32):
        super(RNNClassifier, self).__init__()
        self.batch_size = batch_size
        self.device = device
        self.hidden_size = hidden_size
        self.learnableEmbedding = learnableEmbedding
        if self.learnableEmbedding:
            self.embed = torch.nn.Embedding(vocab_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gruCell = nn.GRUCell(hidden_size, hidden_size)
        self.feedForward = torch.nn.Linear(hidden_size, hidden_size)
        self.classProjection = torch.nn.Linear(hidden_size, classesCount)

    def forward(self, seq):
        batch_size = len(seq)
        if self.learnableEmbedding:
            seq = torch.stack([self.embed(s.to(self.device)) for s in seq])
        seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)
        h = torch.zeros(batch_size, self.hidden_size, device = self.device)
        for t in seq:
            h = self.gruCell(t, h)
        h = self.feedForward(self.dropout(h))
        z = self.classProjection(torch.nn.functional.relu(h))
        return z

    def classify(self, seq):
        self.eval()
        with torch.no_grad():
            h = torch.zeros(self.hidden_size, device = self.device)
            if self.learnableEmbedding:
                seq = self.embed(seq.to(self.device))
            seq = torch.tensor(seq, dtype=torch.float).to(self.device)
            for t in seq:
                h = self.gruCell(t, h)
                h = self.feedForward(self.dropout(h))
                z = self.classProjection(torch.nn.functional.relu(h))
                if torch.argmax(z).item() == 1:
                    return True
            h = self.feedForward(self.dropout(h))
            z = self.classProjection(torch.nn.functional.relu(h))
            if torch.argmax(z).item() == 1:
                return True
        return False

    def reset_parameters(self):
        if self.learnableEmbedding:
            self.embed.reset_parameters()
        self.gruCell.reset_parameters()
        self.feedForward.reset_parameters()
        self.classProjection.reset_parameters()

    def groupByLength(self, X, idx):
        from collections import defaultdict

        grouped_dict = defaultdict(lambda: [])
        for id in idx:
            length = len(X[id])
            grouped_dict[length].append(id)
        return grouped_dict

    def fit(self, X, Y, epochs=5):
        ''' X - embeddings of sentences.
            All profane sentences must have their profanity at the end.
            Y - classification of the sentences
        '''
        self.reset_parameters()
        self.train()

        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=0.0002)
        idx = np.arange(len(X), dtype='int32')
        Y = np.array(Y)
        for _ in range(epochs):
            groups = list(self.groupByLength(X, idx).values())
            random.shuffle(groups)
            batches = [(
                        [X[i] for i in group[b:min(b+self.batch_size, len(group))]],
                        [Y[i] for i in group[b:min(b+self.batch_size, len(group))]]
                    ) for group in groups
                      for b in range(0, len(group), self.batch_size)]
            random.shuffle(batches)
            for x, y in batches:
                batch = x
                target = torch.tensor(y, dtype = torch.long, device = self.device)
                Z = self.forward(batch)
                H = torch.nn.functional.cross_entropy(Z,target)

                optimizer.zero_grad()
                H.backward()
                optimizer.step()
        self.eval()

In [5]:
def gamma(model, embedded_s):
    return model.classify(embedded_s)

def test_model(model, embedding, testing_set):
    tp, fn, fp, tn = 0, 0, 0, 0

    for comment in testing_set:
        embedded_s = embedding(comment['comment'])
        if len(embedded_s) == 0:
            continue
        if gamma(model, embedded_s):
            if comment['label'] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment['label'] == 'p':
                fn += 1
            else:
                tn += 1
    return tp, fn, fp, tn

def print_test_model(tp, fn, fp, tn):
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    Fscore = (2.0 * precision * recall) / (precision + recall)
    print('Precision: '+str(precision))
    print('Recall: '+str(recall))
    print('F1-score: '+str(Fscore))
    print('Confusion Matrix:')
    print('{:15} {:>8} {:>8}'.format('', 'Predicted p', 'Predicted n'))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual p', tp, fn))
    print('{:15} {:>8.3f} {:>8.3f}'.format('Actual n', fp, tn))
    return Fscore

def generate_sub_comments(comment: str, profanities):
    result = []
    first_profanity_index = len(comment)
    for profanity in profanities:
        index = comment.find(profanity)
        if index < first_profanity_index:
            first_profanity_index = index
        if index == -1:
            print(comment, profanity)
            assert(index != -1)
        result.append(comment[:index + len(profanity)])

    return comment[:first_profanity_index], result

def generate_sub_comments_without_profanity(comment: str, skip=3):
    result = []
    for i in range(skip, len(comment), skip):
        result.append(comment[:i])
    return result

def k_cross_validation(model, supervised_comments, embedding, k, epochs, balanced_classes: bool = False, p_n_rate = 1.0,):
    n = len(supervised_comments)
    m = n//k
    t = n//m + n%m

    tps, fns, fps, tns = 0, 0, 0, 0

    for i in range(0, n, m):
        test_records = supervised_comments[i:i+m]
        train_records = supervised_comments[0:i] + supervised_comments[i+m:n]
        training_set = []
        for comment in train_records:
            if comment['label'] == 'p':
                before_profanity, profanities = generate_sub_comments(comment['comment'], comment['examples'])
                if len(before_profanity) > 0:
                    training_set.append((embedding(before_profanity), 0))
                training_set += [(embedding(c), 1) for c in profanities]
            else:
                training_set += [(embedding(c), 0) for c in generate_sub_comments_without_profanity(comment['comment'])]

        positive_train = [a for a in training_set if a[1] == 1]
        negative_train = [a for a in training_set if a[1] == 0]

        train_sampled_data = positive_train + negative_train
        if balanced_classes:
            train_sampled_data = negative_train
            train_sampled_data += random.choices(positive_train, k=floor(p_n_rate*len(positive_train)))

        train_x, train_y = [a[0] for a in train_sampled_data], [a[1] for a in train_sampled_data]
        model.fit(train_x, train_y, epochs)

        tp, fn, fp, tn = test_model(model, embedding, test_records)
        print(f"tp: {tp}, fn: {fn}, fp: {fp}, tn: {tn}")
        tps += tp
        fns += fn
        fps += fp
        tns += tn

    return tps/t, fns/t, fps/t, tns/t

## Experiments

In [6]:
batch_size = 32
input_size = 100
device = 'cuda'

Sub words with learnable embedding

In [7]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer_comments.json")
sub_word_encoding = lambda x: torch.tensor(tokenizer.encode(x.lower()).ids, dtype=torch.long)

classModel = RNNClassifier(100, 2, device, learnableEmbedding=True, vocab_size=tokenizer.get_vocab_size()).to(device)
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_encoding, k=10, epochs=10, balanced_classes=False))

  seq = torch.transpose(torch.tensor(seq, dtype=torch.float), 0, 1).to(self.device)
  seq = torch.tensor(seq, dtype=torch.float).to(self.device)


tp: 0, fn: 139, fp: 0, tn: 269


: 

Sub words with context embedding

In [None]:
sub_word_embedding = get_sub_word_tokenization_embedding(input_size)

In [None]:
classModel = RNNClassifier(100, 2, device).to(device)

In [None]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding, k=10, epochs=10, balanced_classes=False))

tp: 127, fn: 10, fp: 235, tn: 31
tp: 101, fn: 0, fp: 301, tn: 1
tp: 110, fn: 0, fp: 293, tn: 0
tp: 103, fn: 0, fp: 300, tn: 0
tp: 83, fn: 3, fp: 310, tn: 7
tp: 90, fn: 0, fp: 313, tn: 0
tp: 93, fn: 1, fp: 306, tn: 3
tp: 173, fn: 0, fp: 230, tn: 0
tp: 258, fn: 0, fp: 145, tn: 0
tp: 46, fn: 129, fp: 38, tn: 190
tp: 3, fn: 0, fp: 2, tn: 0
Precision: 0.3243169398907104
Recall: 0.8924812030075189
F1-score: 0.47575150300601204
Confusion Matrix:
                Predicted p Predicted n
Actual p          79.133    9.533
Actual n         164.867   15.467


0.47575150300601204

Sub words with context embedding after spell checking

In [None]:
sub_word_embedding_spell_checked = get_sub_word_tokenization_embedding(input_size, spell_corection=True)

Loading dictionary...
Processing dictionary...
Copied 165258 words to master dictionary...
Copied 1118365 hashes to master dictionary...


In [None]:
print_test_model(*k_cross_validation(classModel, supervised_comments, sub_word_embedding_spell_checked, k=10, epochs=10, balanced_classes=False))

tp: 139, fn: 0, fp: 269, tn: 0
tp: 101, fn: 0, fp: 307, tn: 0


: 

Noise dampening embedding

In [None]:
noise_dampening_embedding = get_noise_dampening_embedding(input_size, device)

In [None]:
print_test_model(*k_cross_validation(classModel, supervised_comments, noise_dampening_embedding, epochs=10, k=10, balanced_classes=False))

tp: 90, fn: 47, fp: 104, tn: 162
tp: 90, fn: 11, fp: 272, tn: 30
tp: 95, fn: 15, fp: 181, tn: 112
tp: 103, fn: 0, fp: 285, tn: 14
tp: 76, fn: 10, fp: 231, tn: 86
tp: 83, fn: 7, fp: 246, tn: 67
tp: 92, fn: 2, fp: 304, tn: 5
tp: 152, fn: 21, fp: 167, tn: 61
tp: 258, fn: 0, fp: 141, tn: 4
tp: 175, fn: 0, fp: 228, tn: 0
tp: 3, fn: 0, fp: 2, tn: 0
Precision: 0.3602723505032564
Recall: 0.9150375939849624
F1-score: 0.5169923534409516
Confusion Matrix:
                Predicted p Predicted n
Actual p          81.133    7.533
Actual n         144.067   36.067


0.5169923534409516