In [1]:
import json
from distance import Edit_suggester
from preprocessing import preprocess
import random
from nltk.tokenize import regexp_tokenize
import pandas as pd

Creating the dictionary and the edit suggester

In [2]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding="utf-8") as json_file:
        return json.load(json_file)


# General vocabulary: the first element of every entry in the vocabulary file.
vocabulary = {entry[0] for entry in _read_json('data/articles_cleaned_vocabulary.json')}

# Jargon entries are split into tokens; the pattern matches runs of digits or
# runs of the lowercase Cyrillic letters а-я.
pattern = r'([0-9]+|[а-я]+)'
bgjargon_dict = {
    token
    for word in _read_json('data/bgjargon_words.json')
    for token in regexp_tokenize(word, pattern)
}

# Four alternative profanity lists that are compared against each other below.
bad_words_original = set(_read_json('data/bad_words_translated.json'))
bad_words_bgjargon_1 = set(_read_json('data/bad_words_1.json'))
bad_words_bgjargon_2 = set(_read_json('data/bad_words_2.json'))
bad_words_bgjargon_3 = set(_read_json('data/bad_words_3.json'))

# The suggester's dictionary is the union of the general vocabulary, the jargon
# tokens and the original bad-words list.
# NOTE(review): the first argument (2) is presumably the n-gram size - confirm
# against Edit_suggester's definition.
edit_suggester = Edit_suggester(2, vocabulary | (bgjargon_dict | bad_words_original))

Loading the comments

In [3]:
# Comments to be annotated automatically; later cells read the 'comment' and
# 'author' keys of every record.
with open('data/blitz_comments_v2.json', 'r', encoding="utf-8") as comments_file:
    comments = json.loads(comments_file.read())

Loading the manually classified comments

In [4]:
with open('data/blitz_comments_classified.json', 'r', encoding="utf-8") as classified_file:
    classified_records = json.load(classified_file)

# Keep only the records with more than two fields: the label is read from the
# third position (index 2) further down.
gold_comments = [record for record in classified_records if len(record) > 2]

## Classifying as profane if one of the best suggestions is profane

In [5]:
# Split the gold standard by label ('p' / 'n' in the third field of a record).
gold_comments_p = [c for c in gold_comments if c[2] == 'p']
gold_comments_n = [c for c in gold_comments if c[2] == 'n']

# Balance the evaluation set: take the same number of comments from each class.
min_len = min(len(gold_comments_p), len(gold_comments_n))

gold_comments_sample = gold_comments_p[:min_len] + gold_comments_n[:min_len]

# Shuffle with a dedicated, seeded generator so that the order of the sample
# (and therefore the error tables built from it) is the same on every
# Restart & Run All, without touching the global random state.
SAMPLE_SHUFFLE_SEED = 42
random.Random(SAMPLE_SHUFFLE_SEED).shuffle(gold_comments_sample)

In [6]:
def score_of_word_similarity_annotation_with_one_of_suggestions(edit_tolerance, bad_words) -> tuple[float, float]:
    """Score the "any suggestion is a bad word" classifier on the gold sample.

    A comment is classified as 'p' when, for at least one of its preprocessed
    tokens, at least one word returned by
    ``edit_suggester.ngram_all_closest_words`` is contained in ``bad_words``.

    Args:
        edit_tolerance: forwarded as the third argument of
            ``ngram_all_closest_words``.
        bad_words: container of words treated as profane (membership is
            tested with ``in``).

    Returns:
        ``(precision, recall)`` measured over the global
        ``gold_comments_sample``. A metric whose denominator is zero (nothing
        was flagged, or the sample has no 'p' comments) is reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    def auto_classification_as_p(comment) -> bool:
        # comment[0] is the text that gets tokenised; stops at the first hit.
        # NOTE(review): 0.25 is presumably a similarity threshold - confirm
        # against Edit_suggester.
        return any(
            suggestion in bad_words
            for token in preprocess(comment[0])
            for suggestion in edit_suggester.ngram_all_closest_words(token, 0.25, edit_tolerance)
        )

    tp, fn, fp = 0, 0, 0
    for comment in gold_comments_sample:
        labelled_p = comment[2] == 'p'
        if auto_classification_as_p(comment):
            if labelled_p:
                tp += 1
            else:
                fp += 1
        elif labelled_p:
            fn += 1

    # Guard the degenerate cases instead of dividing by zero.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return precision, recall

In [7]:
# Sweep every bad-words list against every edit tolerance and tabulate the scores.
edit_tolerance = [0, 1, 2]
bad_words_lists = [bad_words_original, bad_words_bgjargon_1, bad_words_bgjargon_2, bad_words_bgjargon_3]
results = {
        "Bad words set no.": [],
        "Tolerance": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
for i, bad_words in enumerate(bad_words_lists):
    for t in edit_tolerance:
        precision, recall = score_of_word_similarity_annotation_with_one_of_suggestions(t, bad_words)
        # F1 is the harmonic mean of precision and recall; report 0.0 when both
        # are zero instead of failing with ZeroDivisionError.
        f1 = 2*precision*recall/(precision + recall) if precision + recall else 0.0

        # Sets are numbered from 1 in list order, so "bad_words_1" is bad_words_original.
        print(f"bad_words_{i+1} tolerance={t} -> {f1, precision, recall}")
        results["Bad words set no."].append(i+1)
        results["Tolerance"].append(t)
        results["F1"].append(f1)
        results["Precision"].append(precision)
        results["Recall"].append(recall)
one_suggestion_results = pd.DataFrame(results)
one_suggestion_results

bad_words_1 tolerance=0 -> (0.3673469387755102, 0.972972972972973, 0.22641509433962265)
bad_words_1 tolerance=1 -> (0.557345971563981, 0.7016706443914081, 0.46226415094339623)
bad_words_1 tolerance=2 -> (0.6153846153846153, 0.5426170468187275, 0.710691823899371)
bad_words_2 tolerance=0 -> (0.5444444444444445, 0.928030303030303, 0.38522012578616355)
bad_words_2 tolerance=1 -> (0.6362515413070283, 0.5233265720081136, 0.8113207547169812)
bad_words_2 tolerance=2 -> (0.6503884572697003, 0.5025728987993139, 0.9213836477987422)
bad_words_3 tolerance=0 -> (0.5834230355220669, 0.9249146757679181, 0.4261006289308176)
bad_words_3 tolerance=1 -> (0.6385033192516597, 0.5181194906953966, 0.8317610062893082)
bad_words_3 tolerance=2 -> (0.6523415977961433, 0.5021204410517388, 0.9308176100628931)
bad_words_4 tolerance=0 -> (0.5850052798310454, 0.8906752411575563, 0.43553459119496857)
bad_words_4 tolerance=1 -> (0.6419161676646706, 0.5183752417794971, 0.8427672955974843)
bad_words_4 tolerance=2 -> (0.65

Unnamed: 0,Bad words set no.,Tolerance,F1,Precision,Recall
0,1,0,0.367347,0.972973,0.226415
1,1,1,0.557346,0.701671,0.462264
2,1,2,0.615385,0.542617,0.710692
3,2,0,0.544444,0.92803,0.38522
4,2,1,0.636252,0.523327,0.811321
5,2,2,0.650388,0.502573,0.921384
6,3,0,0.583423,0.924915,0.426101
7,3,1,0.638503,0.518119,0.831761
8,3,2,0.652342,0.50212,0.930818
9,4,0,0.585005,0.890675,0.435535


In [8]:
def score_of_word_similarity_annotation_with_one_of_suggestions(edit_tolerance, bad_words) -> tuple[float, float]:
    """Score the "any suggestion is a bad word" classifier with aggressive preprocessing.

    This re-defines the function of the same name declared earlier in the
    notebook; the only difference is that tokens come from
    ``preprocess(..., aggressive=True)``.

    A comment is classified as 'p' when, for at least one of its preprocessed
    tokens, at least one word returned by
    ``edit_suggester.ngram_all_closest_words`` is contained in ``bad_words``.

    Args:
        edit_tolerance: forwarded as the third argument of
            ``ngram_all_closest_words``.
        bad_words: container of words treated as profane (membership is
            tested with ``in``).

    Returns:
        ``(precision, recall)`` measured over the global
        ``gold_comments_sample``. A metric whose denominator is zero (nothing
        was flagged, or the sample has no 'p' comments) is reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    def auto_classification_as_p(comment) -> bool:
        # comment[0] is the text that gets tokenised; stops at the first hit.
        # NOTE(review): 0.25 is presumably a similarity threshold - confirm
        # against Edit_suggester.
        return any(
            suggestion in bad_words
            for token in preprocess(comment[0], aggressive=True)
            for suggestion in edit_suggester.ngram_all_closest_words(token, 0.25, edit_tolerance)
        )

    tp, fn, fp = 0, 0, 0
    for comment in gold_comments_sample:
        labelled_p = comment[2] == 'p'
        if auto_classification_as_p(comment):
            if labelled_p:
                tp += 1
            else:
                fp += 1
        elif labelled_p:
            fn += 1

    # Guard the degenerate cases instead of dividing by zero.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return precision, recall

In [9]:
# Repeat the sweep for tolerance 0 only, using the scorer defined in the
# previous cell (aggressive preprocessing).
edit_tolerance = [0]
bad_words_lists = [bad_words_original, bad_words_bgjargon_1, bad_words_bgjargon_2, bad_words_bgjargon_3]
results = {
        "Bad words set no.": [],
        "Tolerance": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
for i, bad_words in enumerate(bad_words_lists):
    for t in edit_tolerance:
        precision, recall = score_of_word_similarity_annotation_with_one_of_suggestions(t, bad_words)
        # F1 is the harmonic mean of precision and recall; report 0.0 when both
        # are zero instead of failing with ZeroDivisionError.
        f1 = 2*precision*recall/(precision + recall) if precision + recall else 0.0

        # Sets are numbered from 1 in list order, so "bad_words_1" is bad_words_original.
        print(f"bad_words_{i+1} tolerance={t} -> {f1, precision, recall}")
        results["Bad words set no."].append(i+1)
        results["Tolerance"].append(t)
        results["F1"].append(f1)
        results["Precision"].append(precision)
        results["Recall"].append(recall)
one_suggestion_results_aggressive_preprocessing = pd.DataFrame(results)
one_suggestion_results_aggressive_preprocessing

bad_words_1 tolerance=0 -> (0.36153846153846153, 0.9791666666666666, 0.22169811320754718)
bad_words_2 tolerance=0 -> (0.5478547854785479, 0.9120879120879121, 0.3915094339622642)
bad_words_3 tolerance=0 -> (0.5857294994675187, 0.9075907590759076, 0.43238993710691825)
bad_words_4 tolerance=0 -> (0.5881126173096975, 0.8730650154798761, 0.44339622641509435)


Unnamed: 0,Bad words set no.,Tolerance,F1,Precision,Recall
0,1,0,0.361538,0.979167,0.221698
1,2,0,0.547855,0.912088,0.391509
2,3,0,0.585729,0.907591,0.43239
3,4,0,0.588113,0.873065,0.443396


Finding more about the errors:

In [12]:
# Bad-words list used for the error analysis in this cell.
bad_words = bad_words_bgjargon_2

# Flagged comments are collected column-wise together with the token that
# triggered the match and the suggestion that was found in the bad-words list.
_POSITIVE_COLUMNS = ('Comment', 'Original Token', 'Corrected Token')
false_positives = {column: [] for column in _POSITIVE_COLUMNS}
true_positives = {column: [] for column in _POSITIVE_COLUMNS}

# Comments that were not flagged have no token/suggestion, so plain lists suffice.
false_negatives, true_negatives = [], []

def find_positives(comment):
    """Look for the first token of a comment whose suggestions contain a bad word.

    The text in ``comment[0]`` is preprocessed aggressively, and each token's
    suggestions from ``edit_suggester.ngram_all_closest_words`` are checked
    against the global ``bad_words``. No edit tolerance is passed here, so the
    suggester's own default applies.

    Returns:
        ``(True, token, suggestion)`` for the first match in token order, or
        ``(False, "", "")`` when no suggestion is a bad word.
    """
    matches = (
        (token, suggestion)
        for token in preprocess(comment[0], aggressive=True)
        for suggestion in edit_suggester.ngram_all_closest_words(token, 0.25)
        if suggestion in bad_words
    )
    first_match = next(matches, None)
    if first_match is None:
        return False, "", ""
    matched_token, matched_suggestion = first_match
    return True, matched_token, matched_suggestion

# Sort every comment of the sample into one of the four confusion-matrix buckets.
for comment in gold_comments_sample:
    positive, token, suggestion = find_positives(comment)
    labelled_p = comment[2] == 'p'
    if not positive:
        # Not flagged: only the comment itself is recorded.
        (false_negatives if labelled_p else true_negatives).append(comment)
        continue
    # Flagged: record the comment with the triggering token and the matched suggestion.
    bucket = true_positives if labelled_p else false_positives
    bucket['Comment'].append(comment)
    bucket['Original Token'].append(token)
    bucket['Corrected Token'].append(suggestion)

# Turn the collected buckets into DataFrames for inspection.
false_positives, true_positives = pd.DataFrame(false_positives), pd.DataFrame(true_positives)
false_negatives, true_negatives = pd.DataFrame(false_negatives), pd.DataFrame(true_negatives)

In [13]:
# Comments not labelled 'p' that were flagged, with the token and the matched bad-word suggestion.
false_positives

Unnamed: 0,Comment,Original Token,Corrected Token
0,[Защитникът на жените много му се иска на фина...,тъпанарките,тъпанарка
1,"[Па като нема кво да яде, да заповяда на село....",яйчица,кучица
2,"[Невероятно е,колко семпло разсъждава иначе пр...",сащсамо,сосам
3,"[Бати и вуко, Дзвер, n]",бати,бати
4,"[Хахах...ми прав е пича!, непознат, n]",хахахми,хамами
5,"[Тоз пък!\nЗа какъв се има, че квалифицира ког...",топки,топки
6,"[Да,бе,не вярвам!, Теди, n]",дабене,дъбене
7,"[Добре изглеждаща жена, с наследствена дарба -...",едиповия,дедовия
8,"[Пръъъъц!, въх,, n]",пръъъъц,пръц
9,"[Ха ха,кой в днешно време сключва брак?То по л...",хакой,хакам


In [16]:
# A slice (rows 50-59) of the comments labelled 'p' that were not flagged.
false_negatives[50:60]

Unnamed: 0,0,1,2
50,"Все се уреждат!И все им върви,пари,деца екскур...",К....,p
51,"Ба, кой ше го отвлече тоз дътгел бе.",Бай Ху,p
52,Тясна и е цепката,Дядо Поплювко,p
53,Момичето произвежда страхотна музика и всички ...,Александър Ангелов,p
54,"Под къси панталонки, се крие късо пишле..... н...",Къси панталонки,p
55,Вчера цънцата яко близа дирника на деса гробарска,Фикри,p
56,Тая пача още докато беше жив И. П. се разпаряш...,Яздач,p
57,"Гнусен педераст. Не бих се ръкувал с човек, ко...",111,p
58,КАТО ГЛЕДАМ КАКВО И КОМЕНТИРАТЕ ФЕНОВЕТЕ И МАЙ...,ХУИ ЧО,p
59,с Катето се челастрим яко докато Сашето е в Ан...,янци,p


Auto annotation

In [None]:
from preprocessing import preprocess

def has_profanity(comment: dict[str, str]) -> bool:
    """Tell whether any token of a comment is spell-corrected to a bad word.

    The text under ``comment['comment']`` is preprocessed, and every token is
    passed to ``edit_suggester.ngram_spellcheck`` with a Jaccard threshold of
    0.3. A result equal to the string ``'None'`` is ignored; any other result
    that is contained in ``bad_words`` makes the comment profane.

    NOTE(review): ``bad_words`` is read from the notebook's global state, so
    the outcome depends on which cell assigned it last.
    """
    corrections = (
        edit_suggester.ngram_spellcheck(token, jaccard_threshold=0.3)
        for token in preprocess(comment['comment'])
    )
    # Lazy evaluation: stops spell-checking at the first profane correction.
    return any(corrected != 'None' and corrected in bad_words for corrected in corrections)

# Label every loaded comment: 'p' when has_profanity() fires, 'n' otherwise.
auto_annotated_comments = []
for comment in comments:
    predicted_class = 'p' if has_profanity(comment) else 'n'
    auto_annotated_comments.append({
        'comment': comment['comment'],
        'author': comment['author'],
        'class': predicted_class
        })

In [9]:
import codecs

# Serialise the annotations as human-readable JSON (non-ASCII characters are
# kept as-is rather than escaped).
json_object = json.dumps(auto_annotated_comments, indent=4, ensure_ascii=False)
# Built-in open() with an explicit encoding replaces the legacy codecs.open(),
# matching how every other file in this notebook is opened. newline="" turns
# off newline translation, so the bytes written are the same as before.
with open("data/blitz_comments_auto_classified.json", "w", encoding="utf-8", newline="") as outfile:
    outfile.write(json_object)