In [42]:
from symspell import SymSpell

import json
from nltk.tokenize import regexp_tokenize
import random
import pandas as pd

Making the symspell dictionary

In [2]:
with open('data/articles_cleaned_vocabulary.json', 'r', encoding="utf-8") as f:
    vocabulary = set([d[0] for d in json.load(f)])

pattern = r'([0-9]+|[а-я]+)'
with open('data/bgjargon_words.json', 'r', encoding="utf-8") as f:
    vocabulary.union(set([token for word in json.load(f) for token in regexp_tokenize(word, pattern)]))


Loading the bad words

In [54]:
with open('data/bad_words_translated.json', 'r', encoding="utf-8") as f:
    bad_words_original = set(json.load(f))

with open('data/bad_words_1.json', 'r', encoding="utf-8") as f:
    bad_words_bgjargon_1 = set(json.load(f))

with open('data/bad_words_2.json', 'r', encoding="utf-8") as f:
    bad_words_bgjargon_2 = set(json.load(f))

with open('data/bad_words_3.json', 'r', encoding="utf-8") as f:
    bad_words_bgjargon_3 = set(json.load(f))

Loading the gold comments

In [40]:
with open('data/blitz_comments_classified.json', 'r', encoding="utf-8") as f:
    gold_comments = [comment_record for comment_record in json.load(f) if len(comment_record) > 2]

gold_comments_p = [c for c in gold_comments if c[2] == 'p']
gold_comments_n = [c for c in gold_comments if c[2] == 'n']

min_len = min(len(gold_comments_p), len(gold_comments_n))

gold_comments_sample = gold_comments_p[:min_len] + gold_comments_n[:min_len]
random.shuffle(gold_comments_sample)

# Classification after correction with symspell

In [56]:
def score_of_word_similarity_annotation_with_symspell_suggestions(max_edit_distance, weight_of_bad, bad_words) -> tuple[float, float]:
    ss = SymSpell(max_dictionary_edit_distance=max_edit_distance)
    for word in vocabulary:
        ss._create_dictionary_entry(word, 1)
    if weight_of_bad:
        for word in bad_words:
            ss._create_dictionary_entry(word, 1)

    def auto_classification_as_p(comment) -> bool:
        for token in ss.lookup_compound(comment[0], max_edit_distance)[0].term.split(" "):
            if token in bad_words:
                return True
        return False

    tp, fn, fp = 0, 0, 0
    for comment in gold_comments_sample:
        if auto_classification_as_p(comment):
            if comment[2] == 'p':
                tp += 1
            else:
                fp += 1
        else:
            if comment[2] == 'p':
                fn += 1
    return tp/(tp + fp), tp/(tp + fn)

In [57]:
max_edit_distance = [1, 2, 3]
weight_of_bad_word = [True]
bad_words_lists = [bad_words_original, bad_words_bgjargon_1, bad_words_bgjargon_2, bad_words_bgjargon_3]
results = {
    "Bad words set no.": [],
    "Max dictionary edit distance": [],
    "More weight of bad words": [],
    "F1": [],
    "Precision": [],
    "Recall": []
}

for w in weight_of_bad_word:
    for e in max_edit_distance:
        for i, b in enumerate(bad_words_lists):
            precision, recall = score_of_word_similarity_annotation_with_symspell_suggestions(e, w, b)
            f1 = 2*precision*recall/(precision + recall)

            print(f"bad words set={i+1} max edit distance={e} weight={w} -> {f1, precision, recall}")
            results["Bad words set no."].append(i+1)
            results["Max dictionary edit distance"].append(e)
            results["More weight of bad words"].append(w)
            results["F1"].append(f1)
            results["Precision"].append(precision)
            results["Recall"].append(recall)
results = pd.DataFrame(results)
results

bad words set=1 max edit distance=1 weight=True -> (0.4352078239608802, 0.967391304347826, 0.2807570977917981)
bad words set=2 max edit distance=1 weight=True -> (0.5427927927927928, 0.9488188976377953, 0.3801261829652997)
bad words set=3 max edit distance=1 weight=True -> (0.5764192139737991, 0.9361702127659575, 0.416403785488959)
bad words set=4 max edit distance=1 weight=True -> (0.575107296137339, 0.8993288590604027, 0.4227129337539432)
bad words set=1 max edit distance=2 weight=True -> (0.4357405140758873, 0.9726775956284153, 0.2807570977917981)
bad words set=2 max edit distance=2 weight=True -> (0.5614035087719299, 0.920863309352518, 0.4037854889589905)
bad words set=3 max edit distance=2 weight=True -> (0.5944798301486198, 0.9090909090909091, 0.4416403785488959)
bad words set=4 max edit distance=2 weight=True -> (0.5898234683281411, 0.8632218844984803, 0.4479495268138801)
bad words set=1 max edit distance=3 weight=True -> (0.45622119815668205, 0.8461538461538461, 0.3123028391167

Unnamed: 0,Bad words set no.,Max dictionary edit distance,More weight of bad words,F1,Precision,Recall
0,1,1,True,0.435208,0.967391,0.280757
1,2,1,True,0.542793,0.948819,0.380126
2,3,1,True,0.576419,0.93617,0.416404
3,4,1,True,0.575107,0.899329,0.422713
4,1,2,True,0.435741,0.972678,0.280757
5,2,2,True,0.561404,0.920863,0.403785
6,3,2,True,0.59448,0.909091,0.44164
7,4,2,True,0.589823,0.863222,0.44795
8,1,3,True,0.456221,0.846154,0.312303
9,2,3,True,0.562105,0.844937,0.421136
