In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from symspellpy import SymSpell
import pkg_resources




class HateSpeechAnalysis:
    """Load a CSV dataset and clean its text column for analysis.

    The constructor reads the CSV at ``data_path`` into ``self.data`` and builds
    a SymSpell spell checker in ``self.sym_spell``.  ``preprocess_data`` then
    subsamples the frame and runs the text column through three cleaning steps:
    HTML stripping, removal of non-letter characters, and spelling correction.
    """

    # Name of the column holding the raw text to clean.
    TEXT_COLUMN = 'Content'
    # Edit distance used both to build the SymSpell index and to look up corrections.
    MAX_EDIT_DISTANCE = 2
    # Prefix length handed to the SymSpell constructor.
    PREFIX_LENGTH = 7

    def __init__(self, data_path):
        """Read the dataset and prepare the spell checker.

        Args:
            data_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV file.
        """
        self.data = pd.read_csv(data_path)
        self.sym_spell = self.initialize_symspell()

    def initialize_symspell(self):
        """Create a SymSpell instance loaded with the bundled English dictionary.

        Returns:
            The SymSpell object.  If the dictionary cannot be loaded a message is
            printed and the (empty) instance is still returned, so construction
            never fails because of it.
        """
        sym_spell = SymSpell(
            max_dictionary_edit_distance=self.MAX_EDIT_DISTANCE,
            prefix_length=self.PREFIX_LENGTH,
        )
        dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term names in the dictionary text file
        count_index = 1   # column of the term frequencies in the dictionary text file
        if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
            print("Dictionary file not found")
        return sym_spell

    def preprocess_data(self, n_samples=3000, random_state=42):
        """Subsample the dataset and clean the text column.

        ``self.data`` is replaced by the processed frame (index reset to
        0..n-1), so calling this method again operates on the already
        processed rows rather than on the original CSV contents.

        Args:
            n_samples: Maximum number of rows to keep.  When the dataset has
                fewer rows, all of them are used instead of raising.  Pass
                ``None`` to skip sampling and keep every row in its current order.
            random_state: Seed forwarded to ``DataFrame.sample`` so the
                selection is reproducible.

        Raises:
            KeyError: If the dataset has no ``Content`` column.
        """
        column = self.TEXT_COLUMN
        if column not in self.data.columns:
            raise KeyError(
                f"Expected a '{column}' column in the dataset; found {list(self.data.columns)}"
            )

        frame = self.data
        if n_samples is not None:
            # DataFrame.sample raises when asked for more rows than exist
            # (without replacement), so cap the request at the frame length.
            n_rows = min(n_samples, len(frame))
            frame = frame.sample(n=n_rows, random_state=random_state)
        frame = frame.reset_index(drop=True)

        cleaned = frame[column]
        # Order matters: markup is stripped before non-letters are removed,
        # and spelling is corrected last on the letters-only text.
        for step in (
            self.remove_html_tags,
            self.remove_special_chars_and_digits,
            self.correct_spellings,
        ):
            cleaned = cleaned.apply(step)
        frame[column] = cleaned
        self.data = frame

    @staticmethod
    def _as_text(value):
        """Coerce a cell value to ``str``; missing values become ``''``."""
        if isinstance(value, str):
            return value
        try:
            missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # Array-like values have no single truth value; treat as present.
            missing = False
        return '' if missing else str(value)

    def remove_html_tags(self, text):
        """Return ``text`` with HTML markup removed, keeping only the text nodes."""
        soup = BeautifulSoup(self._as_text(text), 'html.parser')
        return soup.get_text()

    def remove_special_chars_and_digits(self, text):
        """Drop every character that is not an ASCII letter or whitespace."""
        return re.sub(r'[^A-Za-z\s]', '', self._as_text(text))

    def correct_spellings(self, text):
        """Return SymSpell's top compound suggestion for ``text``.

        Falls back to the (string-coerced) input when no suggestion is returned.
        """
        text = self._as_text(text)
        suggestions = self.sym_spell.lookup_compound(
            text, max_edit_distance=self.MAX_EDIT_DISTANCE
        )
        if suggestions:
            return suggestions[0].term
        return text
    
    

In [141]:

if __name__ == "__main__":
    # Build the analysis object: the constructor reads the CSV into
    # `analysis.data` and initialises the spell checker.
    analysis = HateSpeechAnalysis(
        data_path='HateSpeechDatasetBalanced.csv',
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [126]:
# Preprocessing: subsample the dataset and clean the 'Content' column
# (HTML stripping, non-letter removal, spelling correction).
# The processed frame replaces `analysis.data`, so re-running this cell
# works on the already processed rows, not on the original CSV contents.
analysis.preprocess_data()

    