In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from symspellpy import SymSpell
import pkg_resources
from nltk.corpus import stopwords
import nltk
from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

# Fetch the NLTK resources this notebook relies on (no-op when already present):
# the stop-word lists, the punkt tokenizer models and the WordNet data.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)
del _nltk_resource  # do not leave the loop variable behind in the notebook namespace

# Module-level NLTK helpers shared by the text-normalisation code below
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

class HateSpeechAnalysis:
    """Topic-modelling pipeline for a hate-speech text corpus.

    The class reads a CSV file that must contain a ``Content`` column, cleans
    that text (HTML stripping, removal of every non-letter character, SymSpell
    compound spelling correction), fits a BERTopic model on a normalised
    version of the text, and can look up the topic of a single new sentence
    using the model previously saved to ``MODEL_PATH``.
    """

    # Path used both when saving the fitted model and when loading it again.
    MODEL_PATH = "topic-model-for-hate-speech-correct"
    # Label returned when a predicted topic id is absent from the topic table.
    # The spelling is kept unchanged so existing consumers keep matching it.
    UNKNOWN_TOPIC = "Unknow_Topic"
    # Matches any single ASCII punctuation character; compiled once for reuse.
    _PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")

    def __init__(self, data_path):
        """Load the dataset and build the helper objects.

        Args:
            data_path: Path (or any source accepted by ``pd.read_csv``) of the
                CSV file holding the corpus.
        """
        self.data = pd.read_csv(data_path)
        # "english" (lowercase) is the spelling BERTopic documents for this option.
        self.topic_model = BERTopic(language="english")
        self.stop_words = set(stopwords.words('english'))
        self.sym_spell = self.initialize_symspell()

    def preprocess_test_sentence(self, text, use_spacy=True, use_lemmatization=True, use_stemming=True):
        """Normalise ``text`` before it is handed to the topic model.

        Steps: lower-casing, removal of ASCII punctuation, NLTK word
        tokenisation, stop-word removal, then optional stemming and optional
        lemmatisation (in that order).

        Args:
            text: The string to normalise.
            use_spacy: Accepted for backward compatibility; it is not used.
            use_lemmatization: Apply the WordNet lemmatizer to every token.
            use_stemming: Apply the Porter stemmer to every token.

        Returns:
            The remaining tokens joined with single spaces.
        """
        text = self._PUNCTUATION_RE.sub('', text.lower())
        tokens = word_tokenize(text)

        # Use the instance's stop-word set (built in __init__) rather than a
        # module-level global, so the method only depends on the object itself.
        tokens = [word for word in tokens if word not in self.stop_words]

        # NOTE(review): stemming runs before lemmatisation, so the lemmatizer
        # mostly sees word stems. The order is kept because a model saved with
        # this preprocessing expects the same token forms at prediction time.
        if use_stemming:
            tokens = [stemmer.stem(word) for word in tokens]

        if use_lemmatization:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

        return ' '.join(tokens)

    def initialize_symspell(self):
        """Create a SymSpell instance loaded with the bundled English dictionary.

        Returns:
            The ``SymSpell`` object. If the dictionary cannot be loaded a
            message is printed and the (empty) instance is still returned.
        """
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # NOTE(review): pkg_resources is being phased out upstream; consider
        # importlib.resources once the minimum Python version is known.
        dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
        term_index = 0   # column of the term names in the dictionary text file
        count_index = 1  # column of the term frequencies in the dictionary text file
        if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
            print("Dictionary file not found")
        return sym_spell

    def preprocess_data(self, n_samples=3000, random_state=42):
        """Sample the dataset and clean its ``Content`` column.

        ``self.data`` is replaced by the sampled, re-indexed frame whose
        ``Content`` values have had HTML removed, non-letter characters
        removed and spelling corrected.

        Args:
            n_samples: Number of rows to keep. Values larger than the dataset
                (or ``None``) keep every row instead of raising. The default
                matches the previously hard-coded sample size.
            random_state: Seed passed to ``DataFrame.sample``.
        """
        population = len(self.data)
        if n_samples is None or n_samples > population:
            n_samples = population

        sampled = self.data.sample(n=n_samples, random_state=random_state).reset_index(drop=True)

        # Missing or non-string cells would make the text helpers raise, so
        # normalise the column to plain strings first.
        content = sampled['Content'].fillna('').astype(str)
        for step in (self.remove_html_tags,
                     self.remove_special_chars_and_digits,
                     self.correct_spellings):
            content = content.apply(step)

        sampled['Content'] = content
        self.data = sampled

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with HTML markup stripped."""
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text()

    def remove_special_chars_and_digits(self, text):
        """Drop every character that is neither an ASCII letter nor whitespace."""
        return re.sub(r'[^A-Za-z\s]', '', text)

    def correct_spellings(self, text):
        """Return SymSpell's best compound correction of ``text``.

        Falls back to the unchanged input when no suggestion is produced.
        """
        suggestions = self.sym_spell.lookup_compound(text, max_edit_distance=2)
        if suggestions:
            return suggestions[0].term
        return text

    def extract_topic_corrected(self):
        """Fit the topic model on the dataset and save it to ``MODEL_PATH``.

        Returns:
            A ``(topics, topic_descriptions)`` tuple: the topic id assigned to
            every row of ``self.data`` and the model's topic-info table.
        """
        sentences = self.data['Content'].apply(self.preprocess_test_sentence).tolist()

        # fit_transform returns the assignments produced while fitting, so the
        # documents are not embedded and predicted a second time.
        topics, _probs = self.topic_model.fit_transform(sentences)
        self.topic_model.save(self.MODEL_PATH)

        # The fitted model already holds the topic table; no reload is needed.
        topic_descriptions = self.topic_model.get_topic_info()
        return topics, topic_descriptions

    def topic_append_with_content(self, topics, topic_desc):
        """Attach topic names to the dataset and print the combined text.

        Adds two columns to ``self.data``: ``TopicText`` (the topic name of
        each row) and ``CombinedText`` (``Content`` followed by a space and the
        topic name).

        Args:
            topics: One topic id per row of ``self.data``.
            topic_desc: Table with ``Topic`` and ``Name`` columns.

        Raises:
            KeyError: If a topic id is not listed in ``topic_desc``.
        """
        topic_mapping = dict(zip(topic_desc['Topic'], topic_desc['Name']))
        self.data['TopicText'] = [topic_mapping[topic] for topic in topics]
        self.data['CombinedText'] = self.data['Content'] + " " + self.data['TopicText']

        # Widen the column display only for this print instead of changing the
        # pandas option for the rest of the session.
        with pd.option_context('display.max_colwidth', 200):
            print(self.data['CombinedText'])

    def extract_sentence_topic(self, sentence):
        """Return the topic name predicted for a single sentence.

        The model is loaded from ``MODEL_PATH``, so a model must have been
        saved there beforehand (it does not have to be fitted in this session).

        Args:
            sentence: Raw sentence text.

        Returns:
            The topic name, or ``UNKNOWN_TOPIC`` when the predicted id is not
            present in the model's topic table.
        """
        processed = self.preprocess_test_sentence(sentence)
        topic_model = BERTopic.load(self.MODEL_PATH)
        topics, _probs = topic_model.transform([processed])

        topic_descriptions = topic_model.get_topic_info()
        topic_mapping = dict(zip(topic_descriptions['Topic'], topic_descriptions['Name']))
        return topic_mapping.get(topics[0], self.UNKNOWN_TOPIC)

    def preprocess_sentence(self, sentence):
        """Apply the dataset cleaning steps to a single sentence.

        Runs HTML removal, non-letter removal and spelling correction, in that
        order, and returns the cleaned string.
        """
        text = self.remove_html_tags(sentence)
        text = self.remove_special_chars_and_digits(text)
        return self.correct_spellings(text)

    
    
    




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luvkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luvkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luvkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:

# Build the pipeline object; its constructor reads the CSV into a DataFrame
# and prepares the topic model and the spell checker.
if __name__ == "__main__":
    analysis = HateSpeechAnalysis('HateSpeechDatasetBalanced.csv')


In [126]:

# Preprocessing: replaces `analysis.data` with a random sample whose `Content`
# column has had HTML and non-letter characters removed and spelling corrected.
analysis.preprocess_data()

    

In [14]:

# Fit and save the topic model, then add each row's topic name to the dataset
# (this also prints the combined text column).
topics, topic_desc = analysis.extract_topic_corrected()
analysis.topic_append_with_content(topics, topic_desc)




0                                                                    you should be deeply embarrassed by not fully recognizing how goddamn ignorant you sound and probably you are 0_articl_wikipedia_slut_like
1                                                                                                                        do not make me make you fall in love with a bitch like me 0_articl_wikipedia_slut_like
2                                                         trump america is anti immigrant sexual activity worker trans people insecure for tending hippo shift noah susa aids oral 0_articl_wikipedia_slut_like
3                                                                                                        you guys are clearly a pole smoker please get run me over by a dump truck 0_articl_wikipedia_slut_like
4       of come along the only reason people like strike complain is to get people like me banned so she and his barbara slime can run amok with their barbara the descr

In [4]:
# Look up the topic of one unseen sentence using the model saved on disk.
sentence = "Please shut the door"
topic = analysis.extract_sentence_topic(sentence)
print(topic)

0_articl_wikipedia_slut_like
