In [1]:
# Machine Learning Packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import resample

# Natural Language Processing (NLP) Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import re

# Visualization Libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from IPython.display import display_html
from IPython.display import display, HTML
from sklearn.tree import plot_tree
import plotly.express as px

# Data Manipulation and Processing
import pandas as pd
import numpy as np
from collections import Counter
from tabulate import tabulate
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import time
import sys
import os
import pickle
from pympler import asizeof
from imblearn.pipeline import Pipeline
from scipy.sparse import csr_matrix

from sklearn.exceptions import ConvergenceWarning
import warnings
pd.options.mode.chained_assignment = None


In [2]:
df = pd.read_csv('train.csv')

class_labels = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Neither'}
class_counts = df['class'].value_counts()
df['labels'] = df['class'].map(class_labels)

df.isnull().sum()

df['class'] = df['class'].apply(lambda x: 0 if x in [0, 1] else 1)

class_labels = {0: 'negative', 1: 'positive'}
df['labels'] = df['class'].map(class_labels)

stop_words = stopwords.words('english')
stop_words.remove('not')  # Keeping 'not' as it is important in negation (Can change the meaning of the sentence)

def clean(text):
    # Remove 'RT' (Retweet indicator)
    text = re.sub(r'\bRT\b', '', text)
    # Remove URLs 
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove Twitter handles 
    text = re.sub(r'@\w+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    words = text.split()
    # Removing the stop words
    words = [word for word in words if word not in stop_words]
    # Rejoin the words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text

df.loc[:, 'cleaned_tweet'] = df['tweet'].apply(clean)

tweet_length = df['cleaned_tweet'].apply(len)
df['tweet_length'] = df['cleaned_tweet'].apply(len)
Q1 = tweet_length.quantile(0.25)
Q3 = tweet_length.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(tweet_length >= lower_bound) & (tweet_length <= upper_bound)]

df.drop(['tweet', 'tweet_length'], axis=1, inplace=True)

duplicate_rows = df[df.duplicated()]

df = df.drop_duplicates()
df = df.reset_index(drop=True)
df = df.dropna()

df.to_csv('cleaned_dataset.csv', index=False)

In [3]:
lemmatizer = WordNetLemmatizer()

nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('vader_lexicon')

def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(token))
        lemmatized_tokens.append(token)  
        if lemma != token:
            lemmatized_tokens.append(lemma)  
    return " ".join(lemmatized_tokens)

final_df = pd.read_csv('cleaned_dataset.csv')
final_df = final_df.dropna()
final_df['cleaned_tweet'] = final_df['cleaned_tweet'].apply(lemmatize_text) 

df_majority=final_df[(final_df['class']==0)] 
df_minority=final_df[(final_df['class']==1)] 
df_majority_undersample=resample(df_majority, replace=False, n_samples=df_minority.shape[0], random_state=42)
df_balanced = pd.concat([df_majority_undersample, df_minority])

sia = SentimentIntensityAnalyzer()
def get_sentiment(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

sia_booster = sia.constants.BOOSTER_DICT
if 'fucking' in sia_booster:
    sia_booster.pop('fucking')
    print("Word 'fucking' DELETED from VADER")
sia.lexicon['fucking'] = -2.0

df_balanced['sentiment_score'] = df_balanced['cleaned_tweet'].apply(get_sentiment)
df_balanced.drop(['count', 'hate_speech_count', 'offensive_language_count', 'neither_count'], axis=1, inplace=True)
df_balanced = df_balanced[df_balanced['cleaned_tweet'].str.strip() != '']
df_balanced.to_csv('train_balanced.csv', index=False)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/rajvarun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rajvarun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rajvarun/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Word 'fucking' DELETED from VADER


In [4]:
def adjust_word_scores(sentence):
    words = sentence.lower().split()
    sentence_score = sia.polarity_scores(sentence)['compound']
    word_scores = [sia.polarity_scores(w)['compound'] for w in words]
    
    adjusted_scores = []
    for word_score in word_scores:
        if sentence_score > 0:
            if word_score > 0:
                boosted = word_score * 3
                if boosted < 0.1: 
                    boosted = 0.1
                adjusted_scores.append(boosted)
            elif word_score < 0:
                more_neg = word_score * 1  
                adjusted_scores.append(more_neg)
            else:
                adjusted_scores.append(0.0)
        else:
            if word_score < 0:
                more_neg = word_score * 3 
                if more_neg > -0.1: 
                    more_neg = -0.1
                adjusted_scores.append(more_neg)
            elif word_score > 0:
                suppressed = word_score * 0.2
                adjusted_scores.append(suppressed)
            else:
                adjusted_scores.append(0.0)
                
    return np.array(adjusted_scores) if len(adjusted_scores) > 0 else np.array([0.0])


df_balanced = pd.read_csv("train_balanced.csv")
df_balanced['adjusted_word_scores'] = df_balanced['cleaned_tweet'].apply(adjust_word_scores)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_balanced['cleaned_tweet'])
X = csr_matrix(X)

X_fused = X.copy()
feature_names = vectorizer.get_feature_names_out().tolist()
row_indices, col_indices = X.nonzero()

for row, col in zip(row_indices, col_indices):
    word = feature_names[col]
    words_in_sentence = df_balanced['cleaned_tweet'].iloc[row].lower().split()
    if word in words_in_sentence:
        word_idx = words_in_sentence.index(word)
        try:
            score = df_balanced['adjusted_word_scores'].iloc[row][word_idx]
            X_fused[row, col] *= (1 + score)
        except IndexError:
            pass

y = df_balanced['class']
X_train, X_test, y_train, y_test = train_test_split(X_fused, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split= 2, n_estimators= 300)
model.fit(X_train, y_train)

print("Train Accuracy:", model.score(X_train, y_train))
print("Test Accuracy:", model.score(X_test, y_test))

test_texts = ["He is so fucking annoying", "I am so fucking happy for you"]
df_test = pd.DataFrame({"text": test_texts})
df_test["adjusted_word_scores"] = df_test["text"].apply(adjust_word_scores)

X_test_vec = vectorizer.transform(df_test["text"])
X_test_vec = csr_matrix(X_test_vec)
X_test_fused = X_test_vec.copy()

row_indices, col_indices = X_test_vec.nonzero()
for row, col in zip(row_indices, col_indices):
    word = feature_names[col]
    words_in_sentence = df_test["text"].iloc[row].lower().split()
    if word in words_in_sentence:
        idx = words_in_sentence.index(word)
        try:
            score = df_test['adjusted_word_scores'].iloc[row][idx]
            X_test_fused[row, col] *= (1 + score)
        except IndexError:
            pass

preds = model.predict(X_test_fused)
print("Predictions:", preds)

Train Accuracy: 0.9995378927911276
Test Accuracy: 0.9532019704433498
Predictions: [0 1]
