## Step 1: Load and Explore the Dataset

First, let's load the dataset and see what we're working with.

In [1]:
import pandas as pd
import numpy as np

# Set display options
pd.set_option('display.max_columns', None)

# Load the dataset
df = pd.read_csv('../data/hate_speech_detection.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()

Dataset Shape: (24783, 7)

First few rows:


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [2]:
# Dataset Overview
print("Dataset Shape:", df.shape)
print("\nColumn Names:", df.columns.tolist())
print("\nData Types:")
print(df.dtypes)
print("\nMissing Values:")
print(df.isnull().sum())
print("\nClass Distribution:")
print(df['class'].value_counts())
print("\nClass Labels:")
print("0: Hate Speech")
print("1: Offensive Language")
print("2: Neither")

Dataset Shape: (24783, 7)

Column Names: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet']

Data Types:
Unnamed: 0             int64
count                  int64
hate_speech            int64
offensive_language     int64
neither                int64
class                  int64
tweet                 object
dtype: object

Missing Values:
Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

Class Distribution:
class
1    19190
2     4163
0     1430
Name: count, dtype: int64

Class Labels:
0: Hate Speech
1: Offensive Language
2: Neither


In [3]:
# Sample tweets from each class
print("=== HATE SPEECH EXAMPLES (Class 0) ===")
hate_speech_samples = df[df['class'] == 0].head(3)
for idx, row in hate_speech_samples.iterrows():
    print(f"\n{row['tweet']}")

print("\n\n=== OFFENSIVE LANGUAGE EXAMPLES (Class 1) ===")
offensive_samples = df[df['class'] == 1].head(3)
for idx, row in offensive_samples.iterrows():
    print(f"\n{row['tweet']}")

print("\n\n=== NEITHER EXAMPLES (Class 2) ===")
neither_samples = df[df['class'] == 2].head(3)
for idx, row in neither_samples.iterrows():
    print(f"\n{row['tweet']}")

=== HATE SPEECH EXAMPLES (Class 0) ===

"@Blackman38Tide: @WhaleLookyHere @HowdyDowdy11 queer" gaywad

"@CB_Baby24: @white_thunduh alsarabsss" hes a beaner smh you can tell hes a mexican

"@DevilGrimz: @VigxRArts you're fucking gay, blacklisted hoe" Holding out for #TehGodClan anyway http://t.co/xUCcwoetmn


=== OFFENSIVE LANGUAGE EXAMPLES (Class 1) ===

!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!

!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit

!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny


=== NEITHER EXAMPLES (Class 2) ===

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...

" momma said no pussy cats inside my doghouse "

"@Addicted2Guys: -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF" woof woof hot scally lad


In [5]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\subis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [6]:
# Text Preprocessing
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def preprocess_text(text):
    """
    Clean and preprocess tweet text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing
print("Preprocessing tweets...")
df['cleaned_tweet'] = df['tweet'].apply(preprocess_text)
print("Preprocessing complete!")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\subis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\subis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\subis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\subis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Preprocessing tweets...
Preprocessing complete!


In [7]:
# Check preprocessing results
print("Original vs Cleaned Tweets:\n")
for i in range(5):
    print(f"Original: {df['tweet'].iloc[i]}")
    print(f"Cleaned:  {df['cleaned_tweet'].iloc[i]}")
    print("-" * 80)

Original vs Cleaned Tweets:

Original: !!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
Cleaned:  rt woman shouldnt complain cleaning house amp man always take trash
--------------------------------------------------------------------------------
Original: !!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
Cleaned:  rt boy dat coldtyga dwn bad cuffin dat hoe st place
--------------------------------------------------------------------------------
Original: !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
Cleaned:  rt dawg rt ever fuck bitch start cry confused shit
--------------------------------------------------------------------------------
Original: !!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
Cleaned:  rt look like tranny
----------------------------------------------

In [9]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split

X = df['cleaned_tweet']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"\nClass distribution in training set:")
print(y_train.value_counts())

Training set size: 19826
Test set size: 4957

Class distribution in training set:
class
1    15352
2     3330
0     1144
Name: count, dtype: int64


In [10]:
# Feature Extraction using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,
    min_df=5,
    max_df=0.7,
    ngram_range=(1, 2)
)

# Fit and transform training data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Number of features: {len(tfidf.get_feature_names_out())}")

TF-IDF matrix shape: (19826, 5000)
Number of features: 5000


In [11]:
# Train Logistic Regression with class weights to handle imbalance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Train model with class weights to handle imbalance
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42
)

print("Training model...")
model.fit(X_train_tfidf, y_train)
print("Training complete!")

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate model
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive', 'Neither']))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

Training model...
Training complete!

=== Classification Report ===
              precision    recall  f1-score   support

 Hate Speech       0.31      0.61      0.41       286
   Offensive       0.97      0.85      0.90      3838
     Neither       0.75      0.95      0.84       833

    accuracy                           0.85      4957
   macro avg       0.68      0.80      0.72      4957
weighted avg       0.89      0.85      0.86      4957


=== Confusion Matrix ===
[[ 174   82   30]
 [ 366 3244  228]
 [  16   28  789]]


In [12]:
# Test the saved models by loading them
import pickle

# Load the model
with open('../src/hate_speech_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Load the vectorizer
with open('../src/tfidf_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# Test with a sample text
test_text = "you are amazing and wonderful"
test_cleaned = preprocess_text(test_text)
test_vectorized = loaded_vectorizer.transform([test_cleaned])
prediction = loaded_model.predict(test_vectorized)[0]

class_names = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Neither'}
print(f"Test text: '{test_text}'")
print(f"Prediction: {class_names[prediction]}")
print("\nModels saved and tested successfully!")

FileNotFoundError: [Errno 2] No such file or directory: '../models/hate_speech_model.pkl'