## Step 1: Load and Explore the Dataset

First, let's load the dataset and see what we're working with.

In [2]:
import pandas as pd
import numpy as np

# Never truncate columns when a wide frame is rendered
pd.options.display.max_columns = None

# Read the labelled tweets into a DataFrame
df = pd.read_csv('../data/hate_speech_detection.csv')

# Report the dimensions, then preview the first rows as the cell output
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
df.head()

Dataset Shape: (24783, 7)

First few rows:


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
# Dataset overview: dimensions, schema, completeness and label balance
print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names: {list(df.columns)}")

# Each summary is printed as a header line followed by the pandas object
print("\nData Types:", df.dtypes, sep="\n")
print("\nMissing Values:", df.isna().sum(), sep="\n")
print("\nClass Distribution:", df['class'].value_counts(), sep="\n")

# Key for the integer codes stored in the 'class' column
print(
    "\nClass Labels:",
    "0: Hate Speech",
    "1: Offensive Language",
    "2: Neither",
    sep="\n",
)

Dataset Shape: (24783, 7)

Column Names: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet']

Data Types:
Unnamed: 0             int64
count                  int64
hate_speech            int64
offensive_language     int64
neither                int64
class                  int64
tweet                 object
dtype: object

Missing Values:
Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

Class Distribution:
class
1    19190
2     4163
0     1430
Name: count, dtype: int64

Class Labels:
0: Hate Speech
1: Offensive Language
2: Neither


In [4]:
# Show the first three tweets of every class
def _show_tweets(samples):
    """Print each tweet in ``samples``, each preceded by a blank line."""
    for tweet in samples['tweet']:
        print(f"\n{tweet}")


print("=== HATE SPEECH EXAMPLES (Class 0) ===")
hate_speech_samples = df.loc[df['class'] == 0].head(3)
_show_tweets(hate_speech_samples)

print("\n\n=== OFFENSIVE LANGUAGE EXAMPLES (Class 1) ===")
offensive_samples = df.loc[df['class'] == 1].head(3)
_show_tweets(offensive_samples)

print("\n\n=== NEITHER EXAMPLES (Class 2) ===")
neither_samples = df.loc[df['class'] == 2].head(3)
_show_tweets(neither_samples)

=== HATE SPEECH EXAMPLES (Class 0) ===

"@Blackman38Tide: @WhaleLookyHere @HowdyDowdy11 queer" gaywad

"@CB_Baby24: @white_thunduh alsarabsss" hes a beaner smh you can tell hes a mexican

"@DevilGrimz: @VigxRArts you're fucking gay, blacklisted hoe" Holding out for #TehGodClan anyway http://t.co/xUCcwoetmn


=== OFFENSIVE LANGUAGE EXAMPLES (Class 1) ===

!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!

!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit

!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny


=== NEITHER EXAMPLES (Class 2) ===

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...

" momma said no pussy cats inside my doghouse "

"@Addicted2Guys: -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF" woof woof hot scally lad


In [5]:
# Text Preprocessing
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK data.
# The `nltk` package itself must already be installed in the kernel's
# environment; the imports in this cell raise ModuleNotFoundError otherwise.
import nltk

# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; 'punkt' is kept for older releases that still read the legacy files.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Patterns are compiled once here instead of being looked up on every call.
# 'www\.' requires the dot so elongated words such as "awwww" are not mistaken
# for URLs; 'http\S+' already covers https links.
_URL_RE = re.compile(r'http\S+|www\.\S+')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')  # input is lowercased before this runs
_WHITESPACE_RE = re.compile(r'\s+')

# Populated on first use so the stopword list is read and the lemmatizer is
# built once, not once per tweet.
_STOP_WORDS = None
_LEMMATIZER = None


def _nltk_resources():
    """Return the shared (stopword set, lemmatizer) pair, creating it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """
    Clean and preprocess tweet text.

    Steps, in order: decode HTML entities, lowercase, strip URLs, strip
    @mentions and #hashtags (the tag text is removed too), drop every
    character that is not a letter or whitespace, collapse whitespace,
    tokenize, remove English stopwords, lemmatize.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or NaN from a
        missing cell) is treated as having no text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; '' when
        nothing is left after cleaning.
    """
    import html  # stdlib; local import keeps this block self-contained

    # Missing values have nothing to clean; avoid AttributeError on .lower()
    if not isinstance(text, str):
        return ''

    # Decode entities first so '&amp;' becomes '&' (then dropped as a
    # non-letter) instead of leaking a spurious 'amp' token.
    text = html.unescape(text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = _URL_RE.sub('', text)

    # Remove user mentions and hashtags
    text = _MENTION_HASHTAG_RE.sub('', text)

    # Remove special characters and numbers
    text = _NON_LETTER_RE.sub('', text)

    # Remove extra whitespace
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Nothing left to tokenize: same result as tokenizing an empty string
    if not text:
        return ''

    stop_words, lemmatizer = _nltk_resources()

    # Tokenize, drop stopwords, lemmatize.
    # NOTE(review): apostrophes were stripped above, so stopword entries that
    # contain an apostrophe presumably never match here - confirm intended.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    # Join tokens back to string
    return ' '.join(tokens)

# Clean every tweet and store the result next to the raw text
print("Preprocessing tweets...")
df['cleaned_tweet'] = df['tweet'].map(preprocess_text)
print("Preprocessing complete!")

ModuleNotFoundError: No module named 'nltk'