In [1]:
!pip install datasets



In [2]:
!pip install pandas



In [10]:
from datasets import load_dataset
import pandas as pd

In [11]:
rawdataset = load_dataset("tdavidson/hate_speech_offensive")

In [12]:
rawdataset['train']

Dataset({
    features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet'],
    num_rows: 24783
})

**Now to convert the the rawdataset to be pandas compatible**

In [13]:
dataset = rawdataset['train'].to_pandas()

In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   count                     24783 non-null  int64 
 1   hate_speech_count         24783 non-null  int64 
 2   offensive_language_count  24783 non-null  int64 
 3   neither_count             24783 non-null  int64 
 4   class                     24783 non-null  int64 
 5   tweet                     24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


**Performing Datacleaning**

In [16]:
dataset = dataset[['tweet', 'class']]

In [17]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   24783 non-null  object
 1   class   24783 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


In [18]:
dataset = dataset.drop_duplicates(subset='tweet')

In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   24783 non-null  object
 1   class   24783 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


For removing spaces from infront or behind of the tweets

In [20]:
dataset['tweet'] = dataset['tweet'].astype(str).str.strip()

In [21]:
dataset.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


Dropping empty rows

In [22]:
dataset.dropna()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1
...,...,...
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1
24779,"you've gone and broke the wrong heart baby, an...",2
24780,young buck wanna eat!!.. dat nigguh like I ain...,1
24781,youu got wild bitches tellin you lies,1


In [28]:
dataset.head(30)

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1
5,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just...",1
6,"!!!!!!""@__BrighterDays: I can not just sit up ...",1
7,!!!!&#8220;@selfiequeenbri: cause I'm tired of...,1
8,""" &amp; you might not get ya bitch back &amp; ...",1
9,""" @rhythmixx_ :hobbies include: fighting Maria...",1


**Installing NLTK for extensive text preprocessing and tokenizing and lemetization**

In [31]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp312-cp312-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------- ----- 1.3/1.5 MB 9.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 10.0 MB/s  0:00:00
Downloading regex-2025.11.3-cp312-cp312-win_amd64.whl (277 kB)
Downloading joblib-1.5.3-py3-none-any.whl (309 kB)
Installing collected packages: regex, joblib, nltk

   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
   ------------- -------------------------- 1/3 [joblib]
 

In [47]:
import re, html
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag


nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avase\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\avase\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\avase\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avase\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\avase\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [49]:
def clean_text(raw_tweet):
    text = html.unescape(str(raw_tweet))
    text = text.lower()
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"@\w*", " ", text)      # remove usernames and @
    text = re.sub(r"http\S+", " ", text)   # to remove URLs
    text = re.sub(r"[^a-z ]", " ", text)

    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    clean_tokens = []
    for word, tag in tagged:
        if word not in stop_words and len(word) > 1:
            wn_tag = get_wordnet_pos(tag)
            root = lemmatizer.lemmatize(word, pos=wn_tag)
            clean_tokens.append(root)

    return " ".join(clean_tokens)

In [50]:
dataset['processed_text'] = dataset['tweet'].apply(clean_text)
dataset = dataset[dataset['processed_text'].str.strip() != ""]

In [53]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


**Train Test Split and TF- IDF Vectorization**

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X_text = dataset['processed_text'] 
y = dataset['class']                  
#Class 0 for Hate Speech
#Class 1 for offensive Language
#Class 2 for neither

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(X_text)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

**Using SVM model**

In [59]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

acc_svm = accuracy_score(y_test, y_pred_svm)
print("SVM accuracy:", acc_svm)
print("SVM report:\n", classification_report(y_test, y_pred_svm))
print("SVM confusion matrix:\n", confusion_matrix(y_test, y_pred_svm))

SVM accuracy: 0.9138591890256204
SVM report:
               precision    recall  f1-score   support

           0       0.60      0.16      0.26       286
           1       0.93      0.97      0.95      3838
           2       0.86      0.92      0.89       833

    accuracy                           0.91      4957
   macro avg       0.80      0.68      0.70      4957
weighted avg       0.90      0.91      0.90      4957

SVM confusion matrix:
 [[  47  213   26]
 [  27 3716   95]
 [   4   62  767]]


**Training Using Random Forest**

In [60]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_rf)
print("RF accuracy:", acc_rf)
print("RF report:\n", classification_report(y_test, y_pred_rf))
print("RF confusion matrix:\n", confusion_matrix(y_test, y_pred_rf))

RF accuracy: 0.9053863223724027
RF report:
               precision    recall  f1-score   support

           0       0.63      0.17      0.27       286
           1       0.93      0.96      0.94      3838
           2       0.83      0.90      0.86       833

    accuracy                           0.91      4957
   macro avg       0.80      0.68      0.69      4957
weighted avg       0.89      0.91      0.89      4957

RF confusion matrix:
 [[  50  208   28]
 [  25 3691  122]
 [   4   82  747]]


**HyperTuning Random Forest Classifier**

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_base = RandomForestClassifier(
    random_state=42,
    n_jobs=-1   #in order to use all the cores of my CPU
)

param_grid = {
    'n_estimators': [200, 400, 600],
    'max_depth': [None, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [66]:
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,          
    n_jobs=-1,     # for using all CPU cores
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits




Best params: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600}
Best CV accuracy: 0.9038539144471347


In [67]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

best_rf = grid_search.best_estimator_

y_pred_best_rf = best_rf.predict(X_test)

acc_best_rf = accuracy_score(y_test, y_pred_best_rf)
print("Tuned RF test accuracy:", acc_best_rf)
print("Tuned RF report:\n", classification_report(y_test, y_pred_best_rf))
print("Tuned RF confusion matrix:\n", confusion_matrix(y_test, y_pred_best_rf))

Tuned RF test accuracy: 0.9047811176114585
Tuned RF report:
               precision    recall  f1-score   support

           0       0.63      0.16      0.26       286
           1       0.93      0.96      0.94      3838
           2       0.83      0.89      0.86       833

    accuracy                           0.90      4957
   macro avg       0.80      0.67      0.69      4957
weighted avg       0.89      0.90      0.89      4957

Tuned RF confusion matrix:
 [[  47  211   28]
 [  24 3695  119]
 [   4   86  743]]


**Predictions made by model**

In [68]:
import numpy as np

label_map = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Neither"
}

def predict_with_svm(text):
    # For Cleaning the Raw text
    cleaned = clean_text(text)
    # For Vectorization
    X_vec = vectorizer.transform([cleaned])
    # Predicting Labels
    pred_label = svm_model.predict(X_vec)[0]
    # Label it to human readable code
    return pred_label, label_map[int(pred_label)]

def predict_with_rf(text):
    cleaned = clean_text(text)
    X_vec = vectorizer.transform([cleaned])
    pred_label = best_rf.predict(X_vec)[0]
    return pred_label, label_map[int(pred_label)]

In [102]:
test_text = "Fortnite is absolutely shit"

svm_code, svm_label = predict_with_svm(test_text)
rf_code, rf_label  = predict_with_rf(test_text)

print("Input text:", test_text)
print("SVM prediction:", svm_code, "->", svm_label)
print("RF prediction:", rf_code, "->", rf_label)

Input text: Fortnite is absolutely shit
SVM prediction: 1 -> Offensive Language
RF prediction: 1 -> Offensive Language
