In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Download stopwords once
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [4]:
# --- Preprocessing function ---
def preprocess(text):
    text = text.lower()
    text = re.sub(r'rt\s+', '', text)                 # remove 'RT'
    text = re.sub(r'http\S+|www\S+', ' url ', text)   # replace urls
    text = re.sub(r'@\w+', ' user ', text)            # replace mentions
    text = re.sub(r'[^a-z\s]', '', text)              # keep only letters and spaces
    tokens = [word for word in text.split() if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

In [5]:
# Load dataset
df = pd.read_csv("data/labeled_data.csv")
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [6]:
# --- Apply preprocessing ---
df['clean_tweet'] = df['tweet'].astype(str).apply(preprocess)
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,clean_tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,user woman shouldnt complain clean hous amp ma...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,user boy dat coldtyga dwn bad cuffin dat hoe s...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,user dawg user ever fuck bitch stato cri confu...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,user user look like tranni
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,user shit hear might true might faker bitch to...
...,...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,you muthafin lie user user user right tl trash...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...",youv gone broke wrong heababi drove redneck crazi
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like aint fuck...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies,youu got wild bitch tellin lie


In [7]:
# --- Features and labels ---
X = df['clean_tweet']
y = df['class']

In [8]:
# --- Bag of Words representation ---
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

In [9]:
# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)

In [10]:
# --- Train a simple classifier (Naive Bayes) ---
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [11]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# --- Evaluation ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nDetailed classification report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8961065160379261

Detailed classification report:

              precision    recall  f1-score   support

           0       0.49      0.24      0.33       290
           1       0.92      0.95      0.94      3832
           2       0.83      0.87      0.85       835

    accuracy                           0.90      4957
   macro avg       0.75      0.69      0.71      4957
weighted avg       0.88      0.90      0.89      4957



# 📊 Text Classification Results (Bag of Words + Logistic Regression)

### ✅ Overall Accuracy
- **Accuracy**: `0.8961` (~89.6%)  
- Strong overall performance, but accuracy alone doesn’t tell the full story because of class imbalance.  

---

### 🔎 Class-wise Performance

| Class | Label Description    | Precision | Recall | F1-score | Support |
|-------|----------------------|-----------|--------|----------|---------|
| **0** | Hate Speech          | 0.49      | 0.24   | 0.33     | 290     |
| **1** | Offensive Language   | 0.92      | 0.95   | 0.94     | 3832    |
| **2** | Neither (Neutral)    | 0.83      | 0.87   | 0.85     | 835     |

---

### 📝 Observations
- **Class 0 (Hate Speech)**: Weakest performance (low recall of 0.24). The model misses many hate speech examples due to class imbalance.  
- **Class 1 (Offensive Language)**: Excellent detection (precision 0.92, recall 0.95). Model strongly favors this dominant class.  
- **Class 2 (Neutral)**: Balanced performance with solid precision and recall.  

---

### 📊 Averages
- **Macro Avg (equal weight across classes)**: Precision = 0.75, Recall = 0.69, F1 = 0.71 → shows uneven performance.  
- **Weighted Avg (accounts for class size)**: Precision = 0.88, Recall = 0.90, F1 = 0.89 → skewed by the majority class.  

---

### ⚖️ Interpretation
- The model is **very reliable** for offensive and neutral tweets.  
- It is **weak at detecting hate speech**, the most sensitive but least represented class.  
- Class imbalance is the main challenge, not the model itself.  

---

### 🚀 Recommendations
1. **Handle imbalance**: Oversampling, undersampling, or SMOTE for hate speech class.  
2. **Use class weights**: Set `class_weight='balanced'` in Logistic Regression.  
3. **Improve features**: Switch from Bag-of-Words to **TF-IDF** or embeddings.  
4. **Threshold tuning**: Adjust probability cutoffs to boost recall for hate speech.  


In [12]:
# Save model and vectorizer
import pickle
import os

# Create directories if they don't exist
os.makedirs("models", exist_ok=True)
os.makedirs("vectorizer", exist_ok=True)

with open("models/logistic_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer/count_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)