<a href="https://colab.research.google.com/github/Roy-Oindrila/Hate_speech_detection/blob/main/Hate_Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ====================================
# STEP 1 — SETUP KAGGLE API
# ====================================
# Colab-only step: `files.upload()` prompts for a file and saves it under its
# original name in the current working directory.
from google.colab import files
print("Please upload kaggle.json from  Kaggle account...")
files.upload()  # Upload kaggle.json here; the `mv` below expects exactly this filename

# Move the uploaded credentials into ~/.kaggle and restrict the file to
# owner read/write (mode 600).
# NOTE(review): presumably ~/.kaggle/kaggle.json is where the Kaggle CLI looks
# for credentials — confirm against the Kaggle API docs.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# ====================================
# STEP 2 — DOWNLOAD DATASET
# ====================================
# Fetch the dataset archive with the Kaggle CLI, then extract it into the
# working directory. `unzip -o` overwrites existing files without prompting,
# which keeps the cell re-runnable.
!kaggle datasets download -d arkhoshghalb/twitter-sentiment-analysis-hatred-speech
!unzip -o twitter-sentiment-analysis-hatred-speech.zip

# ====================================
# STEP 3 — IMPORTS
# ====================================
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle
import os

# Print the installed scikit-learn version so it is recorded in the cell output.
# NOTE(review): presumably logged because the model and vectorizer are pickled
# in STEP 10 and should be unpickled with a matching version — confirm.
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")

# ====================================
# STEP 4 — LOAD DATA
# ====================================
# Read the labelled training file from the working directory.
df = pd.read_csv('train.csv')

# Quick overview: overall size and how many rows fall under each label value.
print("Dataset shape:", df.shape)
print("Label distribution:\n", df['label'].value_counts())

# A bare `df.head()` that is not the last expression of a cell is evaluated
# and silently discarded, so render the preview explicitly. `display` is the
# built-in provided by the IPython/Colab kernel.
display(df.head())

# ====================================
# STEP 5 — CLEANING FUNCTION
# ====================================
def clean_text(text):
    """Normalize a raw tweet for TF-IDF vectorization.

    Steps, in order: lowercase, drop URLs, drop @mentions and the '#' symbol
    (the hashtag word itself is kept), drop punctuation, drop digits.
    Whitespace is left untouched, so removed tokens can leave double spaces.

    Args:
        text: The raw tweet. Missing values (None or float NaN) are treated
            as an empty tweet; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned, lowercased string ('' for missing input).
    """
    if not isinstance(text, str):
        # pandas represents missing cells as None/NaN; calling .lower() on
        # those would raise AttributeError and abort the whole .apply().
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # 'http\S+' already covers 'https...', so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)  # remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # remove @mentions and the '#' symbol only
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation (underscore counts as \w and stays)
    text = re.sub(r'\d+', '', text)  # remove digits
    return text

# Run every raw tweet through clean_text and store the result in a new column,
# leaving the original 'tweet' column untouched.
df['clean_tweet'] = df['tweet'].map(clean_text)

# ====================================
# STEP 6 — TRAIN-TEST SPLIT
# ====================================
# Features are the cleaned tweets; the target is the binary label column.
X = df['clean_tweet']
# Label 1 is the rare class (about 7% of rows in the label counts printed above).
# Per the Kaggle dataset description, 1 = hate speech (racist/sexist tweet)
# and 0 = not hate speech — verify against the dataset page if labels change.
y = df['label']

# Hold out 20% for evaluation. Stratifying on the label keeps the minority
# class at the same proportion in both splits, so the reported metrics do not
# depend on how many positives a purely random split happened to draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ====================================
# STEP 7 — FIT TF-IDF VECTORIZER
# ====================================
# Vectorizer settings: drop English stop words and ignore terms that occur in
# more than 70% of the training tweets.
tfidf_options = {'stop_words': 'english', 'max_df': 0.7}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights come from the training tweets only; the test
# tweets are merely encoded with that already-fitted vocabulary.
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Sanity check: continue only if the vectorizer exposes an `idf_` attribute.
if hasattr(vectorizer, 'idf_'):
    print("TfidfVectorizer is fitted successfully (has idf_ attribute).")
else:
    raise ValueError("TfidfVectorizer is not fitted properly!")

# ====================================
# STEP 8 — TRAIN MODEL
# ====================================
# Logistic regression on the TF-IDF features. class_weight='balanced' is used
# to compensate for the label imbalance; max_iter is set to 200.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(class_weight='balanced', max_iter=200).fit(X_train_vect, y_train)

# ====================================
# STEP 9 — EVALUATE
# ====================================
# Predict labels for the held-out tweets and summarise the errors.
y_pred = model.predict(X_test_vect)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Debug: show the class probabilities behind the first few predictions.
y_pred_proba = model.predict_proba(X_test_vect)
print("\nSample Prediction Probabilities (first 5):")
# Slicing plus zip stops at the end of the data, so a test set with fewer than
# five rows prints what it has instead of raising IndexError on a fixed range(5).
for sample_text, sample_proba, sample_pred in zip(X_test.iloc[:5], y_pred_proba[:5], y_pred[:5]):
    print(f"Text: {sample_text}, Probabilities: {sample_proba}, Predicted: {sample_pred}")

# ====================================
# STEP 10 — SAVE TRAINED OBJECTS
# ====================================
# Pickle each trained object to its own file in the working directory.
artifacts = {
    'hate_speech_model.pkl': model,
    'tfidf_vectorizer.pkl': vectorizer,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Confirm both files exist on disk before reporting success.
if not all(os.path.exists(artifact_path) for artifact_path in artifacts):
    raise FileNotFoundError("Failed to save model or vectorizer files!")
print("✅ Model and vectorizer saved successfully.")

# Round-trip check: reload the vectorizer and make sure it still has `idf_`.
# The file was written a few lines above, so unpickling it here is safe;
# never call pickle.load on files from an untrusted source.
with open('tfidf_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
if not hasattr(loaded_vectorizer, 'idf_'):
    raise ValueError("Saved vectorizer is NOT fitted!")
print("✅ Saved vectorizer is fitted (has idf_ attribute).")

# ====================================
# STEP 11 — DOWNLOAD FILES
# ====================================
# Trigger a browser download of each saved artifact (Colab-only helper),
# model first, then vectorizer.
for artifact_name in ('hate_speech_model.pkl', 'tfidf_vectorizer.pkl'):
    files.download(artifact_name)

Please upload kaggle.json from  Kaggle account...


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/arkhoshghalb/twitter-sentiment-analysis-hatred-speech
License(s): unknown
twitter-sentiment-analysis-hatred-speech.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  twitter-sentiment-analysis-hatred-speech.zip
  inflating: test.csv                
  inflating: train.csv               
scikit-learn version: 1.6.1
Dataset shape: (31962, 3)
Label distribution:
 label
0    29720
1     2242
Name: count, dtype: int64
TfidfVectorizer is fitted successfully (has idf_ attribute).
Confusion Matrix:
[[5654  283]
 [  96  360]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      5937
           1       0.56      0.79      0.66       456

    accuracy                           0.94      6393
   macro avg       0.77      0.87      0.81      6393
weighted avg       0.95      0.94      0.95      6393


S

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>