# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# Step 0: NLTK setup

In [None]:

try:
    nltk.data.find('corpora/stopwords')
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("NLTK data not found. Downloading...")
    nltk.download('stopwords')
    nltk.download('punkt')
    print("Download complete.")

# Step 1: Data Loading & Preparation

In [None]:
import pandas as pd

print("--- Step 1: Data Loading & Preparation ---")
try:
    data_df = pd.read_csv(r"/content/train (1).csv")
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: CSV file not found. Please check the path.")
    exit()

# Map the `class` column (0,1,2) into binary labels
# 0 = Hate Speech → Offensive
# 1 = Offensive Language → Offensive
# 2 = Neither → Not Offensive
if 'class' not in data_df.columns:
    print("Available columns:", data_df.columns.tolist())
    exit()

data_df['labels'] = data_df['class'].map({
    0: 'Offensive/Hate Speech',
    1: 'Offensive/Hate Speech',
    2: 'Not Offensive'
})

data_df = data_df[['tweet', 'labels']]
print("\nPrepared Data:")
print(data_df.head())

--- Step 1: Data Loading & Preparation ---
Dataset loaded successfully.

Prepared Data:
                                               tweet                 labels
0  !!! RT @mayasolovely: As a woman you shouldn't...          Not Offensive
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  Offensive/Hate Speech
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  Offensive/Hate Speech
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  Offensive/Hate Speech
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  Offensive/Hate Speech


# Step 2: Text Preprocessing

In [None]:
print("\n--- Step 2: Text Preprocessing ---")
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
important_words = {
    "i","am","you","we","he","she","they","my","your",
    "love","like","good","nice","happy","great","friend"
}
filtered_stopwords = stop_words - important_words

def clean_tweet(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    cleaned_words = [
        stemmer.stem(word) for word in text.split() if word not in filtered_stopwords
    ]
    return ' '.join(cleaned_words)

data_df['tweet'] = data_df['tweet'].apply(clean_tweet)
print("Text cleaning and stemming complete.")


--- Step 2: Text Preprocessing ---
Text cleaning and stemming complete.


# Step 3: Balance Dataset

In [None]:
print("\n--- Step 3: Balancing Dataset ---")
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

majority = data_df[data_df['labels'] == "Offensive/Hate Speech"]
minority = data_df[data_df['labels'] == "Not Offensive"]

# Upsample minority
minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=len(majority),
                              random_state=42)

balanced_df = pd.concat([majority, minority_upsampled])
print("Balanced class distribution:")
print(balanced_df['labels'].value_counts())

x = balanced_df['tweet']
y = balanced_df['labels']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)


--- Step 3: Balancing Dataset ---
Balanced class distribution:
labels
Offensive/Hate Speech    20620
Not Offensive            20620
Name: count, dtype: int64


# Step 4: Train Model

In [None]:
print("\n--- Step 4: Training Model ---")
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.9)),
    ('clf', SGDClassifier(
        loss="log_loss",
        class_weight="balanced",
        random_state=42,
        max_iter=2000,
        tol=1e-4
    ))
])

pipeline.fit(x_train, y_train)
print("Model trained successfully.")


--- Step 4: Training Model ---
Model trained successfully.


# Step 5: Model Evaluation

In [None]:
print("\n--- Step 5: Model Evaluation ---")
from sklearn.metrics import accuracy_score, classification_report
y_pred = pipeline.predict(x_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Step 5: Model Evaluation ---
Accuracy: 0.9528

Classification Report:
                       precision    recall  f1-score   support

        Not Offensive       0.93      0.98      0.95      6186
Offensive/Hate Speech       0.98      0.93      0.95      6186

             accuracy                           0.95     12372
            macro avg       0.95      0.95      0.95     12372
         weighted avg       0.95      0.95      0.95     12372



# Step 6: Saving and loading Model

In [None]:
import pickle

In [None]:
filename = 'hsd_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))

In [None]:
#loading
loaded_model = pickle.load(open('hsd_model.sav', 'rb'))

# Step 7: Interactive Prediction

In [19]:
print("\n--- Step 7: Interactive Prediction ---")
positive_words = {"love", "nice", "good", "happy", "great", "friend", "beautiful"}

def custom_predict(text):
    base_pred = pipeline.predict([text])[0]
    proba = pipeline.predict_proba([text])[0]

    # Positive safeguard
    if any(word in text.lower() for word in positive_words):
        not_off_idx = list(pipeline.classes_).index("Not Offensive")
        if proba[not_off_idx] < 0.6:
            base_pred = "Not Offensive"

    return base_pred, dict(zip(pipeline.classes_, proba))

print("Enter a phrase to check if it's hate speech (type 'quit' to exit):")
while True:
    user_input = input("Enter text: ")
    if user_input.lower() == 'quit':
        break
    pred, probas = custom_predict(user_input)
    print(f"\nPrediction: {pred}")
    print("Confidence Scores:")
    for label, prob in probas.items():
        print(f"  {label}: {prob:.4f}")


--- Step 7: Interactive Prediction ---
Enter a phrase to check if it's hate speech (type 'quit' to exit):
Enter text: i really love to eat fruits

Prediction: Not Offensive
Confidence Scores:
  Not Offensive: 0.6475
  Offensive/Hate Speech: 0.3525
Enter text: quit
