<a href="https://colab.research.google.com/github/Wasifnasim/mini_project_1/blob/main/final_Hate_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate accelerate torch scikit-learn pandas


In [None]:
# Colab-only step: upload the dataset file into the runtime before it is read.
# `uploaded` is not used again in the visible cells; the next cell opens the CSV by
# its file name ("twitter(1).csv"), so the uploaded file must carry exactly that name.
from google.colab import files
uploaded = files.upload()


In [None]:
import html
import json
import re

import pandas as pd
from sklearn.model_selection import train_test_split

def clean_text(text):
    """Normalize a raw tweet into lowercase a-z/0-9 words separated by single spaces.

    Steps, in order: decode HTML entities, lowercase, remove URLs, remove
    @mentions, remove every character that is not a-z, 0-9 or whitespace
    (this also strips the '#' of hashtags while keeping the tag word), then
    collapse whitespace runs and trim both ends.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        The cleaned string; may be empty when nothing survives the filters.
    """
    # Decode entities such as "&amp;" or "&#128514;" before stripping punctuation;
    # otherwise the filter below leaves junk tokens like "amp" or "128514" behind.
    text = html.unescape(str(text)).lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'@\w+', '', text)              # @mentions
    # Keep only lowercase letters, digits and whitespace ('#' is removed here too).
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the raw tweets and add the cleaned text column used for training.
# The file name must match the uploaded file exactly.
df = pd.read_csv("twitter(1).csv")
df['text'] = df['tweet'].astype(str).apply(clean_text)

# map class → labels
# The 'class' column already holds the integer ids; label_map records the name of
# each id (keys are strings because JSON object keys are strings).
label_map = {"0": "Hate Speech", "1": "Offensive Language", "2": "No Hate and Offensive"}
df['label'] = df['class'].astype(int)

# Drop tweets that are empty after cleaning (e.g. only a URL or a mention). An empty
# field in the CSV written below is parsed back as NaN by a default pd.read_csv, and
# a NaN is not a valid string input for a tokenizer.
empty_mask = df['text'].eq('')
if empty_mask.any():
    print(f"Dropping {int(empty_mask.sum())} rows with empty text after cleaning")
df = df.loc[~empty_mask].reset_index(drop=True)

with open("label_map.json", "w") as f:
    json.dump(label_map, f, indent=2)

# Stratified 80/20 split so both files keep the class proportions of the full data.
train_df, test_df = train_test_split(df[['text','label']], test_size=0.2, stratify=df['label'], random_state=42)
train_df.to_csv("train_split.csv", index=False)
test_df.to_csv("test_split.csv", index=False)

print("Prepared data → train_split.csv, test_split.csv")


In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# Training configuration shared by the rest of this cell.
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 128      # token length every example is truncated/padded to
BATCH_SIZE = 16    # per-device training batch size (evaluation uses twice this)
EPOCHS = 3

# Load label map
# label_map.json stores ids as string keys; build int-keyed id2label and its inverse.
import json
with open("label_map.json") as f:
    raw_map = json.load(f)
id2label = dict((int(class_id), name) for class_id, name in raw_map.items())
label2id = {name: class_id for class_id, name in id2label.items()}

# Load splits
# keep_default_na=False + dtype=str for the text column: pandas would otherwise parse
# cleaned texts that are exactly "", "nan" or "null" as NaN, and a NaN is not a valid
# string input for the tokenizer. With these options every text stays a plain string.
train_df = pd.read_csv("train_split.csv", dtype={"text": str}, keep_default_na=False)
test_df  = pd.read_csv("test_split.csv", dtype={"text": str}, keep_default_na=False)

# Wrap the frames as Hugging Face datasets for tokenization and training.
train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    """Tokenize the "text" entries of a batch, truncating and padding each to MAX_LEN."""
    return tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
    )

def _to_model_inputs(ds):
    """Tokenize a dataset, rename "label" to "labels" and switch it to torch format."""
    ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    ds = ds.rename_column("label", "labels")
    ds.set_format("torch")
    return ds

train_ds = _to_model_inputs(train_ds)
test_ds  = _to_model_inputs(test_ds)

# Load the pretrained checkpoint with a classification head sized to the label map;
# the id/name mappings are passed along so they are stored in the model config.
label_config = dict(
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)

# Metric objects are loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each row is
            the argmax of its logits over the last axis.

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    acc_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")
    return {"accuracy": acc_result["accuracy"], "f1_macro": f1_result["f1"]}

# Evaluate and checkpoint once per epoch so that the epoch with the best macro-F1
# can be reloaded when training finishes.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    # optimisation
    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=2 * BATCH_SIZE,
    # evaluation / checkpoint selection
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

import inspect

# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`. The install
# cell does not pin a version, so pick whichever keyword this Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# NOTE(review): eval_dataset is the test split, and training_args uses it to select
# the best checkpoint (load_best_model_at_end), so scores reported on it are
# optimistic. Consider carving a separate validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

# evaluate() is not the last expression of the cell, so its return value would be
# silently dropped; keep it and print it so the final metrics are visible.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save model and tokenizer side by side so the directory can be loaded on its own.
trainer.save_model("./saved_model")
tokenizer.save_pretrained("./saved_model")

print("✅ Model saved in ./saved_model")


In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer written by the training cell.
pipe = pipeline("text-classification", model="./saved_model", tokenizer="./saved_model")

examples = [
    "I want to kill them all",
    "Have a nice day, friend",
    "You idiot, shut up"
]

# The model was trained on clean_text() output, so apply the same normalisation here;
# feeding raw text would give it punctuation it never saw during fine-tuning.
cleaned_examples = [clean_text(ex) for ex in examples]

# top_k=None asks for the score of every label (it replaces the deprecated
# return_all_scores=True). Passing a list returns one list of {label, score} dicts
# per input; the pipeline orders each list by descending score.
all_scores = pipe(cleaned_examples, top_k=None)

for ex, preds in zip(examples, all_scores):
    print(ex)
    for p in preds:
        print(f"  {p['label']}: {p['score']:.4f}")
    print()


In [None]:
# Colab-only step: pack the saved_model directory into saved_model.zip with the
# shell `zip` tool, then hand the archive to the browser as a download.
from google.colab import files
!zip -r saved_model.zip saved_model
files.download("saved_model.zip")
