<a href="https://colab.research.google.com/github/Siddharthsuresh19/Body_Fluid_cell_classification/blob/main/Nlp_Project_Hate_Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Increased accuracy (around 88%) - Complete one
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load dataset
df = pd.read_csv("dataset_updated.csv", header=1)

# Drop missing values and filter for relevant labels
df.dropna(inplace=True)
df = df[df['label'].isin(['Hate-Speech', 'Non-Hate-Speech'])]

# Encode labels (1 = Hate, 0 = Non-Hate)
df['label_encoded'] = df['label'].apply(lambda x: 1 if x == 'Hate-Speech' else 0)

df = df.reset_index(drop=True)

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label_encoded'].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer and encode data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Create a custom dataset class
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self,):
        return len(self.labels)

train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Define custom metrics function
def compute_metrics(p):
    predictions = p.predictions
    labels = p.label_ids
    preds = predictions.argmax(axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1_score': f1}

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to=[]
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()
print("Evaluation results:", evaluation_results)

# Save the model and tokenizer
model.save_pretrained("./bert-tamil-tanglish-hate-speech-final")
tokenizer.save_pretrained("./bert-tamil-tanglish-hate-speech-final")

# Define a prediction function
def predict_batch(texts):
    # Ensure the model and input are on the correct device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**encodings)
    preds = torch.argmax(outputs.logits, dim=1).tolist()
    return ["Hate-Speech" if p == 1 else "Non-Hate-Speech" for p in preds]

# Example prediction
new_samples = [
    "நீ ஒரு மோசமான மனிதன்",
    "உன்னை சுட்டுவேன்",
    "நீ நல்லவங்க",
    "அவன் ஒரு பைத்தியம்",
    "Nee oru loosu payala da",
    "Veliya poi sethudu da dai",
    "Ivan madiri pasangala sethutanum",
    "Enna mokka thalaiya da nee",
    "I am a good person",
    "Dai thevidiya paiya",
    "Dai poda punda",
    "Ivan oru loose",
]
predictions = predict_batch(new_samples)
for text, pred in zip(new_samples, predictions):
    print(f"Text: '{text}' -> Prediction: {pred}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.3817,0.333628,0.8386,0.836218
2,0.3039,0.317413,0.859,0.858773
3,0.2525,0.304043,0.873,0.872729


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.3817,0.333628,0.8386,0.836218
2,0.3039,0.317413,0.859,0.858773
3,0.2525,0.304043,0.873,0.872729


Evaluation results: {'eval_loss': 0.30404263734817505, 'eval_accuracy': 0.873, 'eval_f1_score': 0.8727292717815475, 'eval_runtime': 37.5337, 'eval_samples_per_second': 133.214, 'eval_steps_per_second': 8.339, 'epoch': 3.0}
Text: 'நீ ஒரு மோசமான மனிதன்' -> Prediction: Non-Hate-Speech
Text: 'உன்னை சுட்டுவேன்' -> Prediction: Hate-Speech
Text: 'நீ நல்லவங்க' -> Prediction: Non-Hate-Speech
Text: 'அவன் ஒரு பைத்தியம்' -> Prediction: Non-Hate-Speech
Text: 'Nee oru loosu payala da' -> Prediction: Hate-Speech
Text: 'Veliya poi sethudu da dai' -> Prediction: Hate-Speech
Text: 'Ivan madiri pasangala sethutanum' -> Prediction: Hate-Speech
Text: 'Enna mokka thalaiya da nee' -> Prediction: Hate-Speech
Text: 'I am a good person' -> Prediction: Non-Hate-Speech
Text: 'Dai thevidiya paiya' -> Prediction: Hate-Speech
Text: 'Dai poda punda' -> Prediction: Hate-Speech
Text: 'Ivan oru loose' -> Prediction: Hate-Speech


In [7]:
# ============================================
# 🌍 Tamil–Tanglish Hate Speech Detection (GPU) - To improve accuracy
# Optimized for Google Colab T4
# ============================================
!pip install transformers==4.30.2 --quiet
import pandas as pd
import torch
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)
from transformers.integrations import EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# === 1. Check GPU ===
print("🔍 GPU Available:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")

# === 2. Load Dataset ===
df = pd.read_csv("dataset_updated.csv", header=1)
df.dropna(subset=["text", "label"], inplace=True)
df = df[df["label"].isin(["Hate-Speech", "Non-Hate-Speech"])]

# Encode labels
df["label_encoded"] = df["label"].apply(lambda x: 1 if x == "Hate-Speech" else 0)
df = df.reset_index(drop=True)

# Split train / validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label_encoded"].tolist(), test_size=0.2, random_state=42, stratify=df["label_encoded"]
)

# === 3. Tokenizer ===
MODEL_NAME = "bert-base-multilingual-cased"  # can switch to 'xlm-roberta-base'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# === 4. Dataset Class ===
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

# === 5. Model ===
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# === 6. Metrics ===
def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1_score": f1}

# === 7. Training Arguments (Optimized for T4) ===
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",        # ✅ Correct key
    save_strategy="epoch",
    learning_rate=2e-5,                 # ✅ Lower LR for stability
    per_device_train_batch_size=32,     # ✅ Uses GPU memory efficiently
    per_device_eval_batch_size=32,
    num_train_epochs=3,                 # ✅ Reduced, safe for 25K dataset
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_score",
    greater_is_better=True,
    fp16=True,                          # ✅ Enables mixed precision on T4
    save_total_limit=1,
    seed=42,
    report_to=[]
)

# === 8. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # ✅ Stops early if F1 plateaus
)

# === 9. Train ===
print("🚀 Training started ...")
trainer.train()

# === 10. Evaluate ===
evaluation_results = trainer.evaluate()
print("📊 Evaluation Results:", evaluation_results)

# === 11. Save model & tokenizer ===
model.save_pretrained("./bert-tamil-tanglish-hate-speech-final")
tokenizer.save_pretrained("./bert-tamil-tanglish-hate-speech-final")
print("✅ Model saved successfully!")

# === 12. Prediction Function ===
def predict_batch(texts):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**encodings)
    preds = torch.argmax(outputs.logits, dim=1).tolist()
    return ["Hate-Speech" if p == 1 else "Non-Hate-Speech" for p in preds]

# === 13. Example Predictions ===
new_samples = [
    "நீ ஒரு மோசமான மனிதன்",
    "உன்னை சுட்டுவேன்",
    "நீ நல்லவங்க",
    "அவன் ஒரு பைத்தியம்",
    "Nee oru loosu payala da",
    "Veliya poi sethudu da dai",
    "Ivan madiri pasangala sethutanum",
    "Enna mokka thalaiya da nee",
    "I am a good person",
    "Dai thevidiya paiya",
    "Dai poda punda",
    "Ivan oru loose",
]
predictions = predict_batch(new_samples)
for text, pred in zip(new_samples, predictions):
    print(f"Text: '{text}' -> Prediction: {pred}")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tokenizers [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for tokenizers (pyproject.toml) ... [?25l[?25herror
[31m  ERROR: Failed building wheel for tokenizers[0m[31m
[0m[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (tokenizers)[0m[31m
[0m

ImportError: cannot import name 'EarlyStoppingCallback' from 'transformers.integrations' (/usr/local/lib/python3.12/dist-packages/transformers/integrations/__init__.py)

In [9]:
# ==============================================
# 🧱  Environment Setup for Tamil Hate Speech BERT
# ==============================================
!pip uninstall -y tokenizers transformers -q
!pip install -q transformers==4.44.2 tokenizers==0.19.1 accelerate==0.31.0 datasets==2.19.0 torch --upgrade


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m150.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [12]:
# ===============================================
# 🌍 Tamil–Tanglish Hate Speech Detection (BERT)
# Optimized for Google Colab T4 GPU
# ===============================================

import pandas as pd
import torch
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments
)
from transformers.integrations import EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# === 1. GPU check ===
print("🔍 GPU :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "⚠️ CPU only")

# === 2. Load dataset ===
df = pd.read_csv("dataset_updated.csv", header=1)
df.dropna(subset=["text", "label"], inplace=True)
df = df[df["label"].isin(["Hate-Speech", "Non-Hate-Speech"])]

# Encode labels
df["label_encoded"] = df["label"].apply(lambda x: 1 if x == "Hate-Speech" else 0)
df = df.reset_index(drop=True)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label_encoded"].tolist(),
    test_size=0.2, random_state=42, stratify=df["label_encoded"]
)

# === 3. Tokenizer ===
MODEL_NAME = "bert-base-multilingual-cased"   # multilingual BERT
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_enc   = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# === 4. Dataset Class ===
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = HateSpeechDataset(train_enc, train_labels)
val_dataset   = HateSpeechDataset(val_enc,   val_labels)

# === 5. Model ===
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# === 6. Metrics ===
def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1_score": f1}

# === 7. Training Arguments ===
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_score",
    greater_is_better=True,
    fp16=True,                         # mixed precision for T4
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    save_total_limit=1,
    seed=42,
    report_to=[]
)

# === 8. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# === 9. Train ===
print("🚀 Training started ...")
trainer.train()

# === 10. Evaluate ===
eval_results = trainer.evaluate()
print("\n📊 Evaluation Results:", eval_results)

# === 11. Save Model ===
save_path = "./bert-tamil-tanglish-hate-speech-final"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Model and tokenizer saved to {save_path}")

# === 12. Prediction Function ===
def predict_batch(texts):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    enc = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**enc)
    preds = torch.argmax(outputs.logits, dim=1).tolist()
    return ["Hate-Speech" if p == 1 else "Non-Hate-Speech" for p in preds]

# === 13. Example Predictions ===
samples = [
    "நீ ஒரு மோசமான மனிதன்",
    "உன்னை சுட்டுவேன்",
    "நீ நல்லவங்க",
    "அவன் ஒரு பைத்தியம்",
    "Nee oru loosu payala da",
    "Veliya poi sethudu da dai",
    "Ivan madiri pasangala sethutanum",
    "Enna mokka thalaiya da nee",
    "I am a good person",
    "Dai thevidiya paiya",
    "Dai poda punda",
    "Ivan oru loose",
]
preds = predict_batch(samples)
for t, p in zip(samples, preds):
    print(f"Text: '{t}' → Prediction: {p}")

ImportError: cannot import name 'EarlyStoppingCallback' from 'transformers.integrations' (/usr/local/lib/python3.12/dist-packages/transformers/integrations/__init__.py)

After running the cell above, please run the cell again.

In [None]:
# Uninstall incompatible versions
# NOTE: Run these commands in your shell before executing the rest of the python code
!pip uninstall -y torch torchvision torchaudio scikit-learn
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 scikit-learn==1.6.1

Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Collecting torch==2.6.0
  Downloading torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0
  Downloading torchvision-0.21.0-cp312-cp312-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0
  Downloading torchaudio-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting scikit-learn==1.6.1
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 

In [None]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# Load dataset
df = pd.read_csv("dataset.csv")

In [None]:
# Drop missing values and filter for relevant labels
df.dropna(inplace=True)
df = df[df['label'].isin(['Hate-Speech', 'Non-Hate-Speech'])]

# Encode labels (1 = Hate, 0 = Non-Hate)
df['label_encoded'] = df['label'].apply(lambda x: 1 if x == 'Hate-Speech' else 0)

df = df.reset_index(drop=True)

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label_encoded'].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer and encode data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Create a custom dataset class
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self,):
        return len(self.labels)

train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Load pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Define custom metrics function
def compute_metrics(p):
    predictions = p.predictions
    labels = p.label_ids
    preds = predictions.argmax(axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1_score': f1}

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to=[]
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()
print("Evaluation results:", evaluation_results)

# Save the model and tokenizer
model.save_pretrained("./bert-tamil-tanglish-hate-speech-final")
tokenizer.save_pretrained("./bert-tamil-tanglish-hate-speech-final")


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,0.650335,0.65308,0.516022
2,0.646000,0.563445,0.716486,0.699047
3,0.646000,0.511727,0.755435,0.745277


Evaluation results: {'eval_loss': 0.5117269158363342, 'eval_accuracy': 0.7554347826086957, 'eval_f1_score': 0.7452767390120945, 'eval_runtime': 7.6548, 'eval_samples_per_second': 144.224, 'eval_steps_per_second': 9.014, 'epoch': 3.0}


('./bert-tamil-tanglish-hate-speech-final/tokenizer_config.json',
 './bert-tamil-tanglish-hate-speech-final/special_tokens_map.json',
 './bert-tamil-tanglish-hate-speech-final/vocab.txt',
 './bert-tamil-tanglish-hate-speech-final/added_tokens.json')

In [None]:
# Define a prediction function
def predict_batch(texts):
    # Ensure the model and input are on the correct device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**encodings)
    preds = torch.argmax(outputs.logits, dim=1).tolist()
    return ["Hate-Speech" if p == 1 else "Non-Hate-Speech" for p in preds]

# Example prediction
new_samples = [
    "Raghul avonada gf ah thittunan ",
    "Dei waste da nee",
    "நீ நல்லவங்க",
    "அவன் ஒரு பைத்தியம்",
    "Nee oru loosu payala da",
    "Veliya poi sethudu da dai",
    "Ivan madiri pasangala sethutanum",
    "Enna mokka thalaiya da nee",
    "I am a good person",
    "Dai enna paiya da nee mundam",
    "Dai poda hairuh",
    "Ivan oru loose",
    "Seri ena panalam adhuku",
    "அவன் ஒரு வெளிநாட்டவனாக இருப்பதால் நம்ப முடியாது"

]
predictions = predict_batch(new_samples)
for text, pred in zip(new_samples, predictions):
    print(f"Text: '{text}' -> Prediction: {pred}")

Text: 'Raghul avonada gf ah thittunan ' -> Prediction: Non-Hate-Speech
Text: 'Dei waste da nee' -> Prediction: Hate-Speech
Text: 'நீ நல்லவங்க' -> Prediction: Non-Hate-Speech
Text: 'அவன் ஒரு பைத்தியம்' -> Prediction: Hate-Speech
Text: 'Nee oru loosu payala da' -> Prediction: Hate-Speech
Text: 'Veliya poi sethudu da dai' -> Prediction: Hate-Speech
Text: 'Ivan madiri pasangala sethutanum' -> Prediction: Hate-Speech
Text: 'Enna mokka thalaiya da nee' -> Prediction: Hate-Speech
Text: 'I am a good person' -> Prediction: Non-Hate-Speech
Text: 'Dai enna paiya da nee mundam' -> Prediction: Hate-Speech
Text: 'Dai poda hairuh' -> Prediction: Hate-Speech
Text: 'Ivan oru loose' -> Prediction: Hate-Speech
Text: 'Seri ena panalam adhuku' -> Prediction: Non-Hate-Speech
Text: 'அவன் ஒரு வெளிநாட்டவனாக இருப்பதால் நம்ப முடியாது' -> Prediction: Hate-Speech


In [None]:
# --- Interactive text box in Google Colab ---
!pip install ipywidgets --quiet
from IPython.display import display
import ipywidgets as widgets

# Create input and button widgets
text_box = widgets.Textarea(
    value='',
    placeholder='Type your text here...',
    description='Input:',
    layout=widgets.Layout(width='100%', height='80px')
)

button = widgets.Button(
    description='Predict',
    button_style='info',
    tooltip='Click to get prediction',
    icon='search'
)

output = widgets.Output()

def on_button_click(b):
    with output:
        output.clear_output()
        text = text_box.value.strip()
        if text:
            result = predict_batch([text])[0]
            print(f"Prediction: {result}")
        else:
            print("⚠️ Please enter some text!")

button.on_click(on_button_click)

display(text_box, button, output)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Textarea(value='', description='Input:', layout=Layout(height='80px', width='100%'), placeholder='Type your te…

Button(button_style='info', description='Predict', icon='search', style=ButtonStyle(), tooltip='Click to get p…

Output()