<a href="https://colab.research.google.com/github/Rakib911Hossan/hate_speech_detection/blob/main/subtask_1a_xlm_roberta_qwen_label4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# =============================================================================
# HATE SPEECH CLASSIFICATION WITH XLM-ROBERTA AND QWEN AUTO-LABELING
# Complete Google Colab Notebook
# =============================================================================

# Cell 1: Install Dependencies
# =============================================================================
!pip install transformers datasets scikit-learn torch openai accelerate evaluate
!pip install --upgrade huggingface_hub

# Cell 2: Download Dataset
# =============================================================================
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1A/blp25_hatespeech_subtask_1A_train.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1A/blp25_hatespeech_subtask_1A_dev.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1A/blp25_hatespeech_subtask_1A_dev_test.tsv

# Cell 2: Import Libraries and Setup
# =============================================================================
import logging
import os
import random
import sys
import time
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import torch
from torch.nn import CrossEntropyLoss
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.utils.class_weight import compute_class_weight
import evaluate
from openai import OpenAI
import warnings
warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Set seed for reproducibility
set_seed(42)

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
--2025-09-01 04:17:52--  https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1A/blp25_hatespeech_subtask_1A_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8002036 (7.6M) [text/plain]
Saving to: ‘blp25_hatespeech_subtask_1A_train.tsv’


2025-09-01 04:17:52 (81.0 MB/s) - ‘blp25_hatespeech_subtask_1A_train.tsv’ saved [8002036/8002036]

--2025-09-01 04:17:52--  https://raw.githubusercont

In [2]:
# Cell 4: Define Label Mapping and Optimized Configuration
# =============================================================================
# Label to ID mapping
l2id = {
    "None": 0,
    "Religious Hate": 1,
    "Sexism": 2,
    "Political Hate": 3,
    "Profane": 4,
    "Abusive": 5
}

id2l = {v: k for k, v in l2id.items()}

# 🎯 OPTIMIZED CONFIGURATION BASED ON SUCCESSFUL RESULTS
MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 512
BATCH_SIZE = 8                           # Optimized batch size
LEARNING_RATE = 10e-6                    # Lower LR for more stable convergence
NUM_EPOCHS = 8                           # More epochs with early stopping
WARMUP_RATIO = 0.15                      # Longer warmup for stability
WEIGHT_DECAY = 0.01                      # Stronger regularization
MAX_GRAD_NORM = 0.5                      # Tighter gradient clipping

# 🎯 OPTIMIZED DATA PARAMETERS
max_train_samples = None
max_eval_samples = None
max_predict_samples = None
max_seq_length = 512
batch_size = 8

print(f"Label mapping: {l2id}")
print(f"Using model: {MODEL_NAME}")
print(f"Optimized configuration loaded with early stopping and regularization")




Label mapping: {'None': 0, 'Religious Hate': 1, 'Sexism': 2, 'Political Hate': 3, 'Profane': 4, 'Abusive': 5}
Using model: xlm-roberta-base
Optimized configuration loaded with early stopping and regularization


In [3]:
import torch
import gc

# Clear PyTorch GPU cache
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()


90

In [4]:
# # Cell 5: Setup Qwen Hugging Face Model for Auto-labeling
# # =============================================================================


# !pip install huggingface_hub -q

# from huggingface_hub import login

# # Paste your HF token here (get it from https://huggingface.co/settings/tokens)
# login(token="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")


# !pip install transformers accelerate bitsandbytes -q

# import torch
# import time
# import pandas as pd
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# # Choose a smaller model if GPU is limited (Colab T4 works better with 0.5B or 1.8B)
# MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

# # Load tokenizer & model with memory-efficient options
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=True)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map="auto",
#     torch_dtype=torch.float16
# )

# # Setup pipeline for generation
# qwen_pipeline = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
# )

# # Label mapping dictionary
# l2id = {
#     "None": 0,
#     "Religious Hate": 1,
#     "Sexism": 2,
#     "Political Hate": 3,
#     "Profane": 4,
#     "Abusive": 5
# }

# def classify_text_with_qwen(text: str) -> str:
#     """
#     Classify a single text using Qwen from Hugging Face with strict multi-class prompt
#     """
#     prompt = f"""You are a hate speech classification expert.
# Classify the following text into exactly one of these categories:

# None, Religious Hate, Sexism, Political Hate, Profane, Abusive

# Text: "{text}"

# Important: Respond with ONLY one category name from the list above."""

#     try:
#         output = qwen_pipeline(
#             prompt,
#             max_new_tokens=10,
#             temperature=0.0,
#             do_sample=False
#         )[0]["generated_text"]

#         # Extract the last valid label
#         for label in l2id.keys():
#             if label in output:
#                 return label

#         print(f"⚠️ Invalid prediction '{output}' for text: {text[:50]}...")
#         return "None"  # fallback

#     except Exception as e:
#         print(f"Error classifying text: {e}")
#         return "None"

# def auto_label_missing_data(df: pd.DataFrame, batch_size: int = 16) -> pd.DataFrame:
#     """
#     Auto-label missing data using Hugging Face Qwen with efficient batching.
#     """
#     print("Starting auto-labeling with Hugging Face Qwen...")
#     df_copy = df.copy()
#     missing_mask = df_copy['label'].isna()
#     missing_count = missing_mask.sum()

#     if missing_count == 0:
#         print("No missing labels found.")
#         return df_copy

#     print(f"Auto-labeling {missing_count} missing samples...")

#     missing_indices = df_copy[missing_mask].index.tolist()

#     for i in range(0, len(missing_indices), batch_size):
#         batch_indices = missing_indices[i:i+batch_size]
#         texts = df_copy.loc[batch_indices, 'text'].tolist()

#         # Build prompts for the batch
#         prompts = [
#             f"""You are a hate speech classification expert.
# Classify the following text into exactly one of these categories:

# None, Religious Hate, Sexism, Political Hate, Profane, Abusive

# Text: "{t}"

# Important: Respond with ONLY one category name from the list above."""
#             for t in texts
#         ]

#         try:
#             outputs = qwen_pipeline(
#                 prompts,
#                 max_new_tokens=10,
#                 do_sample=False,
#                 batch_size=batch_size
#             )
#         except Exception as e:
#             print(f"Error in batch {i//batch_size + 1}: {e}")
#             outputs = [[{"generated_text": ""}] for _ in prompts]

#         # ✅ Handle nested list structure
#         predicted_labels = []
#         for out_group in outputs:
#             if isinstance(out_group, list):
#                 text_out = out_group[0]["generated_text"]
#             else:
#                 text_out = out_group["generated_text"]

#             found = next((label for label in l2id if label in text_out), "None")
#             predicted_labels.append(found)

#         # Assign predictions back to dataframe
#         for idx, label in zip(batch_indices, predicted_labels):
#             df_copy.loc[idx, 'label'] = label

#         print(f"✅ Processed batch {i//batch_size + 1}/{(len(missing_indices)-1)//batch_size + 1}")

#     print("🎉 Auto-labeling completed!")
#     return df_copy



In [5]:
# # Cell 6: Load and Process Data
# # =============================================================================
# def load_and_process_data():
#     """Load and process all datasets"""

#     # Load datasets
#     print("Loading datasets...")
#     train_df = pd.read_csv('blp25_hatespeech_subtask_1A_train.tsv', sep='\t')
#     val_df = pd.read_csv('blp25_hatespeech_subtask_1A_dev.tsv', sep='\t')
#     test_df = pd.read_csv('blp25_hatespeech_subtask_1A_dev_test.tsv', sep='\t')

#     print(f"Original dataset sizes:")
#     print(f"Train: {len(train_df)} samples")
#     print(f"Validation: {len(val_df)} samples")
#     print(f"Test: {len(test_df)} samples")

#     # Check columns in each dataset
#     print(f"\nDataset columns:")
#     print(f"Train columns: {list(train_df.columns)}")
#     print(f"Validation columns: {list(val_df.columns)}")
#     print(f"Test columns: {list(test_df.columns)}")

#     # Add label column to test set if it doesn't exist
#     if 'label' not in test_df.columns:
#         print("Test dataset has no 'label' column. Adding empty label column.")
#         test_df['label'] = None

#     # Check missing labels
#     print(f"\nMissing labels:")
#     print(f"Train: {train_df['label'].isna().sum()}")
#     print(f"Validation: {val_df['label'].isna().sum()}")
#     print(f"Test: {test_df['label'].isna().sum()}")

#     # Auto-label missing data
#     train_df = auto_label_missing_data(train_df)
#     val_df = auto_label_missing_data(val_df)
#     test_df = auto_label_missing_data(test_df)

#     # Verify no missing labels remain
#     print(f"\nAfter auto-labeling - Missing labels:")
#     print(f"Train: {train_df['label'].isna().sum()}")
#     print(f"Validation: {val_df['label'].isna().sum()}")
#     print(f"Test: {test_df['label'].isna().sum()}")

#     return train_df, val_df, test_df

# # Load the data
# train_df, val_df, test_df = load_and_process_data()

In [6]:
# # Cell 7: Exploratory Data Analysis
# # =============================================================================
# def analyze_data(train_df, val_df, test_df):
#     """Perform exploratory data analysis"""

#     print("=== TRAINING DATA ANALYSIS ===")
#     print(f"Shape: {train_df.shape}")
#     print(f"\nLabel distribution:")
#     label_counts = train_df['label'].value_counts()
#     label_percentages = train_df['label'].value_counts(normalize=True) * 100

#     for label in label_counts.index:
#         print(f"{label}: {label_counts[label]} ({label_percentages[label]:.2f}%)")

#     print(f"\nText statistics:")
#     train_df['text_length'] = train_df['text'].str.len()
#     print(f"Mean text length: {train_df['text_length'].mean():.2f}")
#     print(f"Max text length: {train_df['text_length'].max()}")
#     print(f"Min text length: {train_df['text_length'].min()}")

#     print(f"\nValidation set label distribution:")
#     print(val_df['label'].value_counts())

#     return label_counts

# label_counts = analyze_data(train_df, val_df, test_df)

In [7]:
# # Cell 8: Convert Labels to Numeric IDs
# # =============================================================================
# def convert_labels_to_ids(df):
#     """Convert string labels to numeric IDs"""
#     df_copy = df.copy()
#     df_copy['labels'] = df_copy['label'].map(l2id)

#     # Check for any unmapped labels
#     unmapped = df_copy['labels'].isna().sum()
#     if unmapped > 0:
#         print(f"Warning: {unmapped} labels could not be mapped")
#         print("Unmapped labels:", df_copy[df_copy['labels'].isna()]['label'].unique())
#         # Fill unmapped with 0 (None)
#         df_copy['labels'] = df_copy['labels'].fillna(0)

#     return df_copy

# import pandas as pd
# # Convert labels for all datasets
# train_df = convert_labels_to_ids(train_df)
# val_df = convert_labels_to_ids(val_df)
# test_df = convert_labels_to_ids(test_df)


# print("Label conversion completed!")
# print(f"Training set label distribution (numeric):")
# print(train_df['labels'].value_counts().sort_index())

# from google.colab import drive
# drive.mount('/content/drive')

# # Example: save converted CSVs to a folder in your Drive
# train_df.to_csv('/content/drive/MyDrive/ColabFiles/train_converted.csv', index=False)
# val_df.to_csv('/content/drive/MyDrive/ColabFiles/val_converted.csv', index=False)
# test_df.to_csv('/content/drive/MyDrive/ColabFiles/test_converted.csv', index=False)


# print("Files saved to /content folder:")
# print(" - train_converted.csv")
# print(" - val_converted.csv")
# print(" - test_converted.csv")


In [12]:
# Cell 9: Setup Tokenization
# =============================================================================
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Ensure pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # recommended for decoder-only models

# Updated tokenize function
def tokenize_function(examples):
    """Tokenize text data with padding for batching"""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # pad to MAX_LENGTH
        max_length=MAX_LENGTH,
        return_tensors=None
    )
from google.colab import drive
# drive.mount('/content/drive')

# Mount Google Drive
drive.mount('/content/drive')

# Load your saved CSVs from Drive
train_df = pd.read_csv('/content/drive/MyDrive/ColabFiles/train_converted.csv')
val_df   = pd.read_csv('/content/drive/MyDrive/ColabFiles/val_converted.csv')
test_df  = pd.read_csv('/content/drive/MyDrive/ColabFiles/test_converted.csv')

# Convert to HuggingFace datasets
print("Converting to HuggingFace datasets...")

train_dataset = Dataset.from_pandas(train_df[['text', 'labels']])
val_dataset   = Dataset.from_pandas(val_df[['text', 'labels']])
test_dataset  = Dataset.from_pandas(test_df[['text', 'labels']])

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset   = val_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# Set format for Trainer
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Print dataset sizes
print(f"Dataset sizes after tokenization:")
print(f"Train: {len(train_dataset)}")
print(f"Validation: {len(val_dataset)}")
print(f"Test: {len(test_dataset)}")

Loading tokenizer...
Mounted at /content/drive
Converting to HuggingFace datasets...


Map:   0%|          | 0/35522 [00:00<?, ? examples/s]

Map:   0%|          | 0/2512 [00:00<?, ? examples/s]

Map:   0%|          | 0/2512 [00:00<?, ? examples/s]

Dataset sizes after tokenization:
Train: 35522
Validation: 2512
Test: 2512


In [13]:
# Cell 7: Exploratory Data Analysis
# =============================================================================
def analyze_data(train_df, val_df, test_df):
    """Perform exploratory data analysis"""

    print("=== TRAINING DATA ANALYSIS ===")
    print(f"Shape: {train_df.shape}")
    print(f"\nLabel distribution:")
    label_counts = train_df['label'].value_counts()
    label_percentages = train_df['label'].value_counts(normalize=True) * 100

    for label in label_counts.index:
        print(f"{label}: {label_counts[label]} ({label_percentages[label]:.2f}%)")

    print(f"\nText statistics:")
    train_df['text_length'] = train_df['text'].str.len()
    print(f"Mean text length: {train_df['text_length'].mean():.2f}")
    print(f"Max text length: {train_df['text_length'].max()}")
    print(f"Min text length: {train_df['text_length'].min()}")

    print(f"\nValidation set label distribution:")
    print(val_df['label'].value_counts())

    #     # Check missing labels
    print(f"\nMissing labels:")
    print(f"Train: {train_df['label'].isna().sum()}")
    print(f"Validation: {val_df['label'].isna().sum()}")
    print(f"Test: {test_df['label'].isna().sum()}")


    return label_counts

label_counts = analyze_data(train_df, val_df, test_df)

=== TRAINING DATA ANALYSIS ===
Shape: (35522, 5)

Label distribution:
Abusive: 8212 (52.75%)
Political Hate: 4227 (27.15%)
Profane: 2331 (14.97%)
Religious Hate: 676 (4.34%)
Sexism: 122 (0.78%)

Text statistics:
Mean text length: 78.21
Max text length: 3710
Min text length: 7

Validation set label distribution:
label
Abusive           564
Political Hate    291
Profane           157
Religious Hate     38
Sexism             11
Name: count, dtype: int64


In [14]:
# Cell 10: Compute Class Weights for Balanced Training
# =============================================================================
def compute_class_weights(labels):
    """Compute class weights for balanced training"""
    unique_labels = np.unique(labels)
    class_weights = compute_class_weight(
        'balanced',
        classes=unique_labels,
        y=labels
    )

    # Create weight dict
    weight_dict = {int(label): weight for label, weight in zip(unique_labels, class_weights)}

    print("Class weights:")
    for label_id, weight in weight_dict.items():
        label_name = id2l[label_id]
        print(f"  {label_name} (ID: {label_id}): {weight:.3f}")

    return torch.tensor(class_weights, dtype=torch.float32)

# Compute class weights
class_weights = compute_class_weights(train_df['labels'].values)

# Cell 11: Custom Trainer with Weighted Loss
# =============================================================================
class WeightedTrainer(Trainer):
    """Custom Trainer with weighted CrossEntropyLoss"""

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute weighted loss"""
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Move class weights to same device as logits
        if self.class_weights is not None:
            weights = self.class_weights.to(logits.device)
            loss_fct = CrossEntropyLoss(weight=weights)
        else:
            loss_fct = CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

Class weights:
  None (ID: 0): 0.297
  Religious Hate (ID: 1): 8.758
  Sexism (ID: 2): 48.527
  Political Hate (ID: 3): 1.401
  Profane (ID: 4): 2.540
  Abusive (ID: 5): 0.721


In [15]:
# Cell 10: Compute Class Weights for Balanced Training
# =============================================================================
def compute_class_weights(labels):
    """Compute class weights for balanced training"""
    unique_labels = np.unique(labels)
    class_weights = compute_class_weight(
        'balanced',
        classes=unique_labels,
        y=labels
    )

    # Create weight dict
    weight_dict = {int(label): weight for label, weight in zip(unique_labels, class_weights)}

    print("Class weights:")
    for label_id, weight in weight_dict.items():
        label_name = id2l[label_id]
        print(f"  {label_name} (ID: {label_id}): {weight:.3f}")

    return torch.tensor(class_weights, dtype=torch.float32)

# Compute class weights
class_weights = compute_class_weights(train_df['labels'].values)

# Cell 11: Custom Trainer with Weighted Loss
# =============================================================================
class WeightedTrainer(Trainer):
    """Custom Trainer with weighted CrossEntropyLoss"""

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted loss"""
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Move class weights to same device as logits
        if self.class_weights is not None:
            weights = self.class_weights.to(logits.device)
            loss_fct = CrossEntropyLoss(weight=weights)
        else:
            loss_fct = CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

Class weights:
  None (ID: 0): 0.297
  Religious Hate (ID: 1): 8.758
  Sexism (ID: 2): 48.527
  Political Hate (ID: 3): 1.401
  Profane (ID: 4): 2.540
  Abusive (ID: 5): 0.721


In [16]:
# Cell 12: Define Evaluation Metrics
# =============================================================================
def compute_metrics(eval_pred):
    """Compute evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )

    # Per-class metrics
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        labels, predictions, average=None, zero_division=0
    )

    metrics = {
        'accuracy': accuracy,
        'f1_macro': f1,
        'precision_macro': precision,
        'recall_macro': recall,
    }

    # Add per-class metrics
    for i, label_name in id2l.items():
        if i < len(f1_per_class):
            metrics[f'f1_{label_name.replace(" ", "_")}'] = f1_per_class[i]
            metrics[f'precision_{label_name.replace(" ", "_")}'] = precision_per_class[i]
            metrics[f'recall_{label_name.replace(" ", "_")}'] = recall_per_class[i]

    return metrics

In [17]:
# Cell 13: Load Model and Setup Optimized Training Configuration
# =============================================================================
print("Loading model...")
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=len(l2id),
    id2label=id2l,
    label2id=l2id,
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=6,
    device_map="auto",
    torch_dtype=torch.float32  # use FP32 instead
)

# Tell the model what pad token ID to use
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Model loaded with {len(l2id)} classes")

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 🎯 OPTIMIZED TRAINING ARGUMENTS BASED ON YOUR SUCCESSFUL CONFIGURATION
training_args = TrainingArguments(
    output_dir="./xlm_roberta_recovery/",
    overwrite_output_dir=True,

    # 🔧 ADJUSTED LEARNING SCHEDULE - Lower LR for stable convergence
    learning_rate=LEARNING_RATE,           # 10e-6 for stability
    num_train_epochs=4,           # 8 epochs with early stopping
    warmup_ratio=WARMUP_RATIO,             # 0.15 for longer warmup
    lr_scheduler_type="cosine",            # Smoother LR decay

    # 🛡️ STRONGER REGULARIZATION - Combat overfitting
    weight_decay=WEIGHT_DECAY,             # 0.01 regularization
    max_grad_norm=MAX_GRAD_NORM,           # 0.5 gradient clipping
    dataloader_drop_last=True,             # Consistent batch sizes

    # ✅ OPTIMIZED BATCH CONFIGURATION
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,

    # 🎯 EARLY STOPPING CONFIGURATION
    eval_strategy="epoch",           # Eval every epoch
    save_strategy="epoch",                 # Save every epoch
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro", # Use macro F1 as primary metric
    greater_is_better=True,

    # 📊 COMPREHENSIVE MONITORING
    logging_steps=250,                     # More frequent logging
    save_total_limit=3,                    # Keep only best 3 checkpoints

    # 🔧 SYSTEM OPTIMIZATIONS
    report_to=None,                        # Disable wandb
    dataloader_num_workers=2,
    fp16=False,                            # Mixed precision for efficiency
    group_by_length=True,                 # Batch similar lengths together
    seed=42,
)

# 🛑 MANDATORY EARLY STOPPING - Prevent overfitting
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,            # Stop after 2 epochs without improvement
    early_stopping_threshold=0.001        # Minimum improvement threshold
)

print("Optimized training arguments configured with early stopping")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Max epochs: {NUM_EPOCHS}")
print(f"Early stopping patience: 2 epochs")

Loading model...


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 6 classes
Optimized training arguments configured with early stopping
Learning rate: 1e-05
Batch size: 8
Max epochs: 8
Early stopping patience: 2 epochs


In [18]:
import torch
import gc

# Clear PyTorch GPU cache
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()


96

In [19]:
# Cell 14: Initialize Trainer and Start Training
# =============================================================================
# Initialize trainer with weighted loss
trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting training...")
print(f"Training on {len(train_dataset)} samples")
print(f"Validating on {len(val_dataset)} samples")

# Train the model
trainer.train()

print("Training completed!")

Starting training...
Training on 35522 samples
Validating on 2512 samples


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrakib911hossan[0m ([33mrakib911hossan-bangladesh-university-of-business-and-tec[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 None,Precision None,Recall None,F1 Religious Hate,Precision Religious Hate,Recall Religious Hate,F1 Sexism,Precision Sexism,Recall Sexism,F1 Political Hate,Precision Political Hate,Recall Political Hate,F1 Profane,Precision Profane,Recall Profane,F1 Abusive,Precision Abusive,Recall Abusive
1,1.0929,0.966528,0.666003,0.528364,0.522995,0.571981,0.771917,0.837228,0.716058,0.380952,0.298507,0.526316,0.25,0.4,0.181818,0.568365,0.465934,0.728522,0.70255,0.632653,0.789809,0.496403,0.50365,0.489362
2,0.9671,0.914041,0.666003,0.548824,0.501977,0.638509,0.760093,0.870222,0.674707,0.448598,0.347826,0.631579,0.25,0.190476,0.363636,0.565986,0.468468,0.714777,0.734043,0.630137,0.878981,0.534224,0.504732,0.567376
3,0.8373,0.911352,0.679538,0.561765,0.527738,0.625071,0.775146,0.8879,0.687802,0.458333,0.37931,0.578947,0.272727,0.272727,0.272727,0.563969,0.454737,0.742268,0.749326,0.649533,0.88535,0.551089,0.522222,0.583333


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 None,Precision None,Recall None,F1 Religious Hate,Precision Religious Hate,Recall Religious Hate,F1 Sexism,Precision Sexism,Recall Sexism,F1 Political Hate,Precision Political Hate,Recall Political Hate,F1 Profane,Precision Profane,Recall Profane,F1 Abusive,Precision Abusive,Recall Abusive
1,1.0929,0.966528,0.666003,0.528364,0.522995,0.571981,0.771917,0.837228,0.716058,0.380952,0.298507,0.526316,0.25,0.4,0.181818,0.568365,0.465934,0.728522,0.70255,0.632653,0.789809,0.496403,0.50365,0.489362
2,0.9671,0.914041,0.666003,0.548824,0.501977,0.638509,0.760093,0.870222,0.674707,0.448598,0.347826,0.631579,0.25,0.190476,0.363636,0.565986,0.468468,0.714777,0.734043,0.630137,0.878981,0.534224,0.504732,0.567376
3,0.8373,0.911352,0.679538,0.561765,0.527738,0.625071,0.775146,0.8879,0.687802,0.458333,0.37931,0.578947,0.272727,0.272727,0.272727,0.563969,0.454737,0.742268,0.749326,0.649533,0.88535,0.551089,0.522222,0.583333
4,0.7696,0.931508,0.692675,0.573149,0.536574,0.63188,0.78696,0.874473,0.715369,0.510204,0.416667,0.657895,0.24,0.214286,0.272727,0.5686,0.483173,0.690722,0.772334,0.705263,0.853503,0.560794,0.525581,0.601064


Training completed!


In [20]:
# Cell 15: Evaluation on Test Set
# =============================================================================
print("Evaluating on test set...")

# Make predictions on test set
test_predictions = trainer.predict(test_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)
test_labels = test_predictions.label_ids

# Compute final metrics
final_metrics = compute_metrics((test_predictions.predictions, test_labels))

print("=== FINAL TEST RESULTS ===")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Macro F1: {final_metrics['f1_macro']:.4f}")
print(f"Macro Precision: {final_metrics['precision_macro']:.4f}")
print(f"Macro Recall: {final_metrics['recall_macro']:.4f}")


Evaluating on test set...


=== FINAL TEST RESULTS ===
Accuracy: 0.4817
Macro F1: 0.1084
Macro Precision: 0.1667
Macro Recall: 0.0803


In [27]:
# Cell 16: Detailed Classification Report
# =============================================================================
print("\n=== DETAILED CLASSIFICATION REPORT ===")

# Generate classification report
target_names = [id2l[i] for i in range(len(l2id))]
report = classification_report(
    test_labels,
    test_preds,
    target_names=target_names,
    digits=4,
    zero_division=0
)

print(report)

# Print per-class results
print("\n=== PER-CLASS RESULTS ===")
precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(
    test_labels, test_preds, average=None, zero_division=0
)

for i, label_name in enumerate(target_names):
    if i < len(f1_per_class):
        print(f"{label_name}:")
        print(f"  Precision: {precision_per_class[i]:.4f}")
        print(f"  Recall: {recall_per_class[i]:.4f}")
        print(f"  F1-Score: {f1_per_class[i]:.4f}")
        print(f"  Support: {support_per_class[i]}")
        print()

 #     # Check missing labels
print(f"\nMissing labels:")
print(f"Train: {train_df['labels'].isna().sum()}")
print(f"Validation: {val_df['labels'].isna().sum()}")
print(f"Test: {test_df['labels'].isna().sum()}")


=== DETAILED CLASSIFICATION REPORT ===
                precision    recall  f1-score   support

          None     1.0000    0.4817    0.6502      2512
Religious Hate     0.0000    0.0000    0.0000         0
        Sexism     0.0000    0.0000    0.0000         0
Political Hate     0.0000    0.0000    0.0000         0
       Profane     0.0000    0.0000    0.0000         0
       Abusive     0.0000    0.0000    0.0000         0

      accuracy                         0.4817      2512
     macro avg     0.1667    0.0803    0.1084      2512
  weighted avg     1.0000    0.4817    0.6502      2512


=== PER-CLASS RESULTS ===
None:
  Precision: 1.0000
  Recall: 0.4817
  F1-Score: 0.6502
  Support: 2512

Religious Hate:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  Support: 0

Sexism:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  Support: 0

Political Hate:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  Support: 0

Profane:
  Precision: 0.0000
  Recall: 0.

In [24]:
 git config --global user.email "rakib911hossan@gmail.com"
  git config --global user.name "Rakib911Hossan"

!git clone https://github.com/Rakib911Hossan/hate_speech_detection.git
%cd hate_speech_detection
# move your notebook here
!git add subtask_1a_xlm_roberta_qwen_label2.ipynb
!git commit -m "Update notebook"
!git push origin main


SyntaxError: invalid syntax (ipython-input-2228826456.py, line 1)

In [None]:
# Cell 17: Save Optimized Model and Results
# =============================================================================
print("💾 Saving optimized model and tokenizer...")

# Save the fine-tuned model with optimized configuration
model.save_pretrained("./xlm-roberta-hate-speech-optimized")
tokenizer.save_pretrained("./xlm-roberta-hate-speech-optimized")

# Save detailed results
results_df = pd.DataFrame({
    'id': test_df['id'].values,
    'text': test_df['text'].values,
    'true_label': [id2l[label] for label in test_labels],
    'predicted_label': [id2l[pred] for pred in test_preds],
    'correct': test_labels == test_preds
})

results_df.to_csv('optimized_test_results.csv', index=False)

# Save training configuration for reference
config_info = {
    'model_name': MODEL_NAME,
    'learning_rate': LEARNING_RATE,
    'batch_size': BATCH_SIZE,
    'max_epochs': NUM_EPOCHS,
    'warmup_ratio': WARMUP_RATIO,
    'weight_decay': WEIGHT_DECAY,
    'max_grad_norm': MAX_GRAD_NORM,
    'early_stopping_patience': 2,
    'scheduler': 'cosine',
    'fp16': True,
    'final_test_accuracy': (test_labels == test_preds).mean(),
    'final_macro_f1': final_metrics['f1_macro']
}

import json
with open('training_config.json', 'w') as f:
    json.dump(config_info, f, indent=2)

print("✅ Model and results saved!")
print(f"🎯 Test accuracy: {(test_labels == test_preds).mean():.4f}")
print(f"📈 Macro F1: {final_metrics['f1_macro']:.4f}")
print(f"📁 Model saved to: ./xlm-roberta-hate-speech-optimized")
print(f"📊 Results saved to: optimized_test_results.csv")
print(f"⚙️ Config saved to: training_config.json")

In [None]:
# Cell 18: Example Predictions
# =============================================================================
def predict_text(text, model, tokenizer, device='cpu'):
    """Predict label for a single text"""
    model.eval()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_id = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_id].item()

    return id2l[predicted_id], confidence

# Test with some examples
print("\n=== EXAMPLE PREDICTIONS ===")
example_texts = [
    "This is a normal text with no hate speech",
    "I hate all politicians they are corrupt",
    "Religious people are stupid and should be eliminated",
    "Women belong in the kitchen not in workplace"
]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for text in example_texts:
    pred_label, confidence = predict_text(text, model, tokenizer, device)
    print(f"Text: {text[:50]}...")
    print(f"Prediction: {pred_label} (confidence: {confidence:.3f})")
    print()

print("=== TRAINING COMPLETE ===")
print("Your hate speech classifier is ready to use!")

In [None]:
# Cell 19: Optimized Model Summary and Final Statistics
# =============================================================================
print("\n" + "="*60)
print("🎉 OPTIMIZED TRAINING COMPLETE - FINAL SUMMARY")
print("="*60)
print(f"🤖 Model: {MODEL_NAME}")
print(f"📊 Training samples: {len(train_dataset)}")
print(f"📊 Validation samples: {len(val_dataset)}")
print(f"📊 Test samples: {len(test_dataset)}")
print(f"🏷️  Number of classes: {len(l2id)}")
print(f"📝 Max sequence length: {MAX_LENGTH}")

print(f"\n🔧 OPTIMIZED HYPERPARAMETERS:")
print(f"  • Batch size: {BATCH_SIZE}")
print(f"  • Learning rate: {LEARNING_RATE}")
print(f"  • Max epochs: {NUM_EPOCHS}")
print(f"  • Warmup ratio: {WARMUP_RATIO}")
print(f"  • Weight decay: {WEIGHT_DECAY}")
print(f"  • Gradient clipping: {MAX_GRAD_NORM}")
print(f"  • Scheduler: cosine")
print(f"  • Early stopping: patience=2")
print(f"  • Mixed precision: FP16")

print(f"\n🎯 FINAL PERFORMANCE:")
print(f"  • Test Accuracy: {final_metrics['accuracy']:.4f}")
print(f"  • Macro F1-Score: {final_metrics['f1_macro']:.4f}")
print(f"  • Macro Precision: {final_metrics['precision_macro']:.4f}")
print(f"  • Macro Recall: {final_metrics['recall_macro']:.4f}")

print(f"\n📁 FILES GENERATED:")
print(f"  • ./xlm-roberta-hate-speech-optimized/ (optimized model)")
print(f"  • optimized_test_results.csv (detailed predictions)")
print(f"  • training_config.json (hyperparameter configuration)")

print(f"\n🔄 TO LOAD THE OPTIMIZED MODEL LATER:")
print("```python")
print("from transformers import AutoModelForSequenceClassification, AutoTokenizer")
print("model = AutoModelForSequenceClassification.from_pretrained('./xlm-roberta-hate-speech-optimized')")
print("tokenizer = AutoTokenizer.from_pretrained('./xlm-roberta-hate-speech-optimized')")
print("```")

print(f"\n🚀 OPTIMIZATIONS APPLIED:")
print("  ✅ Lower learning rate for stable convergence")
print("  ✅ Early stopping to prevent overfitting")
print("  ✅ Cosine learning rate scheduling")
print("  ✅ Stronger regularization (weight decay)")
print("  ✅ Gradient clipping for stability")
print("  ✅ Mixed precision training")
print("  ✅ Weighted loss for class balance")
print("  ✅ Longer warmup period")

print("="*60)
print("🎊 Your optimized hate speech classifier is ready!")
print("="*60)