In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Fetch the subtask 1C splits. `-nc` (no-clobber) skips files that already exist,
# so re-running this cell does not pile up `*.tsv.1` duplicates next to the
# copies that the loading code actually reads.
!wget -nc https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_dev.tsv
!wget -nc https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_dev_test.tsv
!wget -nc https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_train.tsv
# `%pip` installs into the environment of the running kernel; a bare `!pip`
# can resolve to a different interpreter.
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install torch
%pip install scikit-learn
# %pip install --upgrade accelerate
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import datasets
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import torch
from sklearn.model_selection import StratifiedKFold

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from transformers.modeling_outputs import SequenceClassifierOutput


# Notebook-level logger; records are sent to stdout so they appear in the cell output.
logger = logging.getLogger(__name__)

logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Settings for the full multi-step FreeLB trainer.
# Currently switched off (memory constraints).
FREELB_CONFIG = dict(
    enabled=False,       # full FreeLB is disabled for now
    epsilon=0.3,         # bound on the perturbation
    K=2,                 # adversarial steps per batch
    alpha=0.15,          # per-step size (epsilon / K)
    use_grad_acc=True,   # gradient-accumulation flag
)

# Settings for the lighter single-step variant used when memory is limited.
FREELB_LIGHT_CONFIG = dict(
    enabled=True,        # light variant is active
    epsilon=0.1,         # smaller perturbation
    K=1,                 # one adversarial step
    alpha=0.1,           # step size for that single step
)

# Local TSV files produced by the download step.
train_file = 'blp25_hatespeech_subtask_1C_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1C_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1C_dev_test.tsv'
import os
os.environ["WANDB_DISABLED"] = "true"
# Notebook-wide arguments. In this notebook they supply the seed, the process
# log level and the directory of the final prediction file; each CV fold builds
# its own TrainingArguments.
training_args = TrainingArguments(
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=8,  # Reduced from 16 to save memory with FreeLB
    per_device_eval_batch_size=8,   # Reduced from 16 to save memory
    output_dir="./distilBERT_m/",
    overwrite_output_dir=True,
    remove_unused_columns=False,
    # `local_rank` is intentionally left at its default (-1): this notebook runs
    # as a single process, and a hard-coded rank of 1 announces a distributed
    # job that does not exist.
    load_best_model_at_end=True,
    save_total_limit=2,
    save_strategy="no",
    # The string "none" is what disables every reporting integration; passing
    # None falls back to the library default instead.
    report_to="none",
)

# Optional caps on the number of samples per split (None = use everything).
max_train_samples = None
max_eval_samples=None
max_predict_samples=None
max_seq_length = 512
batch_size = 16
transformers.utils.logging.set_verbosity_info()

# Align the verbosity of this notebook, `datasets` and `transformers`.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
model_name = 'csebuetnlp/banglabert'
set_seed(training_args.seed)
# Label string -> class id for each of the three prediction targets.
hate_type_map = {'None': 0, 'Religious Hate': 1, 'Sexism': 2, 'Political Hate': 3, 'Profane': 4, 'Abusive': 5}
severity_map = {'Little to None': 0, 'Mild': 1, 'Severe': 2}
to_whom_map = {'None': 0, 'Individual': 1, 'Organization': 2, 'Community': 3, 'Society': 4}
# Class id -> label string, used when writing the prediction file.
id2hate = {v: k for k, v in hate_type_map.items()}
id2sev = {v: k for k, v in severity_map.items()}
id2to = {v: k for k, v in to_whom_map.items()}

def _encode_labels(frame, source):
    """Return a copy of ``frame`` with the three label columns turned into class ids.

    Missing values (and the literal string 'nan') in ``hate_type`` and
    ``to_whom`` are treated as the 'None' class before mapping.

    Args:
        frame: DataFrame holding ``hate_type``, ``hate_severity`` and ``to_whom``.
        source: Name of the file the frame came from; used in error messages only.

    Raises:
        ValueError: if a column contains a value that its label map does not
            know (including a missing ``hate_severity``). Without this check the
            failure would surface as an opaque cast error in ``astype(int)``.
    """
    encoded_frame = frame.copy()
    for column in ('hate_type', 'to_whom'):
        encoded_frame[column] = encoded_frame[column].replace('nan', 'None').fillna('None')
    label_maps = (
        ('hate_type', hate_type_map),
        ('hate_severity', severity_map),
        ('to_whom', to_whom_map),
    )
    for column, mapping in label_maps:
        class_ids = encoded_frame[column].map(mapping)
        unmapped = class_ids.isna()
        if unmapped.any():
            unknown = sorted(encoded_frame.loc[unmapped, column].astype(str).unique())
            raise ValueError(f"{source}: unmapped values in column '{column}': {unknown}")
        encoded_frame[column] = class_ids.astype(int)
    return encoded_frame


# Load training and validation data (both go through the same label encoding)
train_df = _encode_labels(pd.read_csv(train_file, sep='\t'), train_file)
validation_df = _encode_labels(pd.read_csv(validation_file, sep='\t'), validation_file)

# Combine training and validation data for cross-validation
combined_df = pd.concat([train_df, validation_df], ignore_index=True)
combined_dataset = Dataset.from_pandas(combined_df)

# Load test data separately
test_df = pd.read_csv(test_file, sep='\t')
test_dataset = Dataset.from_pandas(test_df)

# Create initial dataset dict for preprocessing
raw_datasets = DatasetDict({
    "combined": combined_dataset,
    "test": test_dataset
})

for key in raw_datasets.keys():
    logger.info(f"loading a local file for {key}")

print(f"Combined dataset size: {len(combined_dataset)}")
print(f"Test dataset size: {len(test_df)}")
# The prediction file is keyed by the 'id' column, so fail early if it is absent.
if 'id' not in test_df.columns:
    raise KeyError('id')
# Fast tokenizer for the checkpoint named by `model_name`.
_tokenizer_options = dict(
    cache_dir=None,
    use_fast=True,
    revision="main",
    use_auth_token=None,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, **_tokenizer_options)

class MultiTaskModel(torch.nn.Module):
    """Shared encoder with one linear classification head per target.

    The heads predict hate type, severity and target ("to whom"); their output
    sizes come from the module-level label maps. Each head reads the encoder's
    hidden state at the first token position.
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and create the three heads."""
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        # Reuse the config attached to the loaded encoder rather than fetching
        # it a second time; this also guarantees the head input size matches
        # the weights that were actually loaded.
        self.config = self.base_model.config
        hidden_size = self.config.hidden_size
        self.hate_type_head = torch.nn.Linear(hidden_size, len(hate_type_map))
        self.severity_head = torch.nn.Linear(hidden_size, len(severity_map))
        self.to_whom_head = torch.nn.Linear(hidden_size, len(to_whom_map))

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and all three heads.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional 2-D tensor whose columns 0, 1, 2 hold the
                hate-type, severity and to-whom class ids.

        Returns:
            SequenceClassifierOutput whose ``logits`` is the tuple
            ``(hate_type_logits, severity_logits, to_whom_logits)`` and whose
            ``loss`` is the unweighted sum of the three cross-entropy losses
            (``None`` when ``labels`` is not given).
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token is used as the sequence representation.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        hate_type_logits = self.hate_type_head(pooled_output)
        severity_logits = self.severity_head(pooled_output)
        to_whom_logits = self.to_whom_head(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(hate_type_logits, labels[:, 0].long()) + \
                   loss_fct(severity_logits, labels[:, 1].long()) + \
                   loss_fct(to_whom_logits, labels[:, 2].long())
        return SequenceClassifierOutput(
            loss=loss,
            logits=(hate_type_logits, severity_logits, to_whom_logits),
        )

# Columns of the combined split other than "labels".
non_label_column_names = [name for name in raw_datasets["combined"].column_names if name != "labels"]
sentence1_key= 'text'

# Padding strategy: do not pad at tokenization time. Batches are padded to the
# longest sequence they contain by the DataCollatorWithPadding handed to the
# trainers, which avoids inflating every example to `max_seq_length` tokens.
padding = False

if max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({max_seq_length}) is larger than the maximum length for the "
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )
# Never request more tokens than the tokenizer reports the model can take.
max_seq_length = min(max_seq_length, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize a batch of texts and attach labels when label columns exist.

    The text is read from the ``sentence1_key`` column. If the batch has a
    ``hate_type`` column, a ``labels`` entry is added holding one
    ``[hate_type, hate_severity, to_whom]`` list per example.
    """
    encoded = tokenizer(
        examples[sentence1_key],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )
    if "hate_type" not in examples:
        return encoded
    label_columns = (examples["hate_type"], examples["hate_severity"], examples["to_whom"])
    encoded["labels"] = [list(row) for row in zip(*label_columns)]
    return encoded

# Tokenize every split in batches.
raw_datasets = raw_datasets.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    batched=True,
    load_from_cache_file=True,
)
# Tokenized splits: one for cross-validation, one for final prediction.
combined_dataset, predict_dataset = raw_datasets["combined"], raw_datasets["test"]

# Inputs for StratifiedKFold: X is only a row-index placeholder (folds are
# materialised by selecting indices); y is the hate_type id used to stratify.
X = np.arange(len(combined_dataset))
y = np.array(combined_dataset["hate_type"])

print(f"Total samples for cross-validation: {len(y)}")
print(f"Label distribution: {np.bincount(y)}")

# Shuffled, seeded 5-fold stratified splitter.
n_splits = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Per-fold metrics and per-fold test-set probabilities, filled by the CV loop.
fold_results, fold_probs = [], []

# Objects shared by every fold: the accuracy metric and the batch collator.
accuracy = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def compute_metrics(p: EvalPrediction):
    """Compute one accuracy value per prediction head.

    ``p.predictions`` is indexed 0/1/2 for the hate-type, severity and to-whom
    logits; column 0/1/2 of ``p.label_ids`` holds the matching reference ids.
    Returns a dict with the keys ``hate_accuracy``, ``severity_accuracy`` and
    ``to_whom_accuracy``.
    """
    metric_names = ('hate_accuracy', 'severity_accuracy', 'to_whom_accuracy')
    scores = {}
    for head_idx, metric_name in enumerate(metric_names):
        head_preds = np.argmax(p.predictions[head_idx], axis=1)
        head_refs = p.label_ids[:, head_idx]
        scores[metric_name] = accuracy.compute(predictions=head_preds, references=head_refs)['accuracy']
    return scores

class FreeLB():
    """
    Helper utilities for FreeLB-style adversarial training
    (paper: https://arxiv.org/abs/1909.11764).
    """
    def __init__(self, model, epsilon=0.3, K=3, alpha=0.01):
        """
        Store the model and the adversarial hyper-parameters.

        Args:
            model: Model whose embeddings are looked up by ``get_embeddings``
            epsilon: Bound used when initialising and clamping the perturbation
            K: Number of adversarial steps (stored only)
            alpha: Adversarial step size (stored only)
        """
        self.model, self.epsilon = model, epsilon
        self.K, self.alpha = K, alpha
        self.backup = {}

    def initialize_delta(self, input_shape, device, uniform_init=True):
        """Create a perturbation tensor of ``input_shape`` on ``device`` that requires grad.

        Values are drawn uniformly from [-epsilon, epsilon] when
        ``uniform_init`` is true, otherwise the tensor is all zeros.
        """
        delta = torch.zeros(input_shape)
        if uniform_init:
            delta.uniform_(-self.epsilon, self.epsilon)
        delta = delta.to(device)
        delta.requires_grad = True
        return delta

    def get_embeddings(self, input_ids):
        """Embed ``input_ids`` with the first sub-module exposing ``word_embeddings``.

        Returns ``None`` when no sub-module has such an attribute.
        """
        owner = next(
            (module for module in self.model.modules() if hasattr(module, 'word_embeddings')),
            None,
        )
        if owner is None:
            return None
        return owner.word_embeddings(input_ids)

    def project_delta(self, delta):
        """Clamp every element of ``delta`` into [-epsilon, epsilon]."""
        return delta.clamp(min=-self.epsilon, max=self.epsilon)

class SimplifiedFreeLBTrainer(Trainer):
    """Single-step adversarial trainer for memory-constrained environments.

    Each training step runs a clean forward/backward pass, shifts the
    word-embedding weights along their normalised gradient by ``epsilon``,
    runs a second forward/backward pass on the shifted weights, and finally
    restores the original weights. Gradients of both passes accumulate.
    """
    def __init__(self, *args, epsilon=0.1, **kwargs):
        """Forward all arguments to ``Trainer``; ``epsilon`` is the size of the weight shift."""
        super().__init__(*args, **kwargs)
        self.epsilon = epsilon

    def _scaled_loss(self, model, inputs):
        """Compute the loss for ``inputs`` with the multi-GPU and accumulation scaling applied."""
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)
        if self.args.n_gpu > 1:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps
        return loss

    def training_step(self, model: torch.nn.Module, inputs: dict, num_items_in_batch: Optional[int] = None) -> torch.Tensor:
        """Run one clean pass plus one adversarial pass and return their mean loss.

        Returns:
            The detached average of the clean and adversarial losses. It is
            detached so that the caller's running loss total does not keep the
            autograd graph of every step alive.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Clean pass: its gradients define the perturbation direction.
        loss = self._scaled_loss(model, inputs)
        self.accelerator.backward(loss)

        unwrapped_model = model.module if hasattr(model, 'module') else model

        # name -> original weights, recorded only for parameters actually shifted.
        embedding_backup = {}
        try:
            for name, param in unwrapped_model.named_parameters():
                if not (param.requires_grad and 'word_embeddings' in name):
                    continue
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)
                # Skip empty or overflowed gradients (possible under fp16):
                # dividing by a zero or non-finite norm would write NaNs into
                # the weights for the adversarial pass.
                if not torch.isfinite(norm) or norm <= 0:
                    continue
                embedding_backup[name] = param.data.clone()
                param.data.add_(self.epsilon * param.grad / norm)

            # Adversarial pass on the shifted embeddings; gradients accumulate
            # on top of those from the clean pass.
            loss_adv = self._scaled_loss(model, inputs)
            self.accelerator.backward(loss_adv)
        finally:
            # Always put the original embedding weights back, even if the
            # adversarial pass raised.
            for name, param in unwrapped_model.named_parameters():
                if name in embedding_backup:
                    param.data.copy_(embedding_backup[name])

        return ((loss + loss_adv) / 2).detach()

class CustomTrainer(Trainer):
    """Trainer whose training step applies multi-step FreeLB-style perturbations.

    The perturbation is added to the word embeddings of the wrapped model,
    which is expected to expose ``base_model``, ``hate_type_head``,
    ``severity_head`` and ``to_whom_head``.
    """
    def __init__(self, *args, freelb_epsilon=0.3, freelb_K=3, freelb_alpha=0.01, **kwargs):
        """
        Custom Trainer with FreeLB adversarial training.

        Args:
            freelb_epsilon: Bound used to initialise and clamp the perturbation (default: 0.3)
            freelb_K: Number of adversarial steps per batch (default: 3)
            freelb_alpha: Step size for each adversarial step (default: 0.01)
        """
        super().__init__(*args, **kwargs)
        self.freelb_epsilon = freelb_epsilon
        self.freelb_K = freelb_K
        self.freelb_alpha = freelb_alpha

    @staticmethod
    def _find_word_embeddings(base_model):
        """Return the ``word_embeddings`` module of ``base_model``.

        Raises:
            AttributeError: if no sub-module exposes ``word_embeddings``.
        """
        embeddings = getattr(base_model, 'embeddings', None)
        if embeddings is not None and hasattr(embeddings, 'word_embeddings'):
            return embeddings.word_embeddings
        for module in base_model.modules():
            if hasattr(module, 'word_embeddings'):
                return module.word_embeddings
        raise AttributeError("base_model has no 'word_embeddings' module to perturb")

    def training_step(self, model: torch.nn.Module, inputs: dict, num_items_in_batch: Optional[int] = None) -> torch.Tensor:
        """
        FreeLB training step: K forward/backward passes on perturbed embeddings.

        Gradients of all K passes accumulate on the model parameters (each pass
        contributes loss / K). Between passes the perturbation is moved along
        its normalised gradient and clamped to [-epsilon, epsilon].

        Returns:
            A scalar tensor holding the sum of the K scaled losses.

        Raises:
            ValueError: if the batch has no ``labels``.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Handle DataParallel and DistributedDataParallel wrappers
        unwrapped_model = model
        while hasattr(unwrapped_model, 'module'):
            unwrapped_model = unwrapped_model.module

        input_ids = inputs.get('input_ids')
        attention_mask = inputs.get('attention_mask', None)
        labels = inputs.get('labels', None)
        if labels is None:
            # Without labels there is no loss to back-propagate.
            raise ValueError("CustomTrainer.training_step requires 'labels' in the batch")

        base_model = unwrapped_model.base_model
        embedding_layer = self._find_word_embeddings(base_model)

        # Initialise the perturbation with the shape/dtype/device of the embeddings.
        with torch.no_grad():
            delta = torch.zeros_like(embedding_layer(input_ids)).uniform_(
                -self.freelb_epsilon, self.freelb_epsilon
            )
        delta.requires_grad_(True)

        loss_fct = torch.nn.CrossEntropyLoss()
        total_loss = 0.0

        for step in range(self.freelb_K):
            # Drop any gradient left on delta from the previous pass.
            if step > 0:
                delta.grad = None

            # Embed inside the loop with gradients enabled so the embedding
            # matrix is trained as well; a lookup done once under no_grad
            # would cut it out of the graph.
            perturbed_embeddings = embedding_layer(input_ids) + delta

            with torch.cuda.amp.autocast(enabled=False):  # Disable for stability
                outputs = base_model(inputs_embeds=perturbed_embeddings, attention_mask=attention_mask)
                pooled_output = outputs.last_hidden_state[:, 0, :]

                hate_type_logits = unwrapped_model.hate_type_head(pooled_output)
                severity_logits = unwrapped_model.severity_head(pooled_output)
                to_whom_logits = unwrapped_model.to_whom_head(pooled_output)

                hate_loss = loss_fct(hate_type_logits, labels[:, 0].long())
                sev_loss = loss_fct(severity_logits, labels[:, 1].long())
                to_loss = loss_fct(to_whom_logits, labels[:, 2].long())
                loss = (hate_loss + sev_loss + to_loss) / self.freelb_K

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                if self.args.n_gpu > 1:
                    loss = loss.mean()

            total_loss += loss.item()

            # accelerator.backward applies mixed-precision loss scaling itself,
            # so no manual scaler handling is needed here.
            self.accelerator.backward(loss)

            # Update delta for next step (except last)
            if step < self.freelb_K - 1 and delta.grad is not None:
                with torch.no_grad():
                    delta_grad = delta.grad
                    # An overflowed gradient would turn delta into NaNs; keep
                    # the current delta for the next pass in that case.
                    if torch.isfinite(delta_grad).all():
                        # Per-example L2 norm over all token/embedding entries.
                        norm = torch.norm(delta_grad.reshape(delta_grad.size(0), -1), dim=1, keepdim=True)
                        norm = torch.clamp(norm.view(-1, 1, 1), min=1e-8)
                        delta = torch.clamp(
                            delta + self.freelb_alpha * delta_grad / norm,
                            -self.freelb_epsilon,
                            self.freelb_epsilon,
                        )
                delta.requires_grad_(True)

        # Build the result on the training device; a plain nn.Module has no
        # `.device` attribute to read it from.
        return torch.tensor(total_loss, device=self.args.device)

import gc

# The test split is the same for every fold, so drop its id column once.
test_inputs = predict_dataset.remove_columns("id") if "id" in predict_dataset.column_names else predict_dataset

# Train, evaluate and predict with a fresh model on each stratified fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{n_splits}")
    print(f"{'='*50}")

    # Create train and validation datasets for this fold
    train_dataset = combined_dataset.select(train_idx.tolist())
    val_dataset = combined_dataset.select(val_idx.tolist())

    # Remove ID columns
    train_dataset = train_dataset.remove_columns("id") if "id" in train_dataset.column_names else train_dataset
    val_dataset = val_dataset.remove_columns("id") if "id" in val_dataset.column_names else val_dataset

    print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

    # Initialize model for this fold (fresh model each time)
    model = MultiTaskModel(model_name)

    # Training arguments for this fold. `local_rank` stays at its default (-1)
    # because the notebook runs as a single process.
    fold_training_args = TrainingArguments(
        learning_rate=2e-5,
        num_train_epochs=1,
        per_device_train_batch_size=8,  # Reduced for FreeLB memory efficiency
        per_device_eval_batch_size=8,   # Reduced for FreeLB memory efficiency
        gradient_accumulation_steps=2,   # Accumulate gradients to simulate batch_size=16
        output_dir=f"./distilBERT_fold_{fold+1}/",
        overwrite_output_dir=True,
        remove_unused_columns=True,  # Changed to True to fix the tensor conversion error
        load_best_model_at_end=True,
        save_total_limit=1,
        save_strategy="epoch",
        eval_strategy="epoch",
        logging_strategy="epoch",
        report_to="none",  # the string "none" disables all reporting integrations
        fp16=True,  # Enable mixed precision to save memory
        seed=42 + fold  # Different seed for each fold
    )

    # Arguments shared by every trainer variant.
    trainer_kwargs = dict(
        model=model,
        args=fold_training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Pick the trainer: full FreeLB, the light single-step variant, or plain training.
    if FREELB_CONFIG['enabled']:
        print(f"Using Full FreeLB adversarial training with epsilon={FREELB_CONFIG['epsilon']}, K={FREELB_CONFIG['K']}, alpha={FREELB_CONFIG['alpha']}")
        trainer = CustomTrainer(
            **trainer_kwargs,
            freelb_epsilon=FREELB_CONFIG['epsilon'],
            freelb_K=FREELB_CONFIG['K'],
            freelb_alpha=FREELB_CONFIG['alpha'],
        )
    elif FREELB_LIGHT_CONFIG['enabled']:
        print(f"Using Simplified FreeLB (memory-efficient) with epsilon={FREELB_LIGHT_CONFIG['epsilon']}")
        trainer = SimplifiedFreeLBTrainer(
            **trainer_kwargs,
            epsilon=FREELB_LIGHT_CONFIG['epsilon'],
        )
    else:
        print("Using standard training (FreeLB disabled)")
        trainer = Trainer(**trainer_kwargs)

    # Train the model
    print(f"Training fold {fold + 1}...")
    train_result = trainer.train()

    # Evaluate the model
    print(f"Evaluating fold {fold + 1}...")
    eval_result = trainer.evaluate()

    # Store results
    fold_results.append({
        'fold': fold + 1,
        'train_loss': train_result.metrics['train_loss'],
        'eval_loss': eval_result['eval_loss'],
        'eval_hate_accuracy': eval_result['eval_hate_accuracy'],
        'eval_severity_accuracy': eval_result['eval_severity_accuracy'],
        'eval_to_whom_accuracy': eval_result['eval_to_whom_accuracy']
    })

    # Generate predictions on test set for this fold; one softmax per head.
    print(f"Predicting with fold {fold + 1} model...")
    test_predictions = trainer.predict(test_inputs)
    probs = [torch.softmax(torch.tensor(logits), dim=-1).numpy() for logits in test_predictions.predictions]
    fold_probs.append(probs)

    # Clean up to save memory: drop the references, collect reference cycles,
    # then let the CUDA allocator release its cached blocks before the next fold.
    del model, trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Fold {fold + 1} - Hate Accuracy: {eval_result['eval_hate_accuracy']:.4f}, Severity Accuracy: {eval_result['eval_severity_accuracy']:.4f}, To Whom Accuracy: {eval_result['eval_to_whom_accuracy']:.4f}")
    
print(f"\n{'='*50}")
print("CROSS-VALIDATION COMPLETED")
print(f"{'='*50}")
# Summarise the per-fold metrics collected in `fold_results`.
import pandas as pd

results_df = pd.DataFrame(fold_results)
print("\nCross-Validation Results:")
print(results_df)

# Mean of each loss, and mean / standard deviation of each accuracy, across folds.
_accuracy_columns = ('eval_hate_accuracy', 'eval_severity_accuracy', 'eval_to_whom_accuracy')
avg_train_loss, avg_eval_loss = (results_df[column].mean() for column in ('train_loss', 'eval_loss'))
avg_hate_acc, avg_sev_acc, avg_to_acc = (results_df[column].mean() for column in _accuracy_columns)
std_hate_acc, std_sev_acc, std_to_acc = (results_df[column].std() for column in _accuracy_columns)

print(f"\nAverage Results Across {n_splits} Folds:")
print(f"Average Training Loss: {avg_train_loss:.4f}")
print(f"Average Validation Loss: {avg_eval_loss:.4f}")
print(f"Average Hate Type Accuracy: {avg_hate_acc:.4f} ± {std_hate_acc:.4f}")
print(f"Average Severity Accuracy: {avg_sev_acc:.4f} ± {std_sev_acc:.4f}")
print(f"Average To Whom Accuracy: {avg_to_acc:.4f} ± {std_to_acc:.4f}")

# Ensemble predictions
hate_probs_folds = np.array([probs[0] for probs in fold_probs])
sev_probs_folds = np.array([probs[1] for probs in fold_probs])
to_probs_folds = np.array([probs[2] for probs in fold_probs])
ensemble_probs = (np.mean(hate_probs_folds, axis=0),
                  np.mean(sev_probs_folds, axis=0),
                  np.mean(to_probs_folds, axis=0))
# np.save('ensemble_probs_aug20.npy', np.array(ensemble_probs, dtype=object))
# Final ensemble prediction
hate_probs, sev_probs, to_probs = ensemble_probs
final_hate_preds = np.argmax(hate_probs, axis=1)
final_sev_preds = np.argmax(sev_probs, axis=1)
final_to_preds = np.argmax(to_probs, axis=1)

# Generate predictions


# # Also save the ensemble predictions with different format for comparison
# submission_df = pd.DataFrame({'id': test_df['id'], 'hate_type': [id2hate[p] for p in final_hate_preds], 'hate_severity': [id2sev[p] for p in final_sev_preds], 'to_whom': [id2to[p] for p in final_to_preds]})
# submission_df.to_csv('ensemble_submission.tsv', sep='\t', index=False)

# print("Ensemble predictions also saved to 'ensemble_submission.tsv'")

import os

# Write the ensemble predictions to `<output_dir>/subtask_1C.tsv`, one row per
# test sample: id, the three decoded labels, and the model name.
os.makedirs(training_args.output_dir, exist_ok=True)
logger.info("*** Predict ***")
ids = test_df['id']
output_predict_file = os.path.join(training_args.output_dir, "subtask_1C.tsv")

# Take the ids positionally. `ids[index]` is a label-based lookup on the Series,
# which raises KeyError or silently pairs an id with the wrong prediction
# whenever test_df does not carry a default 0..n-1 index.
id_values = ids.tolist()
if len(id_values) != len(final_hate_preds):
    raise ValueError(
        f"Number of test ids ({len(id_values)}) does not match number of "
        f"predictions ({len(final_hate_preds)})"
    )

# Write predictions in the required format.
# The encoding is explicit so the file does not depend on the platform default.
with open(output_predict_file, "w", encoding="utf-8") as writer:
    logger.info("***** Predict results *****")
    writer.write("id\thate_type\thate_severity\tto_whom\tmodel\n")
    for sample_id, hate_idx, sev_idx, to_idx in zip(
        id_values, final_hate_preds, final_sev_preds, final_to_preds
    ):
        # Map each predicted class index back to its label string.
        h = id2hate[hate_idx]
        s = id2sev[sev_idx]
        t = id2to[to_idx]
        writer.write(f"{sample_id}\t{h}\t{s}\t{t}\t{model_name}\n")

print(f"\nPredictions saved to '{output_predict_file}'")

--2025-08-27 08:20:23--  https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 634005 (619K) [text/plain]
Saving to: ‘blp25_hatespeech_subtask_1C_dev.tsv’


2025-08-27 08:20:23 (15.0 MB/s) - ‘blp25_hatespeech_subtask_1C_dev.tsv’ saved [634005/634005]

--2025-08-27 08:20:23--  https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_dev_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP r

2025-08-27 08:22:10.751596: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756282930.938673      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756282930.991361      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Combined dataset size: 38034
Test dataset size: 2512


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

[INFO|configuration_utils.py:698] 2025-08-27 08:22:28,615 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/config.json
[INFO|configuration_utils.py:770] 2025-08-27 08:22:28,622 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2023] 2025-08-27 08:22:29,102 >> loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/vocab.txt
[INFO|tokenization_utils_base.py:2023] 2025-08-27 08:22:29,103 >> loading file tokenizer.json from cache at None
[INFO|tokenization_utils_base.py:2023] 2025-08-27 08:22:29,104 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2023] 2025-08-27 08:22:29,104 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/special_tokens_map.json
[INFO|tokenization_utils_base.py:2023] 2025-08-27 08:22:29,105 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/tokenizer_config.json
[INFO|tokenization_utils_base.py:2023] 2025-08-

Running tokenizer on dataset:   0%|          | 0/38034 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/2512 [00:00<?, ? examples/s]

Total samples for cross-validation: 38034
Label distribution: [21405   714   133  4518  2488  8776]


Downloading builder script: 0.00B [00:00, ?B/s]


FOLD 1/5
Train size: 30427, Validation size: 7607


[INFO|configuration_utils.py:698] 2025-08-27 08:22:43,251 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/config.json
[INFO|configuration_utils.py:770] 2025-08-27 08:22:43,252 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

[INFO|modeling_utils.py:1151] 2025-08-27 08:22:51,111 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/pytorch_model.bin
[INFO|safetensors_conversion.py:61] 2025-08-27 08:22:51,191 >> Attempting to create safetensors variant
[INFO|safetensors_conversion.py:74] 2025-08-27 08:22:51,482 >> Safetensors PR exists


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

[INFO|modeling_utils.py:5121] 2025-08-27 08:22:51,801 >> Some weights of the model checkpoint at csebuetnlp/banglabert were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[INFO|modeling_utils.py:5139] 2025-08-27 08:22:51,801 >> All the weights of ElectraModel were initialized from the model checkpoint at csebuetnlp/banglabert.
If your task is similar to the 

Using Simplified FreeLB (memory-efficient) with epsilon=0.1


[INFO|trainer.py:756] 2025-08-27 08:22:53,242 >> Using auto half precision backend
[INFO|trainer.py:934] 2025-08-27 08:22:54,003 >> The following columns in the Training set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:2409] 2025-08-27 08:22:54,042 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-08-27 08:22:54,043 >>   Num examples = 30,427
[INFO|trainer.py:2411] 2025-08-27 08:22:54,044 >>   Num Epochs = 1
[INFO|trainer.py:2412] 2025-08-27 08:22:54,046 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2414] 2025-08-27 08:22:54,047 >>   Training with DataParallel so batch size has been adjusted to: 16
[INFO|trainer.py:2415] 2025-08-27 08:22:54,047 >>   Total train batch size (w. parallel, distributed & accumul

Training fold 1...




Epoch,Training Loss,Validation Loss,Hate Accuracy,Severity Accuracy,To Whom Accuracy
1,2.4583,2.152074,0.7033,0.746286,0.714605


[INFO|trainer.py:934] 2025-08-27 09:20:54,774 >> The following columns in the Evaluation set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 09:20:54,780 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-08-27 09:20:54,780 >>   Num examples = 7607
[INFO|trainer.py:4332] 2025-08-27 09:20:54,781 >>   Batch size = 16
[INFO|trainer.py:3993] 2025-08-27 09:23:46,723 >> Saving model checkpoint to ./distilBERT_fold_1/checkpoint-951
[INFO|trainer.py:4007] 2025-08-27 09:23:46,727 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.
[INFO|tokenization_utils_base.py:2525] 2025-08-27 09:23:47,717 >> tokenizer config file saved in ./distilBERT_fold_1/checkpoint-951/tokenizer_config.json
[INFO|tokeniz

Evaluating fold 1...




[INFO|trainer.py:934] 2025-08-27 09:26:43,669 >> The following columns in the test set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 09:26:43,673 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-08-27 09:26:43,674 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-08-27 09:26:43,674 >>   Batch size = 16


Predicting with fold 1 model...


[INFO|configuration_utils.py:698] 2025-08-27 09:27:40,430 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/config.json
[INFO|configuration_utils.py:770] 2025-08-27 09:27:40,431 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_

Fold 1 - Hate Accuracy: 0.7033, Severity Accuracy: 0.7463, To Whom Accuracy: 0.7146

FOLD 2/5
Train size: 30427, Validation size: 7607


[INFO|modeling_utils.py:1151] 2025-08-27 09:27:40,482 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/pytorch_model.bin
[INFO|safetensors_conversion.py:61] 2025-08-27 09:27:40,608 >> Attempting to create safetensors variant
[INFO|modeling_utils.py:5121] 2025-08-27 09:27:40,914 >> Some weights of the model checkpoint at csebuetnlp/banglabert were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the check

Using Simplified FreeLB (memory-efficient) with epsilon=0.1
Training fold 2...


[INFO|trainer.py:934] 2025-08-27 09:27:41,868 >> The following columns in the Training set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:2409] 2025-08-27 09:27:41,876 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-08-27 09:27:41,877 >>   Num examples = 30,427
[INFO|trainer.py:2411] 2025-08-27 09:27:41,877 >>   Num Epochs = 1
[INFO|trainer.py:2412] 2025-08-27 09:27:41,878 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2414] 2025-08-27 09:27:41,878 >>   Training with DataParallel so batch size has been adjusted to: 16
[INFO|trainer.py:2415] 2025-08-27 09:27:41,879 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-08-27 09:27:41,880 >>   Gradient Accumulati

Epoch,Training Loss,Validation Loss,Hate Accuracy,Severity Accuracy,To Whom Accuracy
1,2.4779,2.159992,0.712896,0.740108,0.71053


[INFO|trainer.py:934] 2025-08-27 10:25:53,210 >> The following columns in the Evaluation set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 10:25:53,216 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-08-27 10:25:53,216 >>   Num examples = 7607
[INFO|trainer.py:4332] 2025-08-27 10:25:53,217 >>   Batch size = 16
[INFO|trainer.py:3993] 2025-08-27 10:28:45,447 >> Saving model checkpoint to ./distilBERT_fold_2/checkpoint-951
[INFO|trainer.py:4007] 2025-08-27 10:28:45,450 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.
[INFO|tokenization_utils_base.py:2525] 2025-08-27 10:28:46,494 >> tokenizer config file saved in ./distilBERT_fold_2/checkpoint-951/tokenizer_config.json
[INFO|tokeniz

Evaluating fold 2...




[INFO|trainer.py:934] 2025-08-27 10:31:43,372 >> The following columns in the test set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 10:31:43,377 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-08-27 10:31:43,378 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-08-27 10:31:43,378 >>   Batch size = 16


Predicting with fold 2 model...
Fold 2 - Hate Accuracy: 0.7129, Severity Accuracy: 0.7401, To Whom Accuracy: 0.7105

FOLD 3/5
Train size: 30427, Validation size: 7607


[INFO|configuration_utils.py:698] 2025-08-27 10:32:40,578 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/config.json
[INFO|configuration_utils.py:770] 2025-08-27 10:32:40,579 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_

Using Simplified FreeLB (memory-efficient) with epsilon=0.1
Training fold 3...


[INFO|trainer.py:934] 2025-08-27 10:32:42,124 >> The following columns in the Training set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:2409] 2025-08-27 10:32:42,132 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-08-27 10:32:42,133 >>   Num examples = 30,427
[INFO|trainer.py:2411] 2025-08-27 10:32:42,133 >>   Num Epochs = 1
[INFO|trainer.py:2412] 2025-08-27 10:32:42,134 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2414] 2025-08-27 10:32:42,135 >>   Training with DataParallel so batch size has been adjusted to: 16
[INFO|trainer.py:2415] 2025-08-27 10:32:42,135 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-08-27 10:32:42,136 >>   Gradient Accumulati

Epoch,Training Loss,Validation Loss,Hate Accuracy,Severity Accuracy,To Whom Accuracy
1,2.4424,2.177186,0.701065,0.7313,0.708032


[INFO|trainer.py:934] 2025-08-27 11:30:53,430 >> The following columns in the Evaluation set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 11:30:53,436 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-08-27 11:30:53,436 >>   Num examples = 7607
[INFO|trainer.py:4332] 2025-08-27 11:30:53,437 >>   Batch size = 16
[INFO|trainer.py:3993] 2025-08-27 11:33:45,625 >> Saving model checkpoint to ./distilBERT_fold_3/checkpoint-951
[INFO|trainer.py:4007] 2025-08-27 11:33:45,628 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.
[INFO|tokenization_utils_base.py:2525] 2025-08-27 11:33:46,655 >> tokenizer config file saved in ./distilBERT_fold_3/checkpoint-951/tokenizer_config.json
[INFO|tokeniz

Evaluating fold 3...




[INFO|trainer.py:934] 2025-08-27 11:36:44,401 >> The following columns in the test set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 11:36:44,406 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-08-27 11:36:44,407 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-08-27 11:36:44,408 >>   Batch size = 16


Predicting with fold 3 model...


[INFO|configuration_utils.py:698] 2025-08-27 11:37:41,679 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/config.json
[INFO|configuration_utils.py:770] 2025-08-27 11:37:41,681 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_

Fold 3 - Hate Accuracy: 0.7011, Severity Accuracy: 0.7313, To Whom Accuracy: 0.7080

FOLD 4/5
Train size: 30427, Validation size: 7607


[INFO|modeling_utils.py:1151] 2025-08-27 11:37:41,735 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/pytorch_model.bin
[INFO|safetensors_conversion.py:61] 2025-08-27 11:37:41,859 >> Attempting to create safetensors variant
[INFO|safetensors_conversion.py:74] 2025-08-27 11:37:42,190 >> Safetensors PR exists
[INFO|modeling_utils.py:5121] 2025-08-27 11:37:42,194 >> Some weights of the model checkpoint at csebuetnlp/banglabert were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraini

Using Simplified FreeLB (memory-efficient) with epsilon=0.1
Training fold 4...


[INFO|trainer.py:934] 2025-08-27 11:37:43,274 >> The following columns in the Training set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:2409] 2025-08-27 11:37:43,282 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-08-27 11:37:43,282 >>   Num examples = 30,427
[INFO|trainer.py:2411] 2025-08-27 11:37:43,283 >>   Num Epochs = 1
[INFO|trainer.py:2412] 2025-08-27 11:37:43,283 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2414] 2025-08-27 11:37:43,284 >>   Training with DataParallel so batch size has been adjusted to: 16
[INFO|trainer.py:2415] 2025-08-27 11:37:43,285 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-08-27 11:37:43,285 >>   Gradient Accumulati

Epoch,Training Loss,Validation Loss,Hate Accuracy,Severity Accuracy,To Whom Accuracy
1,2.4788,2.181624,0.707506,0.739188,0.70698


[INFO|trainer.py:934] 2025-08-27 12:35:55,099 >> The following columns in the Evaluation set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 12:35:55,105 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-08-27 12:35:55,105 >>   Num examples = 7607
[INFO|trainer.py:4332] 2025-08-27 12:35:55,106 >>   Batch size = 16
[INFO|trainer.py:3993] 2025-08-27 12:38:46,612 >> Saving model checkpoint to ./distilBERT_fold_4/checkpoint-951
[INFO|trainer.py:4007] 2025-08-27 12:38:46,615 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.
[INFO|tokenization_utils_base.py:2525] 2025-08-27 12:38:47,605 >> tokenizer config file saved in ./distilBERT_fold_4/checkpoint-951/tokenizer_config.json
[INFO|tokeniz

Evaluating fold 4...




[INFO|trainer.py:934] 2025-08-27 12:41:44,075 >> The following columns in the test set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 12:41:44,079 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-08-27 12:41:44,080 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-08-27 12:41:44,081 >>   Batch size = 16


Predicting with fold 4 model...


[INFO|configuration_utils.py:698] 2025-08-27 12:42:41,076 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/config.json
[INFO|configuration_utils.py:770] 2025-08-27 12:42:41,077 >> Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_

Fold 4 - Hate Accuracy: 0.7075, Severity Accuracy: 0.7392, To Whom Accuracy: 0.7070

FOLD 5/5
Train size: 30428, Validation size: 7606


[INFO|modeling_utils.py:1151] 2025-08-27 12:42:41,131 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert/snapshots/9ce791f330578f50da6bc52b54205166fb5d1c8c/pytorch_model.bin
[INFO|safetensors_conversion.py:61] 2025-08-27 12:42:41,211 >> Attempting to create safetensors variant
[INFO|safetensors_conversion.py:74] 2025-08-27 12:42:41,394 >> Safetensors PR exists
[INFO|modeling_utils.py:5121] 2025-08-27 12:42:41,580 >> Some weights of the model checkpoint at csebuetnlp/banglabert were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraini

Using Simplified FreeLB (memory-efficient) with epsilon=0.1
Training fold 5...


[INFO|trainer.py:934] 2025-08-27 12:42:42,874 >> The following columns in the Training set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:2409] 2025-08-27 12:42:42,887 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-08-27 12:42:42,888 >>   Num examples = 30,428
[INFO|trainer.py:2411] 2025-08-27 12:42:42,889 >>   Num Epochs = 1
[INFO|trainer.py:2412] 2025-08-27 12:42:42,889 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:2414] 2025-08-27 12:42:42,890 >>   Training with DataParallel so batch size has been adjusted to: 16
[INFO|trainer.py:2415] 2025-08-27 12:42:42,891 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-08-27 12:42:42,892 >>   Gradient Accumulati

Epoch,Training Loss,Validation Loss,Hate Accuracy,Severity Accuracy,To Whom Accuracy
1,2.4709,2.16053,0.709966,0.746253,0.707599


[INFO|trainer.py:934] 2025-08-27 13:40:50,104 >> The following columns in the Evaluation set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: text, to_whom, token_type_ids, hate_type, hate_severity. If text, to_whom, token_type_ids, hate_type, hate_severity are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 13:40:50,110 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-08-27 13:40:50,110 >>   Num examples = 7606
[INFO|trainer.py:4332] 2025-08-27 13:40:50,112 >>   Batch size = 16
[INFO|trainer.py:3993] 2025-08-27 13:43:41,338 >> Saving model checkpoint to ./distilBERT_fold_5/checkpoint-951
[INFO|trainer.py:4007] 2025-08-27 13:43:41,341 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.
[INFO|tokenization_utils_base.py:2525] 2025-08-27 13:43:42,310 >> tokenizer config file saved in ./distilBERT_fold_5/checkpoint-951/tokenizer_config.json
[INFO|tokeniz

Evaluating fold 5...




[INFO|trainer.py:934] 2025-08-27 13:46:38,635 >> The following columns in the test set don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `MultiTaskModel.forward`,  you can safely ignore this message.
[INFO|trainer.py:4327] 2025-08-27 13:46:38,640 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-08-27 13:46:38,641 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-08-27 13:46:38,641 >>   Batch size = 16


Predicting with fold 5 model...
Fold 5 - Hate Accuracy: 0.7100, Severity Accuracy: 0.7463, To Whom Accuracy: 0.7076

CROSS-VALIDATION COMPLETED

Cross-Validation Results:
   fold  train_loss  eval_loss  eval_hate_accuracy  eval_severity_accuracy  \
0     1    2.458296   2.152074            0.703300                0.746286   
1     2    2.477862   2.159992            0.712896                0.740108   
2     3    2.442395   2.177186            0.701065                0.731300   
3     4    2.478844   2.181624            0.707506                0.739188   
4     5    2.470876   2.160530            0.709966                0.746253   

   eval_to_whom_accuracy  
0               0.714605  
1               0.710530  
2               0.708032  
3               0.706980  
4               0.707599  

Average Results Across 5 Folds:
Average Training Loss: 2.4657
Average Validation Loss: 2.1663
Average Hate Type Accuracy: 0.7069 ± 0.0048
Average Severity Accuracy: 0.7406 ± 0.0062
Average To Whom 