In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, entry) for entry in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Download the four subtask 1C TSV splits (dev, dev_test, train, test) from the
# blp25_task1 GitHub repository into the current working directory.
# NOTE(review): the dev_test split is downloaded but not read by the code in this file.
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_dev.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_dev_test.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_train.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1C/blp25_hatespeech_subtask_1C_test.tsv

# Install the third-party libraries imported below (versions are not pinned).
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install torch
!pip install scikit-learn
# !pip install --upgrade accelerate
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import datasets
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import torch
from sklearn.model_selection import StratifiedKFold

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from transformers.modeling_outputs import SequenceClassifierOutput


logger = logging.getLogger(__name__)

# Send log records to stdout so they appear in the notebook output.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
# Local copies of the subtask 1C splits (downloaded with wget earlier in this file).
train_file = 'blp25_hatespeech_subtask_1C_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1C_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1C_test.tsv'
import os
os.environ["WANDB_DISABLED"] = "true"
# Script-level arguments. In the code visible in this file they supply the
# seed, the log level and the output directory of the final prediction file;
# every cross-validation fold builds its own TrainingArguments.
training_args = TrainingArguments(
    learning_rate=2e-5,
    num_train_epochs=2,  # Reduced to save memory
    per_device_train_batch_size=4,  # Small batch size for FreeLB memory requirements
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # Use gradient accumulation to maintain effective batch size
    output_dir="./freelb_banglabert/",
    overwrite_output_dir=True,
    remove_unused_columns=False,
    # local_rank is deliberately left at its default (-1): this is a
    # single-process notebook, not rank 1 of a distributed job.
    load_best_model_at_end=True,
    save_total_limit=2,
    save_strategy="no",
    # The string "none" disables every reporting integration; passing None
    # is resolved by Transformers to "all" installed integrations.
    report_to="none",
    warmup_ratio=0.1,  # Add warmup as recommended in FreeLB paper
    weight_decay=0.01,  # Add weight decay for better regularization
    fp16=False  # Disable FP16 for FreeLB compatibility
)

# Optional sub-sampling limits (None = use everything) and tokenization length.
max_train_samples = None
max_eval_samples=None
max_predict_samples=None
max_seq_length = 512
batch_size = 16
transformers.utils.logging.set_verbosity_info()

# Align the verbosity of this script, datasets and transformers with the
# log level derived from the training arguments.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
model_name = 'csebuetnlp/banglabert'
set_seed(training_args.seed)
# Label vocabularies: label string -> class index, one mapping per task.
hate_type_map = {
    label: index
    for index, label in enumerate(
        ['None', 'Religious Hate', 'Sexism', 'Political Hate', 'Profane', 'Abusive']
    )
}
severity_map = {
    label: index
    for index, label in enumerate(['Little to None', 'Mild', 'Severe'])
}
to_whom_map = {
    label: index
    for index, label in enumerate(
        ['None', 'Individual', 'Organization', 'Community', 'Society']
    )
}
# Inverse lookups: class index -> label string, used when writing predictions.
id2hate = dict(zip(hate_type_map.values(), hate_type_map.keys()))
id2sev = dict(zip(severity_map.values(), severity_map.keys()))
id2to = dict(zip(to_whom_map.values(), to_whom_map.keys()))

def _load_labeled_split(path):
    """Read a labelled TSV split and integer-encode its three label columns.

    Missing ``hate_type`` / ``to_whom`` cells and the literal string 'nan'
    are treated as the 'None' class (pandas' read_csv may parse the label
    text 'None' as NaN, since it is in the default NA list of recent
    versions). ``hate_severity`` gets no such default.

    Args:
        path: Path of a tab-separated file with ``hate_type``,
            ``hate_severity`` and ``to_whom`` columns.

    Returns:
        The DataFrame with the three label columns replaced by int ids.

    Raises:
        ValueError: If a label column holds a value absent from its mapping.
    """
    frame = pd.read_csv(path, sep='\t')
    for column in ('hate_type', 'to_whom'):
        frame[column] = frame[column].replace('nan', 'None').fillna('None')
    label_maps = (
        ('hate_type', hate_type_map),
        ('hate_severity', severity_map),
        ('to_whom', to_whom_map),
    )
    for column, mapping in label_maps:
        encoded = frame[column].map(mapping)
        unmapped = encoded.isna()
        if unmapped.any():
            # Fail with the offending values instead of an opaque int-cast error.
            unknown = sorted(frame.loc[unmapped, column].astype(str).unique())
            raise ValueError(f"{path}: unexpected values in column '{column}': {unknown}")
        frame[column] = encoded.astype(int)
    return frame


# Load training and validation data
train_df = _load_labeled_split(train_file)
validation_df = _load_labeled_split(validation_file)

# Combine training and validation data for cross-validation
combined_df = pd.concat([train_df, validation_df], ignore_index=True)
combined_dataset = Dataset.from_pandas(combined_df)

# Load test data separately
test_df = pd.read_csv(test_file, sep='\t')
if 'id' not in test_df.columns:
    # The ids are needed for the submission file; fail before training starts.
    raise KeyError('id')
test_dataset = Dataset.from_pandas(test_df)

# Create initial dataset dict for preprocessing
raw_datasets = DatasetDict({
    "combined": combined_dataset,
    "test": test_dataset
})

for key in raw_datasets:
    logger.info(f"loading a local file for {key}")

print(f"Combined dataset size: {len(combined_dataset)}")
print(f"Test dataset size: {len(test_df)}")
# Fast tokenizer matching the pretrained checkpoint. The deprecated
# `use_auth_token` keyword is not passed: its value here (None) was the
# default anyway, so dropping it does not change what is loaded.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=None,
    use_fast=True,
    revision="main",
)

class MultiTaskModel(torch.nn.Module):
    """Shared pretrained encoder with three linear classification heads.

    The heads predict hate type, severity and target ("to whom") from the
    encoder's hidden state at sequence position 0.
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and create one head per task."""
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        # Reuse the encoder's own config instead of loading it a second time.
        self.config = self.base_model.config
        hidden_size = self.config.hidden_size
        self.hate_type_head = torch.nn.Linear(hidden_size, len(hate_type_map))
        self.severity_head = torch.nn.Linear(hidden_size, len(severity_map))
        self.to_whom_head = torch.nn.Linear(hidden_size, len(to_whom_map))

    def forward(self, input_ids=None, attention_mask=None, labels=None, inputs_embeds=None):
        """Run the encoder and the three heads.

        Args:
            input_ids: Token ids; may be omitted when ``inputs_embeds`` is given.
            attention_mask: Optional attention mask forwarded to the encoder.
            labels: Optional tensor indexed as ``labels[:, 0]`` (hate type),
                ``labels[:, 1]`` (severity) and ``labels[:, 2]`` (to whom).
            inputs_embeds: Optional pre-computed input embeddings, e.g.
                adversarially perturbed ones. Forwarded to the encoder as-is;
                validation of the input_ids / inputs_embeds combination is
                left to the encoder.

        Returns:
            SequenceClassifierOutput whose ``logits`` is the tuple
            (hate_type_logits, severity_logits, to_whom_logits) and whose
            ``loss`` is the sum of the three cross-entropy losses, or None
            when ``labels`` is not given.
        """
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )
        # Hidden state of the first token serves as the sequence representation.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        hate_type_logits = self.hate_type_head(pooled_output)
        severity_logits = self.severity_head(pooled_output)
        to_whom_logits = self.to_whom_head(pooled_output)
        loss = None
        if labels is not None:
            hate_type_labels = labels[:, 0]
            severity_labels = labels[:, 1]
            to_whom_labels = labels[:, 2]
            loss_fct = torch.nn.CrossEntropyLoss()
            # Unweighted sum of the three per-task losses.
            loss = loss_fct(hate_type_logits, hate_type_labels.long()) + \
                   loss_fct(severity_logits, severity_labels.long()) + \
                   loss_fct(to_whom_logits, to_whom_labels.long())
        return SequenceClassifierOutput(
            loss=loss,
            logits=(hate_type_logits, severity_logits, to_whom_logits),
        )

# Every column of the combined split except a pre-existing "labels" column.
non_label_column_names = [name for name in raw_datasets["combined"].column_names if name != "labels"]
# Name of the column that holds the text to classify.
sentence1_key= 'text'

# Padding strategy: every example is padded to max_seq_length at tokenization time.
padding = "max_length"

# Never ask the tokenizer for more tokens than the model supports.
if max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({max_seq_length}) is larger than the maximum length for the "
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )
max_seq_length = min(max_seq_length, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize a batch of rows and attach multi-task labels when available.

    The text column named by ``sentence1_key`` is tokenized with the
    module-level ``padding`` and ``max_seq_length`` settings. If the batch
    has a "hate_type" column, a "labels" entry is added holding one
    [hate_type, hate_severity, to_whom] list per row.
    """
    encoded = tokenizer(
        examples[sentence1_key],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )
    if "hate_type" not in examples:
        # Unlabelled batch (e.g. the test split): nothing more to add.
        return encoded
    label_columns = (
        examples["hate_type"],
        examples["hate_severity"],
        examples["to_whom"],
    )
    encoded["labels"] = [list(triple) for triple in zip(*label_columns)]
    return encoded

# Tokenize both splits in batches, reusing cached results when available.
raw_datasets = raw_datasets.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    load_from_cache_file=True,
    batched=True,
)
# Split used for cross-validation and split used for the final predictions.
combined_dataset, predict_dataset = raw_datasets["combined"], raw_datasets["test"]

# StratifiedKFold only needs one entry per sample (X) and the stratification
# target (y); folds are stratified on the hate_type class id.
y = np.array(combined_dataset["hate_type"])
X = np.arange(len(combined_dataset))

print(f"Total samples for cross-validation: {len(y)}")
print(f"Label distribution: {np.bincount(y)}")

# Shuffled 5-fold splitter with a fixed seed.
n_splits = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Per-fold metrics and per-fold test-set probabilities, filled by the fold loop.
fold_results, fold_probs = [], []
# Objects shared by every fold: accuracy metric and batch collator.
accuracy = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def compute_metrics(p: EvalPrediction):
    """Return the accuracy of each task head.

    ``p.predictions`` is indexed per task (0: hate type, 1: severity,
    2: to whom) and ``p.label_ids`` holds the matching label in the column
    with the same index. The predicted class is the argmax over axis 1.
    """
    metric_names = ('hate_accuracy', 'severity_accuracy', 'to_whom_accuracy')
    scores = {}
    for task_index, metric_name in enumerate(metric_names):
        predicted = np.argmax(p.predictions[task_index], axis=1)
        expected = p.label_ids[:, task_index]
        outcome = accuracy.compute(predictions=predicted, references=expected)
        scores[metric_name] = outcome['accuracy']
    return scores

class FreeLB():
    """Multi-step adversarial perturbation of a model's input embeddings."""

    def __init__(self, model):
        self.model = model
        self.backup = {}

    def _find_embeddings(self, emb_name):
        """Return the embedding module to perturb.

        The module registered under ``emb_name`` is preferred; if there is
        none, the first module whose name contains 'word_embeddings' is used.

        Raises:
            ValueError: If neither lookup finds a module.
        """
        modules = dict(self.model.named_modules())
        # The empty name is the root module itself, never an embedding layer.
        if emb_name and emb_name in modules:
            return modules[emb_name]
        for name, module in modules.items():
            if 'word_embeddings' in name:
                return module
        raise ValueError("Could not find word embeddings layer")

    def attack(self, input_ids, attention_mask, labels, epsilon=0.3, alpha=0.01, K=3, emb_name='base_model.embeddings.word_embeddings'):
        """
        FreeLB adversarial training implementation.

        Runs K forward/backward passes on perturbed embeddings. Each pass
        back-propagates ``loss / K``, so the model parameters' ``.grad``
        fields accumulate the K-step average gradient as a side effect.
        ``self.model`` must accept an ``inputs_embeds`` keyword and return an
        object with a ``loss`` attribute.

        Args:
            input_ids: input token ids
            attention_mask: attention mask
            labels: target labels
            epsilon: maximum perturbation bound (Frobenius norm of the whole
                perturbation tensor)
            alpha: step size for gradient ascent
            K: number of PGD steps
            emb_name: name of embedding layer (as listed by named_modules())
        Returns:
            Tuple ``(embeds_init, delta)``: the clean embeddings and the
            final perturbation.
        Raises:
            ValueError: if no embedding layer can be located.
        """
        embeddings = self._find_embeddings(emb_name)

        # Get initial embeddings
        embeds_init = embeddings(input_ids)

        # Random initialization of perturbation; zeros_like already matches
        # the dtype and device of the embeddings, so no cast is needed.
        delta = torch.zeros_like(embeds_init).uniform_(-epsilon, epsilon)
        delta.requires_grad_()

        for step in range(K):
            # Add perturbation to embeddings
            inputs_embeds = embeds_init + delta

            # Forward pass with perturbed embeddings
            outputs = self.model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                labels=labels
            )

            # Scale loss by 1/K so the accumulated gradient is an average
            loss = outputs.loss / K

            # The embedding lookup is shared by all K passes, so its graph
            # must survive each backward call.
            loss.backward(retain_graph=True)

            # Update perturbation via gradient ascent
            if step < K - 1:  # Don't update delta on last step
                grad = delta.grad.data
                norm = torch.norm(grad, p='fro')
                if norm != 0 and not torch.isnan(norm):
                    delta.data = delta.data + alpha * grad / norm
                    # Project back to epsilon ball
                    delta_norm = torch.norm(delta.data, p='fro')
                    if delta_norm > epsilon:
                        delta.data = epsilon * delta.data / delta_norm
                delta.grad.zero_()

        return embeds_init, delta

class CustomTrainer(Trainer):
    """Trainer whose training step can apply FreeLB-style adversarial training.

    When FreeLB is enabled and the model exposes
    ``base_model.embeddings.word_embeddings``, each training step runs several
    forward/backward passes in which a perturbation ``delta`` is added to the
    output of the word-embedding layer. Otherwise (or if the FreeLB pass
    raises) a plain forward/backward pass is used.
    """

    def __init__(self, *args, use_freelb=True, freelb_steps=2, freelb_epsilon=0.2, freelb_alpha=0.02, **kwargs):
        """
        Args:
            use_freelb: Enable the adversarial training step.
            freelb_steps: Number of forward/backward passes per batch (K).
            freelb_epsilon: Per-sample bound on the Frobenius norm of delta.
            freelb_alpha: Gradient-ascent step size for delta.
            *args, **kwargs: Forwarded unchanged to ``Trainer``.

        Raises:
            ValueError: If ``freelb_steps`` is smaller than 1.
        """
        super().__init__(*args, **kwargs)
        if freelb_steps < 1:
            raise ValueError("freelb_steps must be at least 1")
        self.use_freelb = use_freelb
        self.freelb_steps = freelb_steps
        self.freelb_epsilon = freelb_epsilon
        self.freelb_alpha = freelb_alpha

    def _standard_training_step(self, model, inputs):
        """Plain forward/backward pass; returns the detached, accumulation-scaled loss."""
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps
        if self.args.n_gpu > 1:
            loss = loss.mean()
        self.accelerator.backward(loss)
        return loss.detach()

    @staticmethod
    def _find_word_embeddings(model):
        """Return ``model.base_model.embeddings.word_embeddings`` or None if any link is missing."""
        base = getattr(model, 'base_model', None)
        embeddings = getattr(base, 'embeddings', None)
        return getattr(embeddings, 'word_embeddings', None)

    def _freelb_training_step(self, model, inputs, word_embeddings):
        """Run the K-step adversarial pass and return the loss to log.

        Gradients of all K passes are accumulated in the parameters' ``.grad``
        fields; each pass back-propagates ``loss / K`` (further divided by
        ``gradient_accumulation_steps``). The return value is the sum of those
        scaled losses, i.e. the K-step average on the same scale as the value
        returned by the standard step.
        """
        steps = self.freelb_steps
        epsilon = self.freelb_epsilon
        alpha = self.freelb_alpha
        accumulation = self.args.gradient_accumulation_steps

        input_ids = inputs.get('input_ids')
        attention_mask = inputs.get('attention_mask')
        labels = inputs.get('labels')
        batch_size = input_ids.size(0)

        # The clean lookup is only a shape/dtype/device template for delta,
        # so it does not need a graph.
        with torch.no_grad():
            template = word_embeddings(input_ids)

        # Uniform init in [-epsilon, epsilon], then rescaled so each sample's
        # perturbation has Frobenius norm epsilon.
        # NOTE(review): the norm is taken over every position of the
        # sequence, padded positions included -- confirm that is intended.
        delta = torch.zeros_like(template).uniform_(-epsilon, epsilon)
        flat = delta.reshape(batch_size, -1)
        flat_norm = torch.norm(flat, p='fro', dim=1, keepdim=True)
        delta = (flat * epsilon / (flat_norm + 1e-8)).view_as(template)
        delta = delta.detach().requires_grad_(True)

        # Gradients are NOT zeroed here: the training loop accumulates them
        # over gradient_accumulation_steps micro-batches and clears them
        # itself after the optimizer step.
        logged_loss = None
        for step in range(steps):
            # Adding delta to the live embedding output (instead of replacing
            # it with a detached copy) keeps the embedding weights in the
            # graph, so they receive gradients like every other parameter.
            def add_perturbation(module, hook_inputs, output, perturbation=delta):
                return output + perturbation

            hook = word_embeddings.register_forward_hook(add_perturbation)
            try:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            finally:
                # The hook must never outlive this forward pass.
                hook.remove()

            # Scale loss by 1/K for averaging, then for gradient accumulation.
            scaled_loss = outputs.loss / steps
            if accumulation > 1:
                scaled_loss = scaled_loss / accumulation

            # Every pass builds a fresh graph, so nothing has to be retained.
            self.accelerator.backward(scaled_loss)

            step_loss = scaled_loss.detach()
            logged_loss = step_loss if logged_loss is None else logged_loss + step_loss

            # Update adversarial perturbation (except on last step)
            if step < steps - 1 and delta.grad is not None:
                # Gradient ascent on delta along the per-sample normalized gradient.
                grad_flat = delta.grad.detach().reshape(batch_size, -1)
                grad_norm = torch.norm(grad_flat, p='fro', dim=1, keepdim=True)
                direction = (grad_flat / (grad_norm + 1e-8)).view_as(delta)
                moved = delta.detach() + alpha * direction

                # Project back to the epsilon ball: samples whose norm is at
                # most epsilon are left unchanged by the clamp.
                moved_flat = moved.reshape(batch_size, -1)
                moved_norm = torch.norm(moved_flat, p='fro', dim=1, keepdim=True)
                moved = (moved_flat * epsilon / torch.clamp(moved_norm, min=epsilon)).view_as(template)
                delta = moved.detach().requires_grad_(True)

        return logged_loss

    def training_step(self, model: torch.nn.Module, inputs: dict, num_items_in_batch: Optional[int] = None) -> torch.Tensor:
        """
        Perform one training step, with FreeLB when it is enabled and possible.

        Args:
            model: The model to train.
            inputs: Batch dictionary; the FreeLB path reads 'input_ids',
                'attention_mask' and 'labels' from it.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The detached loss for this micro-batch, divided by
            ``gradient_accumulation_steps`` when that is greater than 1.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        if not self.use_freelb:
            return self._standard_training_step(model, inputs)

        # If `model` does not expose base_model.embeddings.word_embeddings
        # (e.g. because it is wrapped -- TODO confirm for multi-GPU runs),
        # FreeLB cannot be applied and the standard step is used.
        word_embeddings = self._find_word_embeddings(model)
        if word_embeddings is None:
            return self._standard_training_step(model, inputs)

        try:
            return self._freelb_training_step(model, inputs, word_embeddings)
        except Exception as e:
            # Best-effort fallback to standard training.
            # NOTE: gradients already accumulated by a partially completed
            # FreeLB pass are not rolled back before the fallback runs.
            print(f"FreeLB error: {e}, using standard training")
            return self._standard_training_step(model, inputs)

import gc

# The test features are identical for every fold, so drop the "id" column once.
predict_features = predict_dataset.remove_columns("id") if "id" in predict_dataset.column_names else predict_dataset

# Train, evaluate and predict with a fresh model on every stratified fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{n_splits}")
    print(f"{'='*50}")

    # Create train and validation datasets for this fold
    train_dataset = combined_dataset.select(train_idx.tolist())
    val_dataset = combined_dataset.select(val_idx.tolist())

    # Remove ID columns
    train_dataset = train_dataset.remove_columns("id") if "id" in train_dataset.column_names else train_dataset
    val_dataset = val_dataset.remove_columns("id") if "id" in val_dataset.column_names else val_dataset

    print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

    # Initialize model for this fold (fresh model each time)
    model = MultiTaskModel(model_name)

    # Training arguments for this fold
    fold_training_args = TrainingArguments(
        learning_rate=2e-5,
        num_train_epochs=2,  # Reduced to save memory
        per_device_train_batch_size=4,  # Very small batch size for FreeLB
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,  # Maintain effective batch size of 16
        output_dir=f"./freelb_fold_{fold+1}/",
        overwrite_output_dir=True,
        remove_unused_columns=True,  # Changed to True to fix the tensor conversion error
        # local_rank is left at its default (-1): single-process run.
        load_best_model_at_end=True,
        save_total_limit=1,
        save_strategy="epoch",
        eval_strategy="epoch",
        logging_strategy="epoch",
        # "none" disables all reporting integrations; None would be resolved
        # by Transformers to "all" installed integrations.
        report_to="none",
        warmup_ratio=0.1,  # Add warmup as recommended
        weight_decay=0.01,  # Add weight decay for regularization
        fp16=False,  # Disable FP16 for FreeLB compatibility
        seed=42 + fold  # Different seed for each fold
    )

    # Initialize trainer for this fold - FreeLB is enabled for every fold
    trainer = CustomTrainer(
        model=model,
        args=fold_training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
        use_freelb=True  # Set to False if memory issues persist
    )

    # Train the model
    print(f"Training fold {fold + 1}...")
    train_result = trainer.train()

    # Evaluate the model
    print(f"Evaluating fold {fold + 1}...")
    eval_result = trainer.evaluate()

    # Store results
    fold_results.append({
        'fold': fold + 1,
        'train_loss': train_result.metrics['train_loss'],
        'eval_loss': eval_result['eval_loss'],
        'eval_hate_accuracy': eval_result['eval_hate_accuracy'],
        'eval_severity_accuracy': eval_result['eval_severity_accuracy'],
        'eval_to_whom_accuracy': eval_result['eval_to_whom_accuracy']
    })

    # Generate predictions on test set for this fold; each head's logits are
    # turned into probabilities so folds can be averaged later.
    print(f"Predicting with fold {fold + 1} model...")
    test_predictions = trainer.predict(predict_features)
    probs = [torch.softmax(torch.tensor(logits), dim=-1).numpy() for logits in test_predictions.predictions]
    fold_probs.append(probs)

    # Clean up to save memory: dropping the names alone does not hand cached
    # GPU memory back, so collect garbage and empty the CUDA cache as well.
    del model, trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Fold {fold + 1} - Hate Accuracy: {eval_result['eval_hate_accuracy']:.4f}, Severity Accuracy: {eval_result['eval_severity_accuracy']:.4f}, To Whom Accuracy: {eval_result['eval_to_whom_accuracy']:.4f}")

print(f"\n{'='*50}")
print("CROSS-VALIDATION COMPLETED")
print(f"{'='*50}")
# Summarize the per-fold metrics collected by the cross-validation loop.
import pandas as pd

results_df = pd.DataFrame(fold_results)
print("\nCross-Validation Results:")
print(results_df)

# Column-wise mean and sample standard deviation across folds.
fold_means = results_df.mean()
fold_stds = results_df.std()
avg_train_loss = fold_means['train_loss']
avg_eval_loss = fold_means['eval_loss']
avg_hate_acc = fold_means['eval_hate_accuracy']
avg_sev_acc = fold_means['eval_severity_accuracy']
avg_to_acc = fold_means['eval_to_whom_accuracy']
std_hate_acc = fold_stds['eval_hate_accuracy']
std_sev_acc = fold_stds['eval_severity_accuracy']
std_to_acc = fold_stds['eval_to_whom_accuracy']

print(f"\nAverage Results Across {n_splits} Folds:")
print(f"Average Training Loss: {avg_train_loss:.4f}")
print(f"Average Validation Loss: {avg_eval_loss:.4f}")
print(f"Average Hate Type Accuracy: {avg_hate_acc:.4f} ± {std_hate_acc:.4f}")
print(f"Average Severity Accuracy: {avg_sev_acc:.4f} ± {std_sev_acc:.4f}")
print(f"Average To Whom Accuracy: {avg_to_acc:.4f} ± {std_to_acc:.4f}")

# Ensemble: stack each task's test probabilities over folds (axis 0 = fold),
# average across folds, then pick the most probable class per sample.
hate_probs_folds, sev_probs_folds, to_probs_folds = (
    np.array([per_fold[task] for per_fold in fold_probs]) for task in range(3)
)
ensemble_probs = tuple(
    stacked.mean(axis=0)
    for stacked in (hate_probs_folds, sev_probs_folds, to_probs_folds)
)
# np.save('ensemble_probs_aug20.npy', np.array(ensemble_probs, dtype=object))
# Final ensemble prediction
hate_probs, sev_probs, to_probs = ensemble_probs
final_hate_preds = hate_probs.argmax(axis=1)
final_sev_preds = sev_probs.argmax(axis=1)
final_to_preds = to_probs.argmax(axis=1)

# Generate predictions


# # Also save the ensemble predictions with different format for comparison
# submission_df = pd.DataFrame({'id': test_df['id'], 'hate_type': [id2hate[p] for p in final_hate_preds], 'hate_severity': [id2sev[p] for p in final_sev_preds], 'to_whom': [id2to[p] for p in final_to_preds]})
# submission_df.to_csv('ensemble_submission.tsv', sep='\t', index=False)

# print("Ensemble predictions also saved to 'ensemble_submission.tsv'")
import os
os.makedirs(training_args.output_dir, exist_ok=True)
logger.info("*** Predict ***")
ids = test_df['id']
output_predict_file = os.path.join(training_args.output_dir, "subtask_1C.tsv")

# Every test id needs exactly one prediction; refuse to write a misaligned file.
if len(ids) != len(final_hate_preds):
    raise ValueError(
        f"Number of test ids ({len(ids)}) does not match number of predictions ({len(final_hate_preds)})"
    )

# Write predictions in the required format: one tab-separated row per test
# sample with the decoded label of each task and the model name.
with open(output_predict_file, "w", encoding="utf-8") as writer:
    logger.info("***** Predict results *****")
    writer.write("id\thate_type\thate_severity\tto_whom\tmodel\n")
    # zip pairs ids and predictions by position, independent of the index
    # labels carried by the `ids` Series.
    for sample_id, hate_idx, sev_idx, to_idx in zip(ids, final_hate_preds, final_sev_preds, final_to_preds):
        h = id2hate[int(hate_idx)]
        s = id2sev[int(sev_idx)]
        t = id2to[int(to_idx)]
        writer.write(f"{sample_id}\t{h}\t{s}\t{t}\t{model_name}\n")

print(f"\nPredictions saved to '{output_predict_file}'")