In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1B/blp25_hatespeech_subtask_1B_train.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1B/blp25_hatespeech_subtask_1B_dev.tsv
!wget https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1B/blp25_hatespeech_subtask_1B_dev_test.tsv
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install torch
!pip install scikit-learn
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import datasets
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import torch

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from sklearn.model_selection import StratifiedKFold


logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
train_file = 'blp25_hatespeech_subtask_1B_train.tsv'
validation_file = 'blp25_hatespeech_subtask_1B_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1B_dev_test.tsv'
import os
os.environ["WANDB_DISABLED"] = "true"
training_args = TrainingArguments(
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    output_dir="./results/",  # Changed to avoid conflict
    overwrite_output_dir=True,
    remove_unused_columns=False,
    local_rank= 1,
    load_best_model_at_end=True,
    save_total_limit=2,
    save_strategy="no",
    report_to=None
)

max_train_samples = None
max_eval_samples=None
max_predict_samples=None
max_seq_length = 512
batch_size = 16
transformers.utils.logging.set_verbosity_info()

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
model_name = 'xlm-roberta-base'
set_seed(training_args.seed)
l2id = {"None": 0, "Society": 1, "Organization": 2, "Community": 3, "Individual": 4}
train_df = pd.read_csv(train_file, sep='\t')
# print(train_df['label'])
train_df['label'] = train_df['label'].map(l2id).fillna(0).astype(int)
train_df = Dataset.from_pandas(train_df)
validation_df = pd.read_csv(validation_file, sep='\t')
validation_df['label'] = validation_df['label'].map(l2id).fillna(0).astype(int)
validation_df = Dataset.from_pandas(validation_df)
test_df = pd.read_csv(test_file, sep='\t')
#test_df['label'] = test_df['label'].map(l2id)
test_df = Dataset.from_pandas(test_df)

data_files = {"train": train_df, "validation": validation_df, "test": test_df}
for key in data_files.keys():
    logger.info(f"loading a local file for {key}")
raw_datasets = DatasetDict(
    {"train": train_df, "validation": validation_df, "test": test_df}
)
print(len(test_df['id']))
# Labels
label_list = raw_datasets["train"].unique("label")
print(label_list)
label_list.sort()  # sort the labels for determine
num_labels = len(label_list)
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task=None,
    cache_dir=None,
    revision="main",
    use_auth_token=None,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=None,
    use_fast=True,
    revision="main",
    use_auth_token=None,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_tf=bool(".ckpt" in model_name),
    config=config,
    cache_dir=None,
    revision="main",
    use_auth_token=None,
    ignore_mismatched_sizes=False,
)
non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
sentence1_key= non_label_column_names[1]

# Padding strategy
padding = "max_length"

# Some models have set the order of the labels to use, so let's make sure we do use it.
label_to_id = None
if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id):
    # Some have all caps in their config, some don't.
    label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
    if sorted(label_name_to_id.keys()) == sorted(label_list):
        label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: ",
            f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
            "\nIgnoring the model labels as a result.",)

if label_to_id is not None:
    model.config.label2id = label_to_id
    model.config.id2label = {id: label for label, id in config.label2id.items()}

if 128 > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({128}) is larger than the maximum length for the"
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}.")
max_seq_length = min(128, tokenizer.model_max_length)

def preprocess_function(examples):
    # Tokenize the texts
    args = (
        (examples[sentence1_key],))
    result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

    # Map labels to IDs (not necessary for GLUE tasks)
    if label_to_id is not None and "label" in examples:
        result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
    return result
raw_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)
if "train" not in raw_datasets:
    raise ValueError("requires a train dataset")
train_dataset = raw_datasets["train"]
if max_train_samples is not None:
    max_train_samples_n = min(len(train_dataset), max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples_n))
if "validation" not in raw_datasets:
    raise ValueError("requires a validation dataset")
eval_dataset = raw_datasets["validation"]
if max_eval_samples is not None:
    max_eval_samples_n = min(len(eval_dataset), max_eval_samples)
    eval_dataset = eval_dataset.select(range(max_eval_samples_n))
if "test" not in raw_datasets and "test_matched" not in raw_datasets:
    raise ValueError("requires a test dataset")
predict_dataset = raw_datasets["test"]
if max_predict_samples is not None:
    max_predict_samples_n = min(len(predict_dataset), max_predict_samples)
    predict_dataset = predict_dataset.select(range(max_predict_samples_n))

for index in random.sample(range(len(train_dataset)), 3):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
metric = evaluate.load("accuracy")

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}


data_collator = default_data_collator

train_dataset = train_dataset.remove_columns("id")
eval_dataset = eval_dataset.remove_columns("id")

# Add stratified 5-fold CV
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store results for each fold
fold_results = []
fold_probs = []

# Get labels for splitting
labels = np.array(train_dataset["label"])

original_output_dir = training_args.output_dir

for fold, (train_idx, val_idx) in enumerate(skf.split(np.arange(len(train_dataset)), labels)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{n_splits}")
    print(f"{'='*50}")
    
    # Create train and validation datasets for this fold
    fold_train = train_dataset.select(train_idx)
    fold_val = train_dataset.select(val_idx)
    
    print(f"Train size: {len(fold_train)}, Validation size: {len(fold_val)}")
    
    # Initialize model for this fold (fresh model each time)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        from_tf=bool(".ckpt" in model_name),
        config=config,
        cache_dir=None,
        revision="main",
        use_auth_token=None,
        ignore_mismatched_sizes=False,
    )
    
    # Update training arguments for this fold
    fold_dir = os.path.join(original_output_dir, f"fold_{fold}")
    os.makedirs(fold_dir, exist_ok=True)
    fold_training_args = TrainingArguments(
        learning_rate=2e-5,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        output_dir=fold_dir,
        overwrite_output_dir=True,
        remove_unused_columns=False,
        local_rank=1,
        load_best_model_at_end=True,
        save_total_limit=2,
        save_strategy="no",
        report_to=None
    )
    
    trainer = Trainer(
        model=model,
        args=fold_training_args,
        train_dataset=fold_train,
        eval_dataset=fold_val,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Train the model
    print(f"Training fold {fold + 1}...")
    train_result = trainer.train()
    
    # Evaluate the model
    print(f"Evaluating fold {fold + 1}...")
    eval_result = trainer.evaluate()
    
    # Store results
    fold_results.append({
        'fold': fold + 1,
        'train_loss': train_result.metrics.get('train_loss', None),
        'eval_loss': eval_result['eval_loss'],
        'eval_accuracy': eval_result['eval_accuracy']
    })
    
    # Generate predictions on test set for this fold
    print(f"Predicting with fold {fold + 1} model...")
    test_predictions = trainer.predict(predict_dataset.remove_columns("id") if "id" in predict_dataset.column_names else predict_dataset)
    probs = torch.softmax(torch.tensor(test_predictions.predictions), dim=-1).numpy()
    fold_probs.append(probs)
    
    # Clean up to save memory
    del model, trainer
    
    print(f"Fold {fold + 1} - Accuracy: {eval_result['eval_accuracy']:.4f}")
    
print(f"\n{'='*50}")
print("CROSS-VALIDATION COMPLETED")
print(f"{'='*50}")

# Analyze cross-validation results
import pandas as pd

results_df = pd.DataFrame(fold_results)
print("\nCross-Validation Results:")
print(results_df)

# Calculate average performance metrics
avg_train_loss = results_df['train_loss'].mean() if 'train_loss' in results_df else None
avg_eval_loss = results_df['eval_loss'].mean()
avg_acc = results_df['eval_accuracy'].mean()
std_acc = results_df['eval_accuracy'].std()

print(f"\nAverage Results Across {n_splits} Folds:")
if avg_train_loss is not None:
    print(f"Average Training Loss: {avg_train_loss:.4f}")
print(f"Average Validation Loss: {avg_eval_loss:.4f}")
print(f"Average Accuracy: {avg_acc:.4f} ± {std_acc:.4f}")

# Ensemble predictions
probs_folds = np.array(fold_probs)
ensemble_probs = np.mean(probs_folds, axis=0)

# Final ensemble prediction
final_preds = np.argmax(ensemble_probs, axis=1)

id2l = {v: k for k, v in l2id.items()}
logger.info("*** Predict ***")
ids = predict_dataset['id']
output_predict_file = os.path.join(training_args.output_dir, f"subtask_1A.tsv")
with open(output_predict_file, "w") as writer:
    logger.info(f"***** Predict results *****")
    writer.write("id\tlabel\tmodel\n")
    for index, item in enumerate(final_preds):
        item = id2l[item]
        writer.write(f"{ids[index]}\t{item}\t{model_name}\n")
print(f"\nPredictions saved to '{output_predict_file}'")

--2025-10-01 06:25:22--  https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1B/blp25_hatespeech_subtask_1B_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8005567 (7.6M) [text/plain]
Saving to: ‘blp25_hatespeech_subtask_1B_train.tsv’


2025-10-01 06:25:22 (85.1 MB/s) - ‘blp25_hatespeech_subtask_1B_train.tsv’ saved [8005567/8005567]

--2025-10-01 06:25:22--  https://raw.githubusercontent.com/AridHasan/blp25_task1/refs/heads/main/data/subtask_1B/blp25_hatespeech_subtask_1B_dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HT

2025-10-01 06:26:56.128124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759300016.325728      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759300016.383874      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


2512
[0, 4, 1, 3, 2]


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

[INFO|configuration_utils.py:698] 2025-10-01 06:27:12,821 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
[INFO|configuration_utils.py:770] 2025-10-01 06:27:12,827 >> Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

[INFO|configuration_utils.py:698] 2025-10-01 06:27:12,902 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
[INFO|configuration_utils.py:770] 2025-10-01 06:27:12,903 >> Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.52.4",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2023] 2025-10-01 06:27:13,831 >> loading file sentencepiece.bpe.model from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/sentencepiece.bpe.model
[INFO|tokenization_utils_base.py:2023] 2025-10-01 06:27:13,832 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/tokenizer.json
[INFO|tokenization_utils_base.py:2023] 2025-10-01 06:27:13,832 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2023] 2025-10-01 06:27:13,833 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2023] 2025-10-01 06:27:13,833 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/tokenizer_config.json
[INFO|tokenization_utils_base.py:2023] 2025-10

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

[INFO|modeling_utils.py:1151] 2025-10-01 06:27:19,995 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
[INFO|modeling_utils.py:5121] 2025-10-01 06:27:20,092 >> Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identic

Running tokenizer on dataset:   0%|          | 0/35522 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/2512 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/2512 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[INFO|modeling_utils.py:1151] 2025-10-01 06:27:24,521 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
[INFO|modeling_utils.py:5121] 2025-10-01 06:27:24,600 >> Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identic


FOLD 1/5
Train size: 28417, Validation size: 7105


  trainer = Trainer(
[INFO|trainer.py:2409] 2025-10-01 06:27:26,235 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-10-01 06:27:26,235 >>   Num examples = 28,417
[INFO|trainer.py:2411] 2025-10-01 06:27:26,236 >>   Num Epochs = 3
[INFO|trainer.py:2412] 2025-10-01 06:27:26,236 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2414] 2025-10-01 06:27:26,237 >>   Training with DataParallel so batch size has been adjusted to: 32
[INFO|trainer.py:2415] 2025-10-01 06:27:26,238 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-10-01 06:27:26,239 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2417] 2025-10-01 06:27:26,239 >>   Total optimization steps = 2,667
[INFO|trainer.py:2418] 2025-10-01 06:27:26,241 >>   Number of trainable parameters = 278,047,493


Training fold 1...




Step,Training Loss
500,1.1531
1000,0.9559
1500,0.8534
2000,0.8045
2500,0.7653


[INFO|trainer.py:2676] 2025-10-01 06:56:19,081 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:4327] 2025-10-01 06:56:19,178 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-10-01 06:56:19,178 >>   Num examples = 7105
[INFO|trainer.py:4332] 2025-10-01 06:56:19,179 >>   Batch size = 32


Evaluating fold 1...


[INFO|trainer.py:4327] 2025-10-01 06:57:07,639 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-10-01 06:57:07,639 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-10-01 06:57:07,640 >>   Batch size = 32


Predicting with fold 1 model...


[INFO|modeling_utils.py:1151] 2025-10-01 06:57:24,682 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
[INFO|modeling_utils.py:5121] 2025-10-01 06:57:24,768 >> Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identic

Fold 1 - Accuracy: 0.6953

FOLD 2/5
Train size: 28417, Validation size: 7105


  trainer = Trainer(
[INFO|trainer.py:2409] 2025-10-01 06:57:25,792 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-10-01 06:57:25,793 >>   Num examples = 28,417
[INFO|trainer.py:2411] 2025-10-01 06:57:25,794 >>   Num Epochs = 3
[INFO|trainer.py:2412] 2025-10-01 06:57:25,794 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2414] 2025-10-01 06:57:25,795 >>   Training with DataParallel so batch size has been adjusted to: 32
[INFO|trainer.py:2415] 2025-10-01 06:57:25,795 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-10-01 06:57:25,796 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2417] 2025-10-01 06:57:25,797 >>   Total optimization steps = 2,667
[INFO|trainer.py:2418] 2025-10-01 06:57:25,798 >>   Number of trainable parameters = 278,047,493


Training fold 2...




Step,Training Loss
500,1.1021
1000,0.8809
1500,0.8266
2000,0.7658
2500,0.7353


[INFO|trainer.py:2676] 2025-10-01 07:26:21,963 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:4327] 2025-10-01 07:26:22,060 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-10-01 07:26:22,060 >>   Num examples = 7105
[INFO|trainer.py:4332] 2025-10-01 07:26:22,061 >>   Batch size = 32


Evaluating fold 2...


[INFO|trainer.py:4327] 2025-10-01 07:27:10,204 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-10-01 07:27:10,204 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-10-01 07:27:10,205 >>   Batch size = 32


Predicting with fold 2 model...


[INFO|modeling_utils.py:1151] 2025-10-01 07:27:27,551 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
[INFO|modeling_utils.py:5121] 2025-10-01 07:27:27,636 >> Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identic

Fold 2 - Accuracy: 0.7099

FOLD 3/5
Train size: 28418, Validation size: 7104


  trainer = Trainer(
[INFO|trainer.py:2409] 2025-10-01 07:27:28,560 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-10-01 07:27:28,561 >>   Num examples = 28,418
[INFO|trainer.py:2411] 2025-10-01 07:27:28,562 >>   Num Epochs = 3
[INFO|trainer.py:2412] 2025-10-01 07:27:28,562 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2414] 2025-10-01 07:27:28,563 >>   Training with DataParallel so batch size has been adjusted to: 32
[INFO|trainer.py:2415] 2025-10-01 07:27:28,563 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-10-01 07:27:28,564 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2417] 2025-10-01 07:27:28,564 >>   Total optimization steps = 2,667
[INFO|trainer.py:2418] 2025-10-01 07:27:28,566 >>   Number of trainable parameters = 278,047,493


Training fold 3...




Step,Training Loss
500,1.0788
1000,0.8783
1500,0.802
2000,0.755
2500,0.7314


[INFO|trainer.py:2676] 2025-10-01 07:56:25,104 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:4327] 2025-10-01 07:56:25,200 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-10-01 07:56:25,201 >>   Num examples = 7104
[INFO|trainer.py:4332] 2025-10-01 07:56:25,202 >>   Batch size = 32


Evaluating fold 3...


[INFO|trainer.py:4327] 2025-10-01 07:57:13,346 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-10-01 07:57:13,347 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-10-01 07:57:13,347 >>   Batch size = 32


Predicting with fold 3 model...


[INFO|modeling_utils.py:1151] 2025-10-01 07:57:30,312 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
[INFO|modeling_utils.py:5121] 2025-10-01 07:57:30,396 >> Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identic

Fold 3 - Accuracy: 0.7026

FOLD 4/5
Train size: 28418, Validation size: 7104


  trainer = Trainer(
[INFO|trainer.py:2409] 2025-10-01 07:57:31,351 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-10-01 07:57:31,352 >>   Num examples = 28,418
[INFO|trainer.py:2411] 2025-10-01 07:57:31,353 >>   Num Epochs = 3
[INFO|trainer.py:2412] 2025-10-01 07:57:31,353 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2414] 2025-10-01 07:57:31,353 >>   Training with DataParallel so batch size has been adjusted to: 32
[INFO|trainer.py:2415] 2025-10-01 07:57:31,354 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-10-01 07:57:31,355 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2417] 2025-10-01 07:57:31,355 >>   Total optimization steps = 2,667
[INFO|trainer.py:2418] 2025-10-01 07:57:31,357 >>   Number of trainable parameters = 278,047,493


Training fold 4...




Step,Training Loss
500,1.0795
1000,0.8643
1500,0.796
2000,0.7459
2500,0.7058


[INFO|trainer.py:2676] 2025-10-01 08:26:27,181 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:4327] 2025-10-01 08:26:27,278 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-10-01 08:26:27,278 >>   Num examples = 7104
[INFO|trainer.py:4332] 2025-10-01 08:26:27,279 >>   Batch size = 32


Evaluating fold 4...


[INFO|trainer.py:4327] 2025-10-01 08:27:15,411 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-10-01 08:27:15,412 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-10-01 08:27:15,412 >>   Batch size = 32


Predicting with fold 4 model...


[INFO|modeling_utils.py:1151] 2025-10-01 08:27:32,431 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
[INFO|modeling_utils.py:5121] 2025-10-01 08:27:32,513 >> Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identic

Fold 4 - Accuracy: 0.7065

FOLD 5/5
Train size: 28418, Validation size: 7104


  trainer = Trainer(
[INFO|trainer.py:2409] 2025-10-01 08:27:33,455 >> ***** Running training *****
[INFO|trainer.py:2410] 2025-10-01 08:27:33,456 >>   Num examples = 28,418
[INFO|trainer.py:2411] 2025-10-01 08:27:33,456 >>   Num Epochs = 3
[INFO|trainer.py:2412] 2025-10-01 08:27:33,457 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2414] 2025-10-01 08:27:33,458 >>   Training with DataParallel so batch size has been adjusted to: 32
[INFO|trainer.py:2415] 2025-10-01 08:27:33,458 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2416] 2025-10-01 08:27:33,459 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2417] 2025-10-01 08:27:33,459 >>   Total optimization steps = 2,667
[INFO|trainer.py:2418] 2025-10-01 08:27:33,461 >>   Number of trainable parameters = 278,047,493


Training fold 5...




Step,Training Loss
500,1.0512
1000,0.8578
1500,0.7801
2000,0.742
2500,0.6934


[INFO|trainer.py:2676] 2025-10-01 08:56:30,690 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:4327] 2025-10-01 08:56:30,787 >> 
***** Running Evaluation *****
[INFO|trainer.py:4329] 2025-10-01 08:56:30,787 >>   Num examples = 7104
[INFO|trainer.py:4332] 2025-10-01 08:56:30,788 >>   Batch size = 32


Evaluating fold 5...


[INFO|trainer.py:4327] 2025-10-01 08:57:18,935 >> 
***** Running Prediction *****
[INFO|trainer.py:4329] 2025-10-01 08:57:18,936 >>   Num examples = 2512
[INFO|trainer.py:4332] 2025-10-01 08:57:18,936 >>   Batch size = 32


Predicting with fold 5 model...
Fold 5 - Accuracy: 0.7078

CROSS-VALIDATION COMPLETED

Cross-Validation Results:
   fold  train_loss  eval_loss  eval_accuracy
0     1    0.897156   0.818292       0.695285
1     2    0.852276   0.791257       0.709923
2     3    0.840356   0.775788       0.702562
3     4    0.829448   0.782158       0.706503
4     5    0.816365   0.777167       0.707770

Average Results Across 5 Folds:
Average Training Loss: 0.8471
Average Validation Loss: 0.7889
Average Accuracy: 0.7044 ± 0.0058

Predictions saved to './results/subtask_1A.tsv'
