<a href="https://colab.research.google.com/github/RyuichiSaito1/multilingual-economic-narratives/blob/main/notebooks/finetune_xlmr_sentiment_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the data and model paths used by this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install libraries (latest stable)
!pip -q install "transformers>=4.44.0" "accelerate>=0.33.0" datasets evaluate scikit-learn

import torch
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hTorch: 2.8.0+cu126
CUDA available: True
GPU: Tesla T4


In [3]:
from pathlib import Path

# Input: labelled CSV read by the data-loading cell
DATA_PATH = Path("/content/drive/MyDrive/multilingual-economic-narratives/data/pilot/labels_en_split.csv")

# Output directory on Drive (created here, including parents, if missing)
OUT_DIR = Path("/content/drive/MyDrive/multilingual-economic-narratives/models/xlmr_sentiment_pilot")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Model & task
MODEL_NAME = "xlm-roberta-base"
NUM_LABELS = 3                     # 0: Positive, 1: Neutral, 2: Negative
MAX_LEN = 256                      # max_length handed to the tokenizer (truncation is enabled there)

# Training hyperparameters (recommended)
LR = 2e-5
EPOCHS = 4                         # 3–5 is typical
TRAIN_BS = 16                      # per-device train batch size
EVAL_BS = 32                       # per-device eval batch size
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
SEED = 42                          # used for set_seed() and TrainingArguments(seed=...)


In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv(DATA_PATH)

# Expecting columns: ['id','body','sentiment','split', ...]
required = {"id","body","sentiment","split"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

# Keep only needed columns
df = df[["id","body","sentiment","split"]].copy()

# Cast types
df["sentiment"] = pd.to_numeric(df["sentiment"], errors="raise").astype(int)
df["split"] = df["split"].astype(str)

# Labels outside 0..NUM_LABELS-1 would be silently dropped from the per-class
# counts below (reindex) and are not valid targets for a NUM_LABELS-way
# classifier, so reject them here with a readable message.
bad_labels = sorted(set(df["sentiment"].unique().tolist()) - set(range(NUM_LABELS)))
if bad_labels:
    raise ValueError(
        f"Unexpected sentiment labels {bad_labels}; expected integers in 0..{NUM_LABELS - 1}"
    )

# Every row needs text to tokenize; fail early instead of inside the tokenizer.
n_missing_body = int(df["body"].isna().sum())
if n_missing_body:
    raise ValueError(f"{n_missing_body} rows have a missing 'body' value")

# Sanity: show counts by split x class
print("Counts by split × class")
print(df.pivot_table(index="split", columns="sentiment", aggfunc="size", fill_value=0))

# Train/dev/test subsets
train_df = df[df["split"]=="train"].reset_index(drop=True)
dev_df   = df[df["split"]=="dev"].reset_index(drop=True)
test_df  = df[df["split"]=="test"].reset_index(drop=True)

print("\nSizes  train/dev/test:", len(train_df), len(dev_df), len(test_df))

# Compute class weights from TRAIN ONLY: w_c = N / (K * n_c)
K = NUM_LABELS
N = len(train_df)
n_c = train_df["sentiment"].value_counts().reindex(range(NUM_LABELS), fill_value=0)

# A class with zero training rows would divide by zero and turn the normalized
# weights into inf/NaN, so stop with an explicit error instead.
empty_classes = n_c.index[n_c.values == 0].tolist()
if empty_classes:
    raise ValueError(
        f"No training examples for class(es) {empty_classes}; cannot compute class weights"
    )

class_weights = N / (K * n_c.values.astype(float))

# Normalize weights to mean=1 (optional but stabilizes LR feel)
class_weights = class_weights / class_weights.mean()

print("\nTrain counts per class:", n_c.to_dict())
print("Class weights (normalized):", class_weights)


Counts by split × class
sentiment    0    1    2
split                   
dev         37   37   39
test        39   41   40
train      321  322  321

Sizes  train/dev/test: 964 113 120

Train counts per class: {0: 321, 1: 322, 2: 321}
Class weights (normalized): [1.00103627 0.99792746 1.00103627]


In [5]:
from datasets import Dataset, DatasetDict

def to_hf(ds: pd.DataFrame) -> Dataset:
    """Build a Hugging Face Dataset from a split frame, renaming body->text and sentiment->label."""
    column_map = {"body": "text", "sentiment": "label"}
    renamed = ds.rename(columns=column_map)
    # preserve_index=False keeps the pandas index out of the resulting features.
    return Dataset.from_pandas(renamed, preserve_index=False)

# Bundle the three splits; the dev frame is exposed under the key "validation".
split_frames = (("train", train_df), ("validation", dev_df), ("test", test_df))
raw_datasets = DatasetDict({split_name: to_hf(frame) for split_name, frame in split_frames})
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'split'],
        num_rows: 964
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'split'],
        num_rows: 113
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'split'],
        num_rows: 120
    })
})

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_fn(ex):
    """Tokenize the "text" field of a batch, truncated to MAX_LEN and left unpadded."""
    encode_options = dict(truncation=True, padding=False, max_length=MAX_LEN)
    return tokenizer(ex["text"], **encode_options)

# Drop the raw text/id/split columns so only the label and tokenizer outputs remain.
columns_to_drop = ["text","id","split"]
tokenized = raw_datasets.map(tokenize_fn, batched=True, remove_columns=columns_to_drop)
tokenized


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/964 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 964
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 113
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 120
    })
})

In [7]:
import torch
from torch import nn
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer import TrainerCallback
from transformers import DataCollatorWithPadding
import evaluate
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from transformers import set_seed

# Fix the random seed through transformers.set_seed before the model is created.
set_seed(SEED)

# Trainer variant that applies per-class weights in the cross-entropy loss.
# The weight tensor is moved to the logits' device inside compute_loss.
class WeightedCELossTrainer(Trainer):
    """Trainer whose loss is an (optionally class-weighted) cross-entropy over the logits.

    Args:
        class_weights: Optional sequence/array with one weight per label. ``None``
            selects plain, unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = None
        if class_weights is not None:
            # Kept as a float tensor; moved to the right device at each compute_loss call.
            self.class_weights = torch.tensor(class_weights, dtype=torch.float)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training/eval loss for one batch.

        Args:
            model: The model to run.
            inputs: Batch dict passed to the model as keyword arguments; may contain "labels".
            return_outputs: When true, return ``(loss, outputs)`` instead of just ``loss``.
            num_items_in_batch: Accepted for compatibility with newer ``Trainer``
                versions, which pass it when calling ``compute_loss``. It is not
                used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if labels is not None:
            if self.class_weights is not None:
                w = self.class_weights.to(logits.device)
                loss_fct = nn.CrossEntropyLoss(weight=w)
            else:
                loss_fct = nn.CrossEntropyLoss()
            # Flatten using the logits' own last dimension rather than a config lookup,
            # so the reshape cannot disagree with what the model actually produced.
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        else:
            # No labels in the batch: fall back to whatever loss the model reports.
            loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

# Metrics: macro F1 is the main KPI
def compute_metrics(eval_pred):
    """Return macro-averaged F1 ("f1") and accuracy ("accuracy") for a (logits, labels) pair."""
    scores, gold = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "f1": f1_score(gold, predicted, average="macro"),
        "accuracy": accuracy_score(gold, predicted),
    }

# Data collator with dynamic padding: the examples were tokenized with padding=False,
# so padding is left to this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
import inspect

from transformers import EarlyStoppingCallback

# FP16/BF16 auto switch based on GPU capability
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8  # Ampere+ supports bf16
use_fp16 = torch.cuda.is_available() and not use_bf16

args = TrainingArguments(
    output_dir=str(OUT_DIR),
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    # `evaluation_strategy` was renamed to `eval_strategy`; newer transformers
    # releases reject the old keyword, and this notebook installs the latest one.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    # Early stopping below relies on the best checkpoint being tracked by macro F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=SEED,
    fp16=use_fp16,
    bf16=use_bf16,
    report_to="none"
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)

# Newer transformers releases take the tokenizer as `processing_class`, while
# older ones (including the 4.44 minimum this notebook allows) only accept
# `tokenizer`. Use whichever keyword the installed Trainer declares.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

# Stop when the tracked metric has not improved for 2 consecutive evaluations.
callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

trainer = WeightedCELossTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
    class_weights=class_weights,
    **{tokenizer_kwarg: tokenizer},
)

train_result = trainer.train()
print(train_result)


In [None]:
# Score the dev split (stored under the "validation" key)
dev_split = tokenized["validation"]
metrics_dev = trainer.evaluate(eval_dataset=dev_split)
print("Validation metrics:", metrics_dev)

# Score the test split with the same trainer
test_split = tokenized["test"]
metrics_test = trainer.evaluate(eval_dataset=test_split)
print("Test metrics:", metrics_test)


In [None]:
# Persist the trainer's model and the tokenizer
trainer.save_model()
tokenizer.save_pretrained(OUT_DIR)

# Predict on the test split and export a CSV with columns (id, pred)
import numpy as np
import pandas as pd

test_output = trainer.predict(tokenized["test"])
test_logits = test_output.predictions
test_preds = np.argmax(test_logits, axis=-1)

# test_df rows are in the same order as tokenized["test"], so ids and predictions line up.
pred_path = OUT_DIR / "predictions_xlmr_en_sentiment_test.csv"
out_pred = test_df[["id"]].assign(pred=test_preds)
out_pred.to_csv(pred_path, index=False)
print("Saved predictions to:", pred_path)
