In [None]:
from datasets import load_dataset
import pandas as pd

# Restrict XNLI to the four languages compared in this notebook
languages = ["en", "fr", "de", "es"]
dataset = {}
for lang_code in languages:
    dataset[lang_code] = load_dataset("facebook/xnli", lang_code)

# Flatten every (language, split) pair into a pandas frame tagged with its origin
df_list = [
    lang_splits[split_name].to_pandas().assign(language=lang_code, split=split_name)
    for lang_code, lang_splits in dataset.items()
    for split_name in ("train", "validation", "test")
]

# Stack all frames into one table with a fresh 0..n-1 index
df_all = pd.concat(df_list, ignore_index=True)

print("--- Combined multilingual XNLI sample ---")
print(df_all.head())

print("\nDataFrame shape:", df_all.shape)
print("Languages included:", df_all["language"].unique().tolist())
print("Columns:", list(df_all.columns))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


--- Combined multilingual XNLI sample ---
                                             premise  \
0  Conceptually cream skimming has two basic dime...   
1  you know during the season and i guess at at y...   
2  One of our number will carry out your instruct...   
3  How do you know ? All this is their informatio...   
4  yeah i tell you what though if you go price so...   

                                          hypothesis  label language  split  
0  Product and geography are what make cream skim...      1       en  train  
1  You lose the things to the following level if ...      0       en  train  
2  A member of my team will execute your orders w...      0       en  train  
3                 This information belongs to them .      0       en  train  
4          The tennis shoes have a range of prices .      1       en  train  

DataFrame shape: (1600808, 5)
Languages included: ['en', 'fr', 'de', 'es']
Columns: ['premise', 'hypothesis', 'label', 'language', 'split']


In [None]:
import torch
# Run on the GPU when CUDA is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
"""
Fine-tune & compare a monolingual model (bert-base-uncased)
and a multilingual model (xlm-roberta-base) on XNLI.

Trains on the English train split, validates on the English validation split,
and evaluates on the fr/de/es test sets using pure PyTorch (no Hugging Face Trainer).
"""

import os
import random
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import torch
from torch.utils.data import DataLoader
print('1')
from transformers import BertTokenizerFast, XLMRobertaTokenizerFast
print('2')
from transformers import AutoModelForSequenceClassification
print('3')
from torch.optim import AdamW  # Changed import source
print('4')
from transformers import get_scheduler
print('5')
from transformers import set_seed
print('6')
from transformers import DataCollatorWithPadding

1
2
3
4
5
6


In [None]:
# ----------------------- CONFIG -----------------------
# Seed every RNG the notebook touches so runs are repeatable.
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)


# Checkpoints being compared: one English-only, one multilingual.
monolingual_model_name = "bert-base-uncased"
multilingual_model_name = "xlm-roberta-base"

# Fine-tuning hyperparameters.
num_epochs = 1
batch_size = 16
learning_rate = 2e-5
max_length = 128  # premise/hypothesis pairs are truncated to this many tokens
languages = ["en", "fr", "de", "es"]

# Pick the accelerator: MPS first, then CUDA, then CPU. The MPS backend is
# looked up defensively because torch builds without it do not expose
# `torch.backends.mps` at all.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"✅ Using device: {device}")

# ----------------------- LOAD DATA -----------------------
print("📘 Loading XNLI dataset...")
# One DatasetDict (train/validation/test) per language code.
ds_per_lang = {lang: load_dataset("facebook/xnli", lang) for lang in languages}
ds_train_en = ds_per_lang["en"]["train"]
ds_val_en   = ds_per_lang["en"]["validation"]
# Non-English test splits used for the cross-lingual evaluation.
test_datasets = {lang: ds_per_lang[lang]["test"] for lang in ["fr", "de", "es"]}

# ----------------------- TOKENIZATION -----------------------
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of premise/hypothesis pairs.

    Args:
        examples: Mapping with "premise" and "hypothesis" entries.
        tokenizer: Callable invoked as ``tokenizer(premises, hypotheses, ...)``.

    Returns:
        Whatever ``tokenizer`` returns for the pair, with truncation enabled
        and the length capped at the module-level ``max_length``.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    encoding_options = {"max_length": max_length, "truncation": True}
    return tokenizer(premises, hypotheses, **encoding_options)

def prepare_dataset(ds, tokenizer):
    """Tokenize ``ds`` and expose it as torch tensors ready for a DataLoader.

    Args:
        ds: Dataset with "premise", "hypothesis" and "label" columns.
        tokenizer: Tokenizer forwarded to ``tokenize_function``.

    Returns:
        The tokenized dataset with "label" renamed to "labels" and its output
        format restricted to the model-input columns.
    """
    ds = ds.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    ds = ds.rename_column("label", "labels")
    # Keep "token_type_ids" when the tokenizer produced them: they carry the
    # premise/hypothesis segment ids and were previously dropped by a fixed
    # column whitelist. Tokenizers that do not emit them are unaffected.
    model_columns = ("input_ids", "token_type_ids", "attention_mask", "labels")
    available = set(ds.column_names)
    ds.set_format(type="torch", columns=[col for col in model_columns if col in available])
    return ds

# ----------------------- TRAINING FUNCTION -----------------------
def _load_tokenizer(model_name):
    """Return the fast tokenizer matching ``model_name``."""
    name = model_name.lower()
    # "roberta" contains the substring "bert", so it has to be tested first;
    # otherwise "xlm-roberta-base" would be loaded with the BERT tokenizer.
    if "roberta" in name or "bert" not in name:
        return XLMRobertaTokenizerFast.from_pretrained(model_name)
    return BertTokenizerFast.from_pretrained(model_name)


def _collect_predictions(model, loader, desc=None):
    """Run ``model`` over ``loader`` without gradients; return (labels, preds)."""
    model.eval()
    batches = tqdm(loader, desc=desc) if desc else loader
    labels, preds = [], []
    with torch.no_grad():
        for batch in batches:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            labels.extend(batch["labels"].cpu().tolist())
    return labels, preds


def train_and_evaluate(model_name, label_count=3):
    """Fine-tune ``model_name`` on English XNLI and evaluate it cross-lingually.

    Reads the module-level ``ds_per_lang``, ``device``, ``num_epochs``,
    ``batch_size`` and ``learning_rate``.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``.
        label_count: Number of classes; rows whose label falls outside
            ``0..label_count-1`` are filtered out before use.

    Returns:
        dict keyed by language code. "en" holds validation "accuracy" and
        "f1_macro"; every other language holds test "accuracy", "f1_macro",
        "precision", "recall" and "confusion_matrix".
    """
    print(f"\n🚀 Fine-tuning {model_name} ...")
    tokenizer = _load_tokenizer(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count).to(device)

    def clean_labels(ds):
        # Keep only rows whose label is a valid class index
        return ds.filter(lambda x: 0 <= x["label"] < label_count)

    # English provides train/validation; every other loaded language is test-only
    train_split = clean_labels(ds_per_lang["en"]["train"])
    val_split = clean_labels(ds_per_lang["en"]["validation"])
    test_splits = {
        lang: clean_labels(ds_per_lang[lang]["test"])
        for lang in ds_per_lang
        if lang != "en"
    }

    # Prepare datasets
    train_ds = prepare_dataset(train_split, tokenizer)
    val_ds = prepare_dataset(val_split, tokenizer)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # ---------------- TRAIN LOOP ----------------
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress.set_postfix({"loss": f"{loss.item():.4f}"})
        print(f"Average training loss: {total_loss / len(train_loader):.4f}")

    # ---------------- VALIDATION ----------------
    val_labels, val_preds = _collect_predictions(model, val_loader)
    acc = accuracy_score(val_labels, val_preds)
    f1 = f1_score(val_labels, val_preds, average="macro")
    print(f"Validation Accuracy: {acc:.4f} | F1 (macro): {f1:.4f}")

    results = {"en": {"accuracy": acc, "f1_macro": f1}}

    # ---------------- CROSS-LINGUAL EVAL ----------------
    for lang, ds_test in test_splits.items():
        print(f"\n🌍 Evaluating {model_name} on {lang.upper()} test set...")
        test_ds = prepare_dataset(ds_test, tokenizer)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

        labels, preds = _collect_predictions(model, test_loader, desc=f"Testing ({lang})")

        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average="macro")
        precision = precision_score(labels, preds, average="macro", zero_division=0)
        recall = recall_score(labels, preds, average="macro", zero_division=0)
        cm = confusion_matrix(labels, preds)

        results[lang] = {
            "accuracy": acc,
            "f1_macro": f1,
            "precision": precision,
            "recall": recall,
            "confusion_matrix": cm
        }

        print(f"{lang.upper()} → Accuracy: {acc:.4f}, F1: {f1:.4f}")
        print(f"Confusion matrix:\n{cm}\n")

    return results

✅ Using device: cuda
📘 Loading XNLI dataset...


In [None]:
# ----------------------- MAIN RUN -----------------------
if __name__ == "__main__":
    import pandas as pd

    def summarize(results, model_name):
        """Flatten one model's per-language results into a tidy DataFrame.

        Only "accuracy" and "f1_macro" are kept, since those are the metrics
        present for every language in ``results``.
        """
        return pd.DataFrame([
            {
                "model": model_name,
                "language": lang,
                "accuracy": res["accuracy"],
                "f1_macro": res["f1_macro"],
            }
            for lang, res in results.items()
        ])

    results_bert = train_and_evaluate(monolingual_model_name)
    results_xlmr = train_and_evaluate(multilingual_model_name)

    # Label rows with the configured checkpoint names so the summary cannot
    # drift from what was actually trained if the config changes.
    df_summary = pd.concat([
        summarize(results_bert, monolingual_model_name),
        summarize(results_xlmr, multilingual_model_name)
    ], ignore_index=True)

    print("\n===== 📊 Final Comparison =====")
    print(df_summary)
    df_summary.to_csv("xnli_model_comparison_summary.csv", index=False)
    print("✅ Saved summary to xnli_model_comparison_summary.csv")


🚀 Fine-tuning bert-base-uncased ...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Epoch 1/1:  41%|████▏     | 10150/24544 [42:36<1:00:27,  3.97it/s, loss=0.1979]