In [None]:
## Step 1: switch the runtime to GPU, because multilingual transformer models are too slow
## on CPU
### Step 2: install the datasets and libraries
### Step 3: import the required libraries and verify that the GPU is enabled
### Step 4: load the XNLI dataset


In [None]:
!pip install -q transformers datasets evaluate torch accelerate sentencepiece

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import evaluate
import pandas as pd
import numpy as np

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

device


In [None]:
## Choosing the languages to work with. The ones we want are
## en, hi, te, ta, ur.
## XNLI does not include te and ta, so for now we use only the languages it provides
## and plan to build our own data for the others later.
# Language codes to evaluate; each one is passed to load_dataset as the XNLI
# configuration name.
languages = ["en", "hi", "ur"]  # start with these (XNLI supports them)

# Map every language code to the validation split of its XNLI configuration.
datasets = {
    lang: load_dataset("xnli", lang, split="validation")
    for lang in languages
}

datasets



In [None]:
from datasets import Dataset

def add_language_column(ds, lang):
    """Return ``ds`` with a ``"language"`` field set to ``lang`` on every row.

    Args:
        ds: Object exposing ``map(function)``; the function is called with one
            example and returns the fields to add to it.
        lang: Value stored under the ``"language"`` key.

    Returns:
        Whatever ``ds.map`` returns for the tagging function.
    """
    def _tag_with_language(_example):
        # The example's own contents are irrelevant; every row gets the same tag.
        return {"language": lang}

    tagged = ds.map(_tag_with_language)
    return tagged

# Tag each per-language dataset with its language code, keeping the order in
# which the languages were loaded.
datasets_with_lang = [
    add_language_column(ds, lang)
    for lang, ds in datasets.items()
]

datasets_with_lang


In [None]:
from datasets import concatenate_datasets

# Combine the language-tagged datasets into a single validation set.
val_data = concatenate_datasets(datasets_with_lang)
val_data


In [None]:
from collections import Counter
# Tally how many rows carry each language tag.
Counter(val_data["language"])


In [None]:
# Load the accuracy metric from the `evaluate` library; the evaluation
# function defined later in this notebook calls its `compute` method.
accuracy_metric = evaluate.load("accuracy")


### Model 1: mBERT (Base, Not Fine-Tuned on XNLI)

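We evaluate the base multilingual BERT model in a zero-shot setting to establish a weak multilingual baseline. Because its 3-way classification head is newly initialized and never trained on NLI data, accuracy close to chance (about 33%) is expected.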


In [None]:
# Load the mBERT model and tokenizer.
model_name = "bert-base-multilingual-cased"

# The base checkpoint has no NLI head, so the 3-way classifier created below
# is randomly initialised. Seed PyTorch first so the head, and therefore the
# reported baseline accuracy, is the same on every Restart & Run All.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
).to(device)

# Switch to inference mode (turns off dropout).
model.eval()


In [None]:
## preprocessing function
def preprocess(batch):
    """Tokenize a batch of premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        batch: Mapping with ``"premise"`` and ``"hypothesis"`` entries, which
            are passed to the tokenizer as the first and second text inputs.

    Returns:
        The tokenizer's output, truncated and padded to exactly 128 tokens.
    """
    premises = batch["premise"]
    hypotheses = batch["hypothesis"]

    encoding = tokenizer(
        premises,
        hypotheses,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoding


In [None]:
## tokenize the combined validation set in batches with `preprocess`
encoded_val = val_data.map(preprocess, batched=True)

# Have the dataset return PyTorch tensors for the model inputs and the label.
encoded_val.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
## evaluation function
from torch.utils.data import DataLoader

def _align_predictions_to_dataset(model, dataset):
    """Return a ``{model class id: dataset label id}`` mapping, or ``None``.

    A classifier's output index order need not match the dataset's label ids
    (e.g. a checkpoint ordered contradiction/neutral/entailment scored against
    a dataset ordered entailment/neutral/contradiction). When the model config
    exposes ``id2label`` and the dataset's ``"label"`` feature exposes
    ``names``, and the two name sets match one-to-one (case-insensitively),
    the mapping between them is returned. In every other case ``None`` is
    returned and the caller compares raw indices.
    """
    id2label = getattr(getattr(model, "config", None), "id2label", None)
    features = getattr(dataset, "features", None) or {}
    label_feature = features.get("label") if hasattr(features, "get") else None
    names = getattr(label_feature, "names", None)
    if not id2label or not names:
        return None

    reference_id = {str(name).lower(): idx for idx, name in enumerate(names)}
    mapping = {}
    for class_id, class_name in id2label.items():
        target = reference_id.get(str(class_name).lower())
        if target is None:
            # Generic names such as "LABEL_0" carry no semantics to align on.
            return None
        mapping[int(class_id)] = target

    expected_ids = list(range(len(names)))
    if sorted(mapping) != expected_ids or sorted(mapping.values()) != expected_ids:
        return None
    return mapping


def evaluate_model(model, dataset, batch_size=16):
    """Compute classification accuracy of ``model`` over ``dataset``.

    Predictions are the argmax over the model's logits. When the model's
    ``config.id2label`` names can be matched against the dataset's label
    names, predicted class indices are first translated into the dataset's
    label ids, so a checkpoint with a different class order is scored
    correctly; otherwise raw indices are compared.

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        dataset: Indexable dataset whose items provide ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        batch_size: Number of examples per forward pass.

    Returns:
        The result of ``accuracy_metric.compute`` for the collected
        predictions and reference labels.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size)
    class_to_label = _align_predictions_to_dataset(model, dataset)

    all_preds = []
    all_labels = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=-1).cpu().tolist()
            if class_to_label is not None:
                preds = [class_to_label[pred] for pred in preds]

            all_preds.extend(preds)
            # Labels are only compared on the host, so they never need to
            # visit the accelerator.
            all_labels.extend(batch["label"].tolist())

    return accuracy_metric.compute(
        predictions=all_preds,
        references=all_labels
    )


In [None]:
## evaluation per language
results = {}

# Iterate over the shared `languages` list so this loop stays in sync with
# the data that was loaded, instead of repeating the codes by hand.
for lang in languages:
    # Hand only the "language" column to the predicate; the tokenized columns
    # are not needed to decide which rows belong to this language.
    subset = encoded_val.filter(
        lambda row_lang: row_lang == lang,
        input_columns=["language"],
    )
    acc = evaluate_model(model, subset)
    results[lang] = acc["accuracy"]

results

In [None]:
## tabulate the per-language accuracies, one row per language code
results_df = pd.DataFrame(
    {
        "Language": list(results.keys()),
        "Accuracy": list(results.values()),
    }
)

results_df

### Note on XLM-R Performance

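Although XLM-R is fine-tuned on XNLI, evaluating it correctly depends on details such as the label-index convention: this checkpoint orders its classes as contradiction / neutral / entailment (check `xlmr_model.config.id2label`), whereas the XNLI dataset uses entailment / neutral / contradiction.
Our results highlight that naive evaluation, which compares raw argmax indices with dataset labels without aligning them, can still lead to near-chance accuracy, motivating careful benchmarking.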


In [None]:
# Evaluate XLM-R (fine-tuned) on the same languages
xlmr_model_name = "joeddav/xlm-roberta-large-xnli"

xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# No num_labels override here: the classification head comes from the checkpoint.
# NOTE(review): the checkpoint's class order may differ from the XNLI label
# ids; confirm via xlmr_model.config.id2label before comparing raw indices.
xlmr_model = AutoModelForSequenceClassification.from_pretrained(xlmr_model_name)
xlmr_model = xlmr_model.to(device)

xlmr_model.eval()

In [None]:
def preprocess_xlmr(batch):
    """Tokenize premise/hypothesis pairs with the module-level ``xlmr_tokenizer``.

    Args:
        batch: Mapping with ``"premise"`` and ``"hypothesis"`` entries, which
            are passed to the tokenizer as the first and second text inputs.

    Returns:
        The tokenizer's output, truncated and padded to exactly 128 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return xlmr_tokenizer(batch["premise"], batch["hypothesis"], **tokenizer_options)


In [None]:
# Tokenize the combined validation set in batches with `preprocess_xlmr`.
encoded_val_xlmr = val_data.map(preprocess_xlmr, batched=True)

# Have the dataset return PyTorch tensors for the model inputs and the label.
encoded_val_xlmr.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
# Per-language accuracy of the XLM-R model.
xlmr_results = {}

# Iterate over the shared `languages` list so this loop stays in sync with
# the data that was loaded, instead of repeating the codes by hand.
for lang in languages:
    # Hand only the "language" column to the predicate; the tokenized columns
    # are not needed to decide which rows belong to this language.
    subset = encoded_val_xlmr.filter(
        lambda row_lang: row_lang == lang,
        input_columns=["language"],
    )
    acc = evaluate_model(xlmr_model, subset)
    xlmr_results[lang] = acc["accuracy"]

xlmr_results


In [None]:
# Quick check of the mBERT results dict. A cell displays only its last
# expression, so the bare `results` line shows nothing; only the keys appear.
results
results.keys()


In [None]:
# Side-by-side accuracy table, one row per evaluated language. Rows are driven
# by the shared `languages` list so the table cannot drift from the languages
# that were actually loaded and scored.
comparison_df = pd.DataFrame({
    "Language": list(languages),
    "mBERT": [results[code] for code in languages],
    "XLM-R": [xlmr_results[code] for code in languages],
})

comparison_df


In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one group per language, one bar per model.
comparison_df.plot(
    x="Language",
    y=["mBERT", "XLM-R"],
    kind="bar",
    title="Zero-shot XNLI Accuracy Across Languages"
)

plt.ylabel("Accuracy")
# Accuracy is a fraction in [0, 1]; show the full range so that bars above
# 50% are not cut off at the top of the axes.
plt.ylim(0, 1)
plt.show()
