In [1]:
!pip install datasets evaluate


Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

In [2]:
import torch
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
import evaluate

In [3]:
# Load Model & Tokenizer
model_name = "deepset/xlm-roberta-large-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [12]:
# Load SQuAD v2 Validation Set
dataset = load_dataset("RajeevanL/tamil_squad-2.0", split="validation")
# QA Pipeline for Prediction (Move to GPU if available)
device = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)
# Load Evaluation Metrics
squad_metric = evaluate.load("squad_v2")



Device set to use cuda:0


In [7]:
dataset

Dataset({
    features: ['Question', 'Context', 'Answer'],
    num_rows: 5848
})

In [16]:
# 🔥 Filter out None values for Context and Question
dataset = dataset.filter(lambda ex: ex["Context"] is not None and ex["Question"] is not None)

Filter:   0%|          | 0/5848 [00:00<?, ? examples/s]

In [17]:
dataset

Dataset({
    features: ['Question', 'Context', 'Answer'],
    num_rows: 5841
})

In [20]:
# Prepare Predictions & References
predictions = []
references = []

In [21]:
for example in dataset.select(range(100)):  # 🔥 Limit for quick testing
    context = example["Context"]
    question = example["Question"]
    ground_truths = example["Answer"]

    # 🔥 Fix: Handle missing answers
    if ground_truths is None or ground_truths == "":
        ground_truths = [""]
    else:
        ground_truths = [ground_truths]

    # Get Model Prediction
    pred = qa_pipeline({"context": context, "question": question})
    pred_text = pred["answer"]

    # Append to Evaluation Lists
    predictions.append({
        "id": str(example["Question"]),
        "prediction_text": pred_text,
        "no_answer_probability": 0.0
    })

    references.append({
        "id": str(example["Question"]),
        "answers": {
            "text": ground_truths,
            "answer_start": [0] * len(ground_truths)
        }
    })

In [23]:
# Debug: Print one sample
print("Sample Prediction:", predictions[92])
print("Sample Reference:", references[92])

Sample Prediction: {'id': 'அமெரிக்க வீட்டு இசையுடன் எந்த வகையான நடன இசை பின்னர் அறிமுகப்படுத்தப்பட்டது?', 'prediction_text': ' இத்தாலிய', 'no_answer_probability': 0.0}
Sample Reference: {'id': 'அமெரிக்க வீட்டு இசையுடன் எந்த வகையான நடன இசை பின்னர் அறிமுகப்படுத்தப்பட்டது?', 'answers': {'text': ['இத்தாலிய நடன இசை.'], 'answer_start': [0]}}


In [24]:
# Compute Exact Match (EM) & F1 Score
results = squad_metric.compute(predictions=predictions, references=references)


In [25]:
print("Results Keys:", results.keys())

Results Keys: dict_keys(['exact', 'f1', 'total', 'HasAns_exact', 'HasAns_f1', 'HasAns_total', 'best_exact', 'best_exact_thresh', 'best_f1', 'best_f1_thresh'])


In [26]:

# Print Results
print(f"Exact Match (EM): {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

Exact Match (EM): 27.00
F1 Score: 53.52
