In [42]:
!pip install datasets evaluate




In [43]:
import torch
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
import evaluate

In [44]:
# Load Model & Tokenizer
model_name = "deepset/xlm-roberta-large-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
# Load SQuAD v2 Validation Set
dataset = load_dataset("squad_v2", split="validation")

# QA Pipeline for Prediction
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Load Evaluation Metrics
squad_metric = evaluate.load("squad_v2")



Device set to use cuda:0


In [46]:
# Prepare Predictions
predictions = []
references = []

for example in dataset.select(range(10000)):  # Limit to 100 samples for quick testing
    context = example["context"]
    question = example["question"]
    ground_truths = example["answers"]["text"]

    # Handle empty ground truth cases
    if len(ground_truths) == 0:
        ground_truths = [""]

    # Get Model Prediction
    pred = qa_pipeline({"context": context, "question": question})
    pred_text = pred["answer"]

    # Append to Evaluation Lists
    predictions.append({
        "id": example["id"],
        "prediction_text": pred_text,
        "no_answer_probability": 0.0  #
    })


    references.append({
        "id": example["id"],
        "answers": {
            "text": ground_truths,
            "answer_start": [0] * len(ground_truths)  #
        }
    })

In [47]:
# Debug: Print one sample
print("Sample Prediction:", predictions[902])
print("Sample Reference:", references[902])

Sample Prediction: {'id': '570611c475f01819005e793c', 'prediction_text': ' Port of Los Angeles,', 'no_answer_probability': 0.0}
Sample Reference: {'id': '570611c475f01819005e793c', 'answers': {'text': ['Port of Los Angeles', 'the Port of Los Angeles', 'Port of Los Angeles'], 'answer_start': [0, 0, 0]}}


In [48]:
# Compute Exact Match (EM) & F1 Score
results = squad_metric.compute(predictions=predictions, references=references)



In [49]:
print("Results Keys:", results.keys())

Results Keys: dict_keys(['exact', 'f1', 'total', 'HasAns_exact', 'HasAns_f1', 'HasAns_total', 'best_exact', 'best_exact_thresh', 'best_f1', 'best_f1_thresh'])


In [50]:

# Print Results
print(f"Exact Match (EM): {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

Exact Match (EM): 41.01
F1 Score: 45.35
