In [2]:
!pip install datasets evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import torch
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
import evaluate

In [5]:
# Load Model & Tokenizer
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [6]:
# Load SQuAD v2 Validation Set
dataset = load_dataset("squad_v2", split="validation")

# QA Pipeline for Prediction
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Load Evaluation Metrics
squad_metric = evaluate.load("squad_v2")



README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Device set to use cuda:0


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [7]:
# Prepare Predictions
predictions = []
references = []

for example in dataset.select(range(10000)):  # Limit to 100 samples for quick testing
    context = example["context"]
    question = example["question"]
    ground_truths = example["answers"]["text"]

    # Handle empty ground truth cases
    if len(ground_truths) == 0:
        ground_truths = [""]

    # Get Model Prediction
    pred = qa_pipeline({"context": context, "question": question})
    pred_text = pred["answer"]

    # Append to Evaluation Lists
    predictions.append({
        "id": example["id"],
        "prediction_text": pred_text,
        "no_answer_probability": 0.0  # 
    })


    references.append({
        "id": example["id"],
        "answers": {
            "text": ground_truths,
            "answer_start": [0] * len(ground_truths)  # 
        }
    })

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [8]:
# Debug: Print one sample
print("Sample Prediction:", predictions[902])
print("Sample Reference:", references[902])

Sample Prediction: {'id': '570611c475f01819005e793c', 'prediction_text': 'Port of Los Angeles', 'no_answer_probability': 0.0}
Sample Reference: {'id': '570611c475f01819005e793c', 'answers': {'text': ['Port of Los Angeles', 'the Port of Los Angeles', 'Port of Los Angeles'], 'answer_start': [0, 0, 0]}}


In [9]:
# Compute Exact Match (EM) & F1 Score
results = squad_metric.compute(predictions=predictions, references=references)

In [10]:

# Print Results
print(f"Exact Match (EM): {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

Exact Match (EM): 42.03
F1 Score: 45.69
