# Task 6 — Question Answering on SQuAD v1.1

In [22]:
!pip install -q transformers datasets evaluate

import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate

# Location of the SQuAD v1.1 CSV export read by this notebook.
squad_csv = "/content/Dataset/SQuAD-v1.1.csv"
df = pd.read_csv(squad_csv)

# Report the frame's shape, then preview the first rows as the cell output.
print("Dataset Loaded:", df.shape)
df.head()

Dataset Loaded: (87599, 6)


Unnamed: 0,title,context,question,answer,answer_start,answer_end
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541
1,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213
2,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296
3,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,420
4,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,126


Test/Train Split

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 70079, Test size: 17520


Evaluation

In [24]:
def evaluate_model_cpu(model_name, data):
    """
    Evaluate a QA model on CPU using a Hugging Face pipeline.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.
        data: pd.DataFrame with columns ['context', 'question', 'answer'].
            If an 'answer_start' column is present it supplies the reference
            offsets; otherwise the first occurrence of the answer in the
            context is used.

    Returns:
        dict: the result of the "squad" metric; its 'exact_match' and 'f1'
        entries are also printed.

    Raises:
        KeyError: if a required column is missing from ``data``.
        ValueError: if ``data`` has no rows.
    """
    # Validate the input before loading the checkpoint, so a bad frame
    # fails immediately instead of after the model has been loaded.
    required = ("context", "question", "answer")
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"data is missing required column(s): {missing}")
    if len(data) == 0:
        raise ValueError("data is empty: nothing to evaluate")

    print(f"\n=== Evaluating {model_name} on CPU ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=-1)  # CPU

    metric = evaluate.load("squad")
    has_offsets = "answer_start" in data.columns
    preds, refs = [], []

    # Ids are positional, so they stay unique even if the frame's index is not.
    for i, (_, row) in enumerate(data.iterrows()):
        context, answer = row["context"], row["answer"]
        result = qa_pipeline(question=row["question"], context=context)

        # Prefer the annotated offset: str.find only reports the first
        # occurrence, which can differ when the answer text repeats.
        start = row["answer_start"] if has_offsets else None
        if start is None or pd.isna(start):
            start = context.find(answer)

        example_id = str(i)
        preds.append({"id": example_id, "prediction_text": result["answer"]})
        refs.append({
            "id": example_id,
            "answers": {"text": [answer], "answer_start": [int(start)]},
        })

    scores = metric.compute(predictions=preds, references=refs)
    print(f"Exact Match: {scores['exact_match']:.2f}, F1: {scores['f1']:.2f}")
    return scores

Model

In [25]:
# Score DistilBERT on the first 50 rows of the held-out split.
evaluate_model_cpu(
    "distilbert-base-uncased-distilled-squad",
    test_df.head(50),
)


=== Evaluating distilbert-base-uncased-distilled-squad on CPU ===


Device set to use cpu


Exact Match: 76.00, F1: 88.92


In [27]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
# Stand-alone DistilBERT question-answering pipeline on CPU.
# NOTE(review): evaluate_model_cpu builds its own pipeline internally, so the
# objects created here are not used by the evaluation calls in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    # The question-answering pipeline caps the tokenized question+context
    # length via `max_seq_len`; `truncation` / `max_length` are not among its
    # documented parameters, so the intended 256-token limit goes here.
    max_seq_len=256,
)

Device set to use cpu


In [28]:
# Same evaluation on a larger slice: the first 100 rows of the held-out split.
evaluate_model_cpu(
    "distilbert-base-uncased-distilled-squad",
    test_df.head(100),
)


=== Evaluating distilbert-base-uncased-distilled-squad on CPU ===


Device set to use cpu


Exact Match: 74.00, F1: 86.66


Bonus

In [37]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate

# Checkpoints to compare on the same examples.
models = [
    "distilbert-base-uncased-distilled-squad",
    "deepset/roberta-base-squad2",
    "twmkn9/albert-base-v2-squad2"
]

# First 50 rows of the full frame.
# NOTE(review): this slices the unshuffled `df`, not `test_df`, so the rows
# differ from the ones scored in the earlier evaluation cells — confirm intent.
subset_df = df.iloc[:50]

# One shared SQuAD metric object, read by evaluate_model_cpu_fast.
metric = evaluate.load("squad")

def evaluate_model_cpu_fast(model_name, data):
    """
    Evaluate a QA checkpoint on CPU with a 256-token sequence cap.

    Uses the module-level ``metric`` object for scoring.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.
        data: pd.DataFrame with columns ['context', 'question', 'answer'].
            If an 'answer_start' column is present it supplies the reference
            offsets; otherwise the first occurrence of the answer in the
            context is used.

    Returns:
        dict: the result of ``metric.compute``; its 'exact_match' and 'f1'
        entries are also printed.

    Raises:
        KeyError: if a required column is missing from ``data``.
        ValueError: if ``data`` has no rows.
    """
    # Validate the input before loading the checkpoint, so a bad frame
    # fails immediately instead of after the model has been loaded.
    required = ("context", "question", "answer")
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"data is missing required column(s): {missing}")
    if len(data) == 0:
        raise ValueError("data is empty: nothing to evaluate")

    print(f"\n=== Evaluating {model_name}===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=-1,  # CPU
        # The question-answering pipeline caps the tokenized question+context
        # length via `max_seq_len`; `truncation` / `max_length` are not among
        # its documented parameters, so the 256-token limit is set here.
        max_seq_len=256,
    )

    has_offsets = "answer_start" in data.columns
    preds, refs = [], []

    # Positional ids stay unique even when the frame's index labels repeat.
    for position, (_, row) in enumerate(data.iterrows()):
        context, answer = row["context"], row["answer"]
        result = qa_pipeline(question=row["question"], context=context)

        # Prefer the annotated offset: str.find only reports the first
        # occurrence, which can differ when the answer text repeats.
        start = row["answer_start"] if has_offsets else None
        if start is None or pd.isna(start):
            start = context.find(answer)

        example_id = str(position)
        preds.append({"id": example_id, "prediction_text": result["answer"]})
        refs.append({
            "id": example_id,
            "answers": {
                "text": [answer],
                "answer_start": [int(start)],
            },
        })

    scores = metric.compute(predictions=preds, references=refs)
    print(f"Exact Match: {scores['exact_match']:.2f}, F1: {scores['f1']:.2f}")
    return scores

# Run the evaluation once per checkpoint.
# The loop variable is `model_name` rather than `model`, so it does not
# overwrite the global `model` object created in an earlier cell.
for model_name in models:
    evaluate_model_cpu_fast(model_name, subset_df)

print("\n Models compared")


=== Evaluating distilbert-base-uncased-distilled-squad===


Device set to use cpu


Exact Match: 86.00, F1: 95.38

=== Evaluating deepset/roberta-base-squad2===


Device set to use cpu


Exact Match: 86.00, F1: 93.67

=== Evaluating twmkn9/albert-base-v2-squad2===


Some weights of the model checkpoint at twmkn9/albert-base-v2-squad2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Exact Match: 88.00, F1: 96.56

 Models compared
