# Task 6: Question Answering with Transformers

In [1]:
!pip install transformers datasets evaluate streamlit --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import pipeline
from datasets import load_dataset
import evaluate
import pandas as pd
import torch
import os
from IPython.display import display, Markdown

Load the QA pipeline

In [3]:
# Build the question-answering pipeline once; the cells below call `qa_pipeline`.
# The same checkpoint id is passed for both the model and the tokenizer.
model_name = "distilbert-base-uncased-distilled-squad"
qa_pipeline = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


Custom context and question


In [4]:
# Quick check on a hand-written passage: ask a single question and render the
# returned answer text and score as Markdown.
question = "Who introduced the Transformer architecture?"
context = """
The Transformer architecture was introduced in the paper 'Attention is All You Need' by Vaswani et al. in 2017.
It relies entirely on self-attention mechanisms, dispensing with recurrence entirely, and is the foundation of models such as BERT and GPT.
"""

result = qa_pipeline(question=question, context=context)

# `result` is indexed by the 'answer' and 'score' keys; the score is shown to two decimals.
answer_md = Markdown(f"**Answer:** {result['answer']} | **Score:** {result['score']:.2f}")
display(answer_md)

**Answer:** Vaswani et al. | **Score:** 0.49

Evaluate on a subset of SQuAD with exact match and F1 score


In [5]:
# Score `qa_pipeline` on the first 50 examples of the SQuAD validation split
# with the "squad" metric, then print its exact-match and F1 values.
squad = load_dataset("squad", split="validation[:50]")
metric = evaluate.load("squad")

predictions = []
references = []

# Loop-local names (`example`, `prediction`) are used deliberately so this cell
# does not overwrite the `context` / `question` / `result` variables set by the
# custom-question cell earlier in the notebook.
for example in squad:
    prediction = qa_pipeline(question=example["question"], context=example["context"])
    # The metric matches predictions to references by the shared "id" field.
    predictions.append({"id": example["id"], "prediction_text": prediction["answer"]})
    references.append({"id": example["id"], "answers": example["answers"]})

scores = metric.compute(predictions=predictions, references=references)
print(f"Exact Match: {scores['exact_match']:.2f}%")
print(f"F1 Score: {scores['f1']:.2f}%")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Exact Match: 84.00%
F1 Score: 85.60%


In [7]:
# Self-contained evaluation cell: it repeats the imports and rebuilds the
# pipeline so it can run without the earlier cells.
from datasets import load_dataset
from transformers import pipeline
import evaluate

# Number of validation examples to score. Defined once so the split slice and
# the printed summary cannot drift apart.
NUM_EVAL_SAMPLES = 50

# Load the QA model
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Load the first NUM_EVAL_SAMPLES examples of the validation set
squad = load_dataset("squad", split=f"validation[:{NUM_EVAL_SAMPLES}]")

# Load SQuAD metric
metric = evaluate.load("squad")

# Prepare predictions and references
predictions = []
references = []

for example in squad:
    context = example["context"]
    question = example["question"]
    answers = example["answers"]
    result = qa_pipeline(question=question, context=context)

    # Predictions and references are paired through the shared "id" field.
    predictions.append({
        "id": example["id"],
        "prediction_text": result["answer"]
    })
    references.append({
        "id": example["id"],
        "answers": answers
    })

results = metric.compute(predictions=predictions, references=references)
# Report the number of examples actually scored instead of a hard-coded figure.
print(f"Evaluated on {len(predictions)} samples from SQuAD v1.1")
print(f" Exact Match: {results['exact_match']:.2f}%")
print(f" F1 Score: {results['f1']:.2f}%")

Device set to use cpu


Evaluated on 50 samples from SQuAD v1.1
 Exact Match: 84.00%
 F1 Score: 85.60%


Compare different models


In [8]:
# Ask the same question of several QA checkpoints and print each one's answer
# and score so they can be compared side by side.
question = "Who introduced Transformers?"
context = """
Transformers were introduced by Vaswani et al. and form the basis for models like BERT and GPT.
"""
models = [
    "distilbert-base-uncased-distilled-squad",
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    "deepset/roberta-base-squad2",
]

for model in models:
    # A new pipeline is built for every checkpoint; the tokenizer is loaded from the same id.
    qa = pipeline(task="question-answering", model=model, tokenizer=model)
    result = qa(question=question, context=context)
    # Two lines per model, followed by a blank line (the trailing "\n").
    print(
        f"Model: {model}",
        f"Answer: {result['answer']} | Score: {result['score']:.2f}\n",
        sep="\n",
    )

Device set to use cpu


Model: distilbert-base-uncased-distilled-squad
Answer: Vaswani et al | Score: 0.59



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Model: bert-large-uncased-whole-word-masking-finetuned-squad
Answer: Vaswani et al. | Score: 0.82



Device set to use cpu


Model: deepset/roberta-base-squad2
Answer: Vaswani et al. | Score: 0.82



In [9]:
!pip install streamlit pyngrok --quiet