In [None]:
import torch
print("CUDA Available:", torch.cuda.is_available())
...


CUDA Available: True


Ellipsis

In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM, DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq, Trainer, TrainingArguments,
)
import evaluate
from transformers import logging
...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



**why SQuAD v1.1?**

standard & well-studied: benchmark dataset for QA with established metrics (Exact Match / F1).

clear supervision: answers are spans from the context, which is perfect for encoder-only (BERT) training and easy to adapt for seq2seq (T5) and decoder-only (GPT) via instruction formatting.

good for comparison: lets you contrast extractive (BERT) vs generative (T5/GPT) behavior on the same task.

task: extractive / generative question answering (given a context passage and a question, predict the answer).


**preprocessing plan**

dataset: datasets.load_dataset("squad") → provides train and validation.

splits: keep official splits; optionally sub-sample for speed during development.

tokenization:

BERT (span supervision): build features with overflow_to_sample_mapping, offset_mapping, and compute start/end positions with a doc_stride.

T5 (text-to-text): input: question + context → label: gold answer text.

GPT (causal LM): instruction style prompt. mask prompt tokens in labels with -100 so loss focuses on the answer.

lengths & stride:

max_input_length: 384 (common for SQuAD).

doc_stride: 128 (for BERT span tasks).

max_target_length (generative): 32 (answers are short).

*cell 1 — installs*

In [None]:
!pip install -q datasets transformers evaluate


*cell 2 — imports & config*

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# model ids (you can swap later)
BERT_ID = "bert-base-uncased"
T5_ID   = "t5-small"              # or "google/flan-t5-small"
GPT_ID  = "gpt2"                  # add pad token below

# lengths
MAX_INPUT_LEN   = 384
DOC_STRIDE      = 128
MAX_TARGET_LEN  = 32

# for quick iteration; set to None for full data
SUBSET_TRAIN = 200
SUBSET_VAL   = 80


*cell 3 — load SQuAD v1.1*

In [None]:
raw = load_dataset("squad")  # splits: 'train', 'validation'
if SUBSET_TRAIN:
    raw["train"] = raw["train"].select(range(SUBSET_TRAIN))
if SUBSET_VAL:
    raw["validation"] = raw["validation"].select(range(SUBSET_VAL))

raw


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 80
    })
})

BERT (encoder-only, extractive QA) preprocessing

Truncate context while keeping the full question: truncation="only_second" if you format inputs as (question, context).

Use the fast tokenizer’s offset_mapping to map character positions to token indices.

Sliding window with doc_stride creates overflowed features; compute start/end indices per feature.

*cell 4 — BERT tokenizer & feature builder*

In [None]:
bert_tok = AutoTokenizer.from_pretrained(BERT_ID, use_fast=True)

def prepare_train_features_bert(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts  = examples["context"]

    tokenized = bert_tok(
        questions,
        contexts,
        truncation="only_second",     # truncate context if too long
        max_length=MAX_INPUT_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = tokenized.pop("overflow_to_sample_mapping")
    offset_map = tokenized["offset_mapping"]

    start_positions = []
    end_positions   = []

    for i, offsets in enumerate(offset_map):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(bert_tok.cls_token_id)

        sample_idx = sample_map[i]
        answer = examples["answers"][sample_idx]
        if len(answer["answer_start"]) == 0:
            # no answer case (not typical for SQuAD v1.1, but keep safe)
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"][0]
        end_char   = start_char + len(answer["text"][0])

        # sequence ids: 0=question, 1=context, None=special
        sequence_ids = tokenized.sequence_ids(i)

        # find context token span
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # if answer not fully inside this feature’s context span:
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # narrow to tokens that actually cover the answer chars
        start_token = context_start
        while start_token <= context_end and offsets[start_token][0] <= start_char:
            start_token += 1
        start_token -= 1

        end_token = context_end
        while end_token >= context_start and offsets[end_token][1] >= end_char:
            end_token -= 1
        end_token += 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"]   = end_positions
    tokenized.pop("offset_mapping")  # not needed for training
    return tokenized

bert_train = raw["train"].map(
    prepare_train_features_bert,
    batched=True,
    remove_columns=raw["train"].column_names,
)
bert_val = raw["validation"].map(
    prepare_train_features_bert,
    batched=True,
    remove_columns=raw["validation"].column_names,
)

bert_train, bert_val


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
     num_rows: 200
 }),
 Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
     num_rows: 80
 }))

T5 (encoder-decoder, generative QA) preprocessing

Convert QA into text-to-text:

input: question: {q} context: {c}

label: gold answer text

Tokenize inputs and labels with their own max lengths.

*cell 5 — T5 tokenizer & mapping*

In [None]:
t5_tok = AutoTokenizer.from_pretrained(T5_ID, use_fast=True)

def to_t5_format(examples):
    inputs = [f"question: {q.strip()}  context: {c}" for q, c in zip(examples["question"], examples["context"])]
    targets = [a["text"][0] if len(a["text"]) > 0 else "" for a in examples["answers"]]

    model_inputs = t5_tok(
        inputs,
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length",
    )

    labels = t5_tok(
        targets,
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length",
    )["input_ids"]

    # replace padding token id’s in labels with -100 so they’re ignored in loss
    labels = [[(lid if lid != t5_tok.pad_token_id else -100) for lid in seq] for seq in labels]
    model_inputs["labels"] = labels
    return model_inputs

t5_train = raw["train"].map(to_t5_format, batched=True, remove_columns=raw["train"].column_names)
t5_val   = raw["validation"].map(to_t5_format, batched=True, remove_columns=raw["validation"].column_names)

t5_train, t5_val


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 200
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 80
 }))

GPT (decoder-only, causal LM) preprocessing

Use an instruction-style prompt and make the model generate just the answer.

For causal LM loss, set labels = input_ids but mask the prompt tokens with -100 so only the answer contributes to loss.

*cell 6 — GPT tokenizer & mapping*

In [None]:
gpt_tok = AutoTokenizer.from_pretrained(GPT_ID, use_fast=True)
# gpt2 has no pad token by default
if gpt_tok.pad_token is None:
    gpt_tok.pad_token = gpt_tok.eos_token

def to_gpt_format(examples):
    prompts = []
    answers = []
    for q, c, ans in zip(examples["question"], examples["context"], examples["answers"]):
        gold = ans["text"][0] if len(ans["text"]) > 0 else ""
        prompt = (
            "Answer the question using the context.\n\n"
            f"Context: {c}\n"
            f"Question: {q.strip()}\n"
            "Answer:"
        )
        prompts.append(prompt)
        answers.append(gold)

    # tokenize prompt and answer separately
    enc_prompt = gpt_tok(
        prompts,
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length",
    )
    enc_answer = gpt_tok(
        answers,
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length",
        add_special_tokens=False,  # just the answer tokens
    )

    input_ids = []
    attention_mask = []
    labels = []

    for p_ids, p_mask, a_ids in zip(
        enc_prompt["input_ids"], enc_prompt["attention_mask"], enc_answer["input_ids"]
    ):
        # concatenate (prompt + answer) and re-truncate to budget
        seq = p_ids + a_ids
        msk = p_mask + [1] * len(a_ids)

        if len(seq) > MAX_INPUT_LEN:
            seq = seq[:MAX_INPUT_LEN]
            msk = msk[:MAX_INPUT_LEN]

        # labels: ignore prompt tokens
        lbl = [-100] * len(p_ids) + a_ids
        if len(lbl) > MAX_INPUT_LEN:
            lbl = lbl[:MAX_INPUT_LEN]

        # pad to MAX_INPUT_LEN
        pad_len = MAX_INPUT_LEN - len(seq)
        if pad_len > 0:
            seq += [gpt_tok.pad_token_id] * pad_len
            msk += [0] * pad_len
            lbl += [-100] * pad_len

        input_ids.append(seq)
        attention_mask.append(msk)
        labels.append(lbl)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

gpt_train = raw["train"].map(to_gpt_format, batched=True, remove_columns=raw["train"].column_names)
gpt_val   = raw["validation"].map(to_gpt_format, batched=True, remove_columns=raw["validation"].column_names)

gpt_train, gpt_val


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 200
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 80
 }))

# part 1 summary
1.dataset selection

you chose SQuAD v1.1 (instead of CNN/DailyMail or WikiText).

task: question answering (given context + question → predict answer).

rationale:

it’s a standard QA benchmark, widely studied.

it works well to contrast architectures:

bert (encoder-only) is naturally good at extractive span prediction.

t5 (encoder–decoder) can generate free-form answers.

gpt (decoder-only) can be prompted to “answer the question” in text.

2. preprocessing setup

loaded SQuAD v1.1 with Hugging Face datasets.

(optional) sub-sampled to small train/val for fast debugging.

defined common hyperparameters:

max_input_length = 384

doc_stride = 128 (for sliding context in BERT)

max_target_length = 32 (answers are short)

3. architecture-specific tokenization
🔹 BERT (encoder-only, extractive)

tokenized (question, context) with truncation only on context.

used offset_mapping to map character positions → token positions.

computed start/end token indices for gold answer spans.

produced input_ids, attention_mask, start_positions, end_positions.

🔹 T5 (encoder–decoder, generative)

formatted input as:

question: <Q>  context: <C>


labels = gold answer text.

tokenized inputs and targets separately.

replaced pad tokens in labels with -100 so loss ignores them.

🔹 GPT-2 (decoder-only, causal LM)

formatted input prompt as:

Answer the question using the context.

Context: <C>
Question: <Q>
Answer:


concatenated prompt + gold answer.

masked out prompt tokens in labels with -100, so loss applies only to answer part.

4. outputs ready for training

after mapping, we now have three dataset objects:

bert_train, bert_val

t5_train, t5_val

gpt_train, gpt_val

each is preprocessed in the correct format for its architecture and can be plugged straight into the Trainer sections already in your notebook.

# Part 2 : Model Implementation

Cell — simple text normalization + EM/F1 helpers (used by all)

In [None]:
import re
import string
from collections import Counter

def _normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(f"[{re.escape(string.punctuation)}]", " ", s)
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def exact_match(pred: str, gold: str) -> float:
    return float(_normalize_text(pred) == _normalize_text(gold))

def f1_score(pred: str, gold: str) -> float:
    pred_tokens = _normalize_text(pred).split()
    gold_tokens = _normalize_text(gold).split()
    if len(pred_tokens) == 0 and len(gold_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


GPT-style (decoder-only): GPT-2 small

We’ll generate answers on eval and compute EM/F1 against gold answers.
Your gpt_* datasets already have input_ids/attention_mask/labels (labels mask out the prompt with -100).

In [None]:
!pip install -U transformers




In [None]:
!pip install -U transformers




In [None]:
from transformers import AutoModelForCausalLM, DataCollatorWithPadding, Trainer, TrainingArguments
import torch
from math import ceil

# assumes GPT_ID, gpt_tok, raw, MAX_INPUT_LEN, MAX_TARGET_LEN, exact_match, f1_score exist from Part 1

gpt_model = AutoModelForCausalLM.from_pretrained(GPT_ID)
if gpt_model.config.pad_token_id is None:
    gpt_model.config.pad_token_id = gpt_model.config.eos_token_id

gpt_collator = DataCollatorWithPadding(tokenizer=gpt_tok)

gpt_args = TrainingArguments(
    output_dir="./out_gpt",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=20,
    save_steps=1000,
    report_to=[],          # no wandb/tensorboard
)

gpt_trainer = Trainer(
    model=gpt_model,
    args=gpt_args,
    train_dataset=gpt_train,
    eval_dataset=None,     # we'll do manual eval to avoid old API issues
    tokenizer=gpt_tok,
    data_collator=gpt_collator,
    compute_metrics=None,  # manual metrics below
)

# ---- Train ----
gpt_train_out = gpt_trainer.train()

# ---- Manual Evaluation (generate + EM/F1) ----
def evaluate_gpt(model, tok, val_raw, batch_size=8, gen_max_new_tokens=MAX_TARGET_LEN+8):
    device = model.device
    model.eval()

    # Build prompts like in preprocessing
    prompts, golds = [], []
    for q, c, ans in zip(val_raw["question"], val_raw["context"], val_raw["answers"]):
        gold = ans["text"][0] if len(ans["text"]) > 0 else ""
        prompt = (
            "Answer the question using the context.\n\n"
            f"Context: {c}\n"
            f"Question: {q.strip()}\n"
            "Answer:"
        )
        prompts.append(prompt)
        golds.append(gold)

    ems, f1s = [], []
    num_batches = ceil(len(prompts) / batch_size)
    with torch.no_grad():
        for b in range(num_batches):
            sl = b * batch_size
            sr = min((b + 1) * batch_size, len(prompts))
            batch_prompts = prompts[sl:sr]

            enc = tok(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_INPUT_LEN,
            )
            enc = {k: v.to(device) for k, v in enc.items()}

            # generate answers
            out = model.generate(
                **enc,
                max_new_tokens=gen_max_new_tokens,
                do_sample=False
            )
            decoded = tok.batch_decode(out, skip_special_tokens=True)

            # Keep only text after "Answer:" if it echoes the prompt
            cleaned = []
            for txt in decoded:
                if "Answer:" in txt:
                    txt = txt.split("Answer:", 1)[-1].strip()
                cleaned.append(txt.strip())

            for pred, gold in zip(cleaned, golds[sl:sr]):
                ems.append(exact_match(pred, gold))
                f1s.append(f1_score(pred, gold))

    return {
        "em": float(sum(ems) / len(ems) if ems else 0.0),
        "f1": float(sum(f1s) / len(f1s) if f1s else 0.0),
    }

gpt_metrics = evaluate_gpt(gpt_model, gpt_tok, raw["validation"])
print("GPT metrics:", gpt_metrics)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  gpt_trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,0.0
40,0.0


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

GPT metrics: {'em': 0.0, 'f1': 0.06268429256081674}


BERT (encoder-only) — train + manual EM/F1

In [None]:
# BERT extractive QA: train with start/end positions, then manual eval using offsets
import torch
import numpy as np
from transformers import AutoModelForQuestionAnswering, Trainer, TrainingArguments

# assumes: BERT_ID, bert_tok, bert_train, raw, MAX_INPUT_LEN, DOC_STRIDE, exact_match, f1_score exist

qa_model = AutoModelForQuestionAnswering.from_pretrained(BERT_ID)

qa_args = TrainingArguments(
    output_dir="./out_bert_qa",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=3e-5,
    logging_steps=20,
    save_steps=1000,
    report_to=[],
)

qa_trainer = Trainer(
    model=qa_model,
    args=qa_args,
    train_dataset=bert_train,   # from Part 1 (includes start_positions/end_positions)
    eval_dataset=None,          # manual eval below
    tokenizer=bert_tok,
)

qa_train_out = qa_trainer.train()

# --- Build validation features with offsets preserved for post-processing ---
def prepare_val_features_bert_with_offsets(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts  = examples["context"]

    tokenized = bert_tok(
        questions,
        contexts,
        truncation="only_second",
        max_length=MAX_INPUT_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    sample_map = tokenized.pop("overflow_to_sample_mapping")

    # keep only offsets for the context tokens; mask others as (None, None)
    new_offsets = []
    example_ids = []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        seq_ids = tokenized.sequence_ids(i)
        new_offsets.append([(o[0], o[1]) if s == 1 else (None, None) for o, s in zip(offsets, seq_ids)])
        example_ids.append(examples["id"][sample_map[i]])

    tokenized["offset_mapping"] = new_offsets
    tokenized["example_id"] = example_ids
    return tokenized

bert_val_with_offsets = raw["validation"].map(
    prepare_val_features_bert_with_offsets,
    batched=True,
    remove_columns=[]
)

# --- Manual prediction: pick best start/end span per feature and assemble per-example prediction ---
from math import ceil
from tqdm.auto import tqdm

qa_model.eval()
device = qa_model.device

# get logits for all features
logits_start, logits_end = [], []
batch_size = 16
for i in tqdm(range(0, len(bert_val_with_offsets), batch_size)):
    batch = bert_val_with_offsets[i: i+batch_size]
    enc = {
        "input_ids": torch.tensor(batch["input_ids"], device=device),
        "attention_mask": torch.tensor(batch["attention_mask"], device=device),
    }
    with torch.no_grad():
        out = qa_model(**enc)
    logits_start.append(out.start_logits.cpu().numpy())
    logits_end.append(out.end_logits.cpu().numpy())

start_logits = np.concatenate(logits_start, axis=0)
end_logits   = np.concatenate(logits_end, axis=0)

# choose best span per feature, then keep first best per example
example_id_to_pred = {}
for i in range(len(bert_val_with_offsets)):
    ex_id = bert_val_with_offsets[i]["example_id"]
    offsets = bert_val_with_offsets[i]["offset_mapping"]
    sl = start_logits[i]
    el = end_logits[i]

    max_score = -1e9
    best = (0, 0)
    for s in range(len(sl)):
        if offsets[s][0] is None:
            continue
        # cap answer length to avoid crazy long spans
        for e in range(s, min(s + 30, len(el))):
            if offsets[e][0] is None:
                continue
            score = sl[s] + el[e]
            if score > max_score:
                max_score = score
                best = (s, e)
    s_idx, e_idx = best
    s_char, _ = offsets[s_idx]
    _, e_char = offsets[e_idx]

    # fetch original context to slice answer text
    # find the example in raw["validation"] by id
    # (create a small index for speed)
    # build once:
    #   id_to_row = {ex["id"]: idx for idx, ex in enumerate(raw["validation"])}
    # but for simplicity, do a linear search here (small subset typically)
    idx = [k for k, ex in enumerate(raw["validation"]) if ex["id"] == ex_id][0]
    ctx = raw["validation"][idx]["context"]
    pred_text = ctx[s_char:e_char] if s_char is not None and e_char is not None else ""

    if ex_id not in example_id_to_pred:
        example_id_to_pred[ex_id] = pred_text

# EM/F1
ems, f1s = [], []
for ex in raw["validation"]:
    gold = ex["answers"]["text"][0] if len(ex["answers"]["text"]) > 0 else ""
    pred = example_id_to_pred.get(ex["id"], "")
    ems.append(exact_match(pred, gold))
    f1s.append(f1_score(pred, gold))

bert_metrics = {"em": float(sum(ems)/len(ems)), "f1": float(sum(f1s)/len(f1s))}
print("BERT metrics:", bert_metrics)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  qa_trainer = Trainer(


Step,Training Loss
20,5.5005
40,4.8161


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

  0%|          | 0/5 [00:00<?, ?it/s]

BERT metrics: {'em': 0.1, 'f1': 0.1660805860805861}


T5 (encoder-decoder) — train + manual EM/F1

In [None]:
# T5 generative QA: train on text-to-text, then manual generate + EM/F1
import torch
from math import ceil
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments

# assumes: T5_ID, t5_tok, t5_train, t5_val, raw, MAX_INPUT_LEN, MAX_TARGET_LEN, exact_match, f1_score exist

t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_ID)
t5_collator = DataCollatorForSeq2Seq(tokenizer=t5_tok, model=t5_model)

t5_args = TrainingArguments(
    output_dir="./out_t5",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=20,
    save_steps=1000,
    report_to=[],
)

t5_trainer = Trainer(
    model=t5_model,
    args=t5_args,
    train_dataset=t5_train,
    eval_dataset=None,               # manual eval below
    tokenizer=t5_tok,
    data_collator=t5_collator,
)

t5_train_out = t5_trainer.train()

# --- Manual evaluation: generate answers from raw validation inputs ---
def evaluate_t5(model, tok, val_raw, batch_size=8, gen_max_new_tokens=MAX_TARGET_LEN+8):
    device = model.device
    model.eval()

    inputs = [f"question: {q.strip()}  context: {c}" for q, c in zip(val_raw["question"], val_raw["context"])]
    golds  = [a["text"][0] if len(a["text"]) > 0 else "" for a in val_raw["answers"]]

    ems, f1s = [], []
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i+batch_size]
        enc = tok(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_INPUT_LEN,
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model.generate(**enc, max_new_tokens=gen_max_new_tokens, do_sample=False)
        preds = tok.batch_decode(out, skip_special_tokens=True)

        for p, g in zip(preds, golds[i:i+batch_size]):
            ems.append(exact_match(p, g))
            f1s.append(f1_score(p, g))

    return {"em": float(sum(ems)/len(ems) if ems else 0.0),
            "f1": float(sum(f1s)/len(f1s) if f1s else 0.0)}

t5_metrics = evaluate_t5(t5_model, t5_tok, raw["validation"])
print("T5 metrics:", t5_metrics)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  t5_trainer = Trainer(


Step,Training Loss
20,0.3742
40,0.4239


T5 metrics: {'em': 0.7, 'f1': 0.729375}


In [None]:
# === Preview Q&A for a few validation items (side-by-side GPT / BERT / T5) ===
import torch
from math import ceil

def _trim(s, n=220):
    s = " ".join(str(s).split())
    return s if len(s) <= n else s[:n] + "…"

def _gpt_predict_batch(model, tok, questions, contexts, max_in=384, max_new=40):
    device = model.device
    prompts = []
    for q, c in zip(questions, contexts):
        prompts.append(
            "Answer the question using the context.\n\n"
            f"Context: {c}\n"
            f"Question: {q.strip()}\n"
            "Answer:"
        )
    enc = tok(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_in)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_new, do_sample=False)
    dec = tok.batch_decode(out, skip_special_tokens=True)
    preds = []
    for txt in dec:
        if "Answer:" in txt:
            txt = txt.split("Answer:", 1)[-1].strip()
        preds.append(txt.strip())
    return preds

def _t5_predict_batch(model, tok, questions, contexts, max_in=384, max_new=40):
    device = model.device
    inputs = [f"question: {q.strip()}  context: {c}" for q, c in zip(questions, contexts)]
    enc = tok(inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_in)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_new, do_sample=False)
    return tok.batch_decode(out, skip_special_tokens=True)

def _bert_predict_batch_span(model, tok, questions, contexts, max_in=384, doc_stride=128):
    """
    For each (q,c), pick the best start/end span from the model's logits and map back to text.
    """
    device = model.device
    preds = []
    for q, c in zip(questions, contexts):
        # tokenize with offsets, sliding window if needed
        tokenized = tok(
            q.strip(),
            c,
            return_offsets_mapping=True,
            truncation="only_second",
            max_length=max_in,
            stride=doc_stride,
            return_overflowing_tokens=True,
            padding="max_length",
        )
        overflow_map = tokenized.pop("overflow_to_sample_mapping")
        best_text, best_score = "", -1e9

        for i in range(len(tokenized["input_ids"])):
            offsets = tokenized["offset_mapping"][i]
            seq_ids = tokenized.sequence_ids(i)
            # keep offsets only for context tokens
            offsets = [(o[0], o[1]) if s == 1 else (None, None) for o, s in zip(offsets, seq_ids)]

            batch = {
                "input_ids": torch.tensor([tokenized["input_ids"][i]], device=device),
                "attention_mask": torch.tensor([tokenized["attention_mask"][i]], device=device),
            }
            with torch.no_grad():
                out = model(**batch)
            sl = out.start_logits[0].cpu().numpy()
            el = out.end_logits[0].cpu().numpy()

            # greedy best span (limit max length)
            import numpy as np
            max_local, best = -1e9, (0, 0)
            for s in range(len(sl)):
                if offsets[s][0] is None:
                    continue
                for e in range(s, min(s + 30, len(el))):
                    if offsets[e][0] is None:
                        continue
                    score = sl[s] + el[e]
                    if score > max_local:
                        max_local, best = score, (s, e)
            s_idx, e_idx = best
            s_char, _ = offsets[s_idx]
            _, e_char = offsets[e_idx]
            cand = c[s_char:e_char] if s_char is not None and e_char is not None else ""
            if max_local > best_score:
                best_score, best_text = max_local, cand

        preds.append(best_text)
    return preds

# ---- pick which examples to show ----
k = 5  # how many examples to preview
indices = list(range(min(k, len(raw["validation"]))))

questions = [raw["validation"][i]["question"] for i in indices]
contexts  = [raw["validation"][i]["context"]  for i in indices]
golds     = [raw["validation"][i]["answers"]["text"][0] if len(raw["validation"][i]["answers"]["text"])>0 else "" for i in indices]

# ---- get predictions from each model ----
gpt_preds  = _gpt_predict_batch(gpt_model, gpt_tok, questions, contexts, max_in=MAX_INPUT_LEN, max_new=MAX_TARGET_LEN+8)
t5_preds   = _t5_predict_batch(t5_model,  t5_tok,  questions, contexts, max_in=MAX_INPUT_LEN, max_new=MAX_TARGET_LEN+8)
bert_preds = _bert_predict_batch_span(qa_model,   bert_tok, questions, contexts, max_in=MAX_INPUT_LEN, doc_stride=DOC_STRIDE)

# ---- pretty print ----
for i, (q, c, g, pg, pt5, pb) in enumerate(zip(questions, contexts, golds, gpt_preds, t5_preds, bert_preds), 1):
    print(f"\n===== EXAMPLE {i} =====")
    print("Question :", q)
    print("Context  :", _trim(c))
    print("Gold     :", g)
    print("GPT-2    :", pg)
    print("T5       :", pt5)
    print("BERT     :", pb)


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



===== EXAMPLE 1 =====
Question : Which NFL team represented the AFC at Super Bowl 50?
Context  : Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Footba…
Gold     : Denver Broncos
GPT-2    : The AFC champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi
T5       : Denver Broncos
BERT     : 2015 season. The American Football Conference (AFC) champion Denver Broncos

===== EXAMPLE 2 =====
Question : Which NFL team represented the NFC at Super Bowl 50?
Context  : Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Footba…
Gold     :

“Gold” just means the ground-truth answer from the dataset — i.e., the human-annotated reference you’re supposed to predict.

In SQuAD v1.1 each example has:

question

context

answers:

text: a list of one (sometimes more) reference answers

answer_start: the character start index(es) of those answers in the context

That’s why you saw code like:

gold = ex["answers"]["text"][0] if len(ex["answers"]["text"]) > 0 else ""


We’re pulling the first reference answer as the “Gold” to compare against your model’s prediction.

Quick peek at a raw item:

i = 0
print(raw["validation"][i]["question"])
print(raw["validation"][i]["answers"])  # {'text': ['<gold answer>'], 'answer_start': [<char_idx>]}


If you want to handle multiple gold references (when present) and score against the best one:

golds = ex["answers"]["text"] or [""]
best_em  = max(exact_match(pred, g) for g in golds)
best_f1  = max(f1_score(pred, g) for g in golds)


So: Gold = reference answer; Predicted = your model’s answer; we compare them with EM/F1.

# Quick comparison table (GPT/BERT/T5)

In [None]:
# gather what we already printed (assuming gpt_metrics, bert_metrics, t5_metrics exist)
def to_pct(x): return round(100.0 * x, 2)

comparison = [
    ["GPT (decoder-only)", to_pct(gpt_metrics["em"]),  to_pct(gpt_metrics["f1"])],
    ["BERT (encoder-only)", to_pct(bert_metrics["em"]), to_pct(bert_metrics["f1"])],
    ["T5 (enc-dec)",        to_pct(t5_metrics["em"]),   to_pct(t5_metrics["f1"])],
]

print("Model\t\t\tEM (%)\tF1 (%)")
for row in comparison:
    print(f"{row[0]:<22}\t{row[1]:>6}\t{row[2]:>6}")


Model			EM (%)	F1 (%)
GPT (decoder-only)    	   0.0	  6.27
BERT (encoder-only)   	  10.0	 16.61
T5 (enc-dec)          	  70.0	 72.94


*Loss logger callback (CSV) + val-loss helper*

*Attach logger to GPT trainer, compute val loss, plot*

*One-cell evaluation (EM / F1 / BLEU + 2 samples/model)*

In [None]:
# === Repair + Unified Evaluation (SQuAD v1.1) ===
# Works even if `raw` was lost after a restart.

from datasets import load_dataset
import math, re
from collections import Counter
import torch

# -------- 0) Recreate or align RAW validation split --------
# Try to reuse existing config; otherwise set safe defaults.
MAX_INPUT_LEN  = globals().get("MAX_INPUT_LEN", 384)
MAX_TARGET_LEN = globals().get("MAX_TARGET_LEN", 32)
DOC_STRIDE     = globals().get("DOC_STRIDE", 128)

if "raw" not in globals():
    raw = load_dataset("squad")
    # Try to match previous subset length if we can infer it
    target_len = None
    if "SUBSET_VAL" in globals() and globals()["SUBSET_VAL"]:
        target_len = int(globals()["SUBSET_VAL"])
    elif "gpt_val" in globals():
        target_len = len(globals()["gpt_val"])
    elif "t5_val" in globals():
        target_len = len(globals()["t5_val"])
    elif "bert_val" in globals():
        target_len = len(globals()["bert_val"])
    else:
        target_len = 80  # small default for quick eval

    raw["validation"] = raw["validation"].select(range(min(target_len, len(raw["validation"]))))

print(f"[info] validation examples: {len(raw['validation'])}")

# -------- 1) Metrics helpers (EM/F1/BLEU) --------
def _normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def exact_match(pred: str, gold: str) -> float:
    return float(_normalize_text(pred) == _normalize_text(gold))

def f1_score(pred: str, gold: str) -> float:
    pt = _normalize_text(pred).split()
    gt = _normalize_text(gold).split()
    if not pt and not gt: return 1.0
    if not pt or not gt:  return 0.0
    common = Counter(pt) & Counter(gt)
    num_same = sum(common.values())
    if num_same == 0: return 0.0
    precision = num_same / len(pt)
    recall    = num_same / len(gt)
    return 2 * precision * recall / (precision + recall)

def corpus_bleu(preds, refs, max_n=4, eps=1e-9):
    def ngrams(tokens, n): return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    c_len = r_len = 0
    p_ns = [0]*max_n; p_ds = [0]*max_n
    for pred, ref in zip(preds, refs):
        pt = _normalize_text(pred).split()
        rt = _normalize_text(ref).split()
        c_len += len(pt); r_len += len(rt)
        for n in range(1, max_n+1):
            p_ngr = Counter(ngrams(pt, n)); r_ngr = Counter(ngrams(rt, n))
            p_ns[n-1] += sum((p_ngr & r_ngr).values())
            p_ds[n-1] += max(len(pt)-n+1, 0)
    if c_len == 0: return 0.0
    precisions = [ (p_ns[i]+eps)/(p_ds[i]+eps) for i in range(max_n) ]
    bp = 1.0 if c_len > r_len else math.exp(1 - r_len/max(c_len,1))
    bleu = bp * math.exp(sum((1/max_n)*math.log(p) for p in precisions))
    return 100.0 * bleu

# -------- 2) Small batch predictors for each model type --------
def _gpt_predict(model, tok, questions, contexts, max_in=384, max_new=40):
    prompts = [
        "Answer the question using the context.\n\n"
        f"Context: {c}\nQuestion: {q.strip()}\nAnswer:"
        for q, c in zip(questions, contexts)
    ]
    enc = tok(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_in)
    enc = {k: v.to(model.device) for k, v in enc.items()}
    model.eval()
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_new, do_sample=False)
    dec = tok.batch_decode(out, skip_special_tokens=True)
    preds = []
    for txt in dec:
        if "Answer:" in txt:
            txt = txt.split("Answer:", 1)[-1].strip()
        preds.append(txt.strip())
    return preds

def _t5_predict(model, tok, questions, contexts, max_in=384, max_new=40):
    inputs = [f"question: {q.strip()}  context: {c}" for q, c in zip(questions, contexts)]
    enc = tok(inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_in)
    enc = {k: v.to(model.device) for k, v in enc.items()}
    model.eval()
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_new, do_sample=False)
    return tok.batch_decode(out, skip_special_tokens=True)

def _bert_predict_span(model, tok, questions, contexts, max_in=384, doc_stride=128):
    preds = []
    model.eval()
    for q, c in zip(questions, contexts):
        tokenized = tok(
            q.strip(), c,
            return_offsets_mapping=True,
            truncation="only_second",
            max_length=max_in,
            stride=doc_stride,
            return_overflowing_tokens=True,
            padding="max_length",
        )
        best_text, best_score = "", -1e9
        for i in range(len(tokenized["input_ids"])):
            offsets = tokenized["offset_mapping"][i]
            seq_ids = tokenized.sequence_ids(i)
            offsets = [(o[0], o[1]) if s == 1 else (None, None) for o, s in zip(offsets, seq_ids)]
            batch = {
                "input_ids": torch.tensor([tokenized["input_ids"][i]], device=model.device),
                "attention_mask": torch.tensor([tokenized["attention_mask"][i]], device=model.device),
            }
            with torch.no_grad():
                out = model(**batch)
            sl = out.start_logits[0].cpu().numpy()
            el = out.end_logits[0].cpu().numpy()
            max_local, best = -1e9, (0,0)
            for s in range(len(sl)):
                if offsets[s][0] is None: continue
                for e in range(s, min(s+30, len(el))):
                    if offsets[e][0] is None: continue
                    score = sl[s] + el[e]
                    if score > max_local:
                        max_local, best = score, (s,e)
            s_idx, e_idx = best
            s_char, _ = offsets[s_idx]; _, e_char = offsets[e_idx]
            cand = c[s_char:e_char] if (s_char is not None and e_char is not None) else ""
            if max_local > best_score:
                best_score, best_text = max_local, cand
        preds.append(best_text)
    return preds

# -------- 3) Build eval lists from RAW --------
N = len(raw["validation"])
questions = [raw["validation"][i]["question"] for i in range(N)]
contexts  = [raw["validation"][i]["context"]  for i in range(N)]
golds     = [raw["validation"][i]["answers"]["text"][0] if len(raw["validation"][i]["answers"]["text"])>0 else "" for i in range(N)]

def eval_metrics(preds, golds):
    L = min(len(preds), len(golds))
    ems = [exact_match(preds[i], golds[i]) for i in range(L)]
    f1s = [f1_score(preds[i], golds[i])   for i in range(L)]
    bleu = corpus_bleu(preds[:L], golds[:L])
    return {"EM": sum(ems)/L if L else 0.0, "F1": sum(f1s)/L if L else 0.0, "BLEU": bleu}

# -------- 4) Evaluate any models that are currently defined --------
results = {}
samples = {}

if "gpt_model" in globals() and "gpt_tok" in globals():
    gpt_preds = _gpt_predict(gpt_model, gpt_tok, questions, contexts, max_in=MAX_INPUT_LEN, max_new=MAX_TARGET_LEN+8)
    results["GPT-2"] = eval_metrics(gpt_preds, golds)
    samples["GPT-2"] = gpt_preds[:2]

if "qa_model" in globals() and "bert_tok" in globals():
    bert_preds = _bert_predict_span(qa_model, bert_tok, questions, contexts, max_in=MAX_INPUT_LEN, doc_stride=DOC_STRIDE)
    results["BERT-QA"] = eval_metrics(bert_preds, golds)
    samples["BERT-QA"] = bert_preds[:2]

if "t5_model" in globals() and "t5_tok" in globals():
    t5_preds = _t5_predict(t5_model, t5_tok, questions, contexts, max_in=MAX_INPUT_LEN, max_new=MAX_TARGET_LEN+8)
    results["T5-small"] = eval_metrics(t5_preds, golds)
    samples["T5-small"] = t5_preds[:2]

# -------- 5) Print metrics + 2 samples per model --------
def pct(x): return round(100*x, 2)

print("\n=== Metrics (EM %, F1 %, BLEU) ===")
for name, m in results.items():
    print(f"{name:8s} → EM: {pct(m['EM'])}  F1: {pct(m['F1'])}  BLEU: {round(m['BLEU'],2)}")

print("\n=== Sample outputs (2) ===")
for name in samples:
    print(f"\n-- {name} --")
    for i in range(2):
        print(f"Q: {questions[i]}")
        print(f"Gold: {golds[i]}")
        print(f"Pred: {samples[name][i]}")
        print()


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


[info] validation examples: 80

=== Metrics (EM %, F1 %, BLEU) ===
GPT-2    → EM: 0.0  F1: 6.27  BLEU: 0.88
BERT-QA  → EM: 10.0  F1: 16.61  BLEU: 0.01
T5-small → EM: 70.0  F1: 72.94  BLEU: 0.34

=== Sample outputs (2) ===

-- GPT-2 --
Q: Which NFL team represented the AFC at Super Bowl 50?
Gold: Denver Broncos
Pred: The AFC champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi

Q: Which NFL team represented the NFC at Super Bowl 50?
Gold: Carolina Panthers
Pred: The New England Patriots, the New York Jets, the New York Giants, the New York Jets, the New York Jets, the New York Jets, the New York Jets, the New York Jets,


-- BERT-QA --
Q: Which NFL team represented the AFC at Super Bowl 50?
Gold: Denver Broncos
Pred: 2015 season. The American Football Conference (AFC) champion Denver Broncos

Q: Which NFL team represented the NFC at Super Bowl 50?
Go