
# End‑to‑End QA Fine‑Tuning & Evaluation Pipeline 📚

This notebook mirrors the **step‑by‑step logic** you used in your personal `1.tokenizer.ipynb` and `2.finetune.ipynb` notebooks while adding a few ergonomic tweaks:

1. **Data conversion** – `jsonl → DatasetDict` exactly like `jsonl_to_datasetdict` in your notes.  
2. **Tokenisation** – keeps vocab/UNK handling identical to your original helper.  
3. **Custom training loop** – pure PyTorch so you can drop new loss terms, schedulers, or logging just by editing one cell.  
4. **Exact‑match metric stub** – plug in your own metrics later.  
5. **Evaluation** – generates answers, runs metrics, dumps everything to a timestamped `results.json` for quick comparison.

The placeholder model is λ‑sized **`google/flan‑t5‑small`** (≈80 MB) so the whole pipeline is lightweight but API‑compatible with any seq‑to‑seq checkpoint you’ll swap in later.


In [2]:

import json, random, time, math, os, itertools
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Tuple

import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup,
)

from datasets import DatasetDict, Dataset, load_metric
from tqdm.auto import tqdm

# Re‑run this cell if you move the notebook
DATA_DIR = Path('pipeline_test_data')
TRAIN_JSONL = DATA_DIR/'all_prompts_train.jsonl'
VAL_JSONL   = DATA_DIR/'validation_prompts.jsonl'
TEST_JSONL  = DATA_DIR/'d2p_prompts_test.jsonl'  # <- add if you have one

MODEL_NAME  = 'google/flan-t5-small'   # 🔄 swap out later
SEED = 42
torch.manual_seed(SEED)


<torch._C.Generator at 0x10dd04070>

In [3]:

def jsonl_to_datasetdict(jsonl_path: Path,
                         train_split=0.8,
                         val_split=0.1,
                         test_split=0.1) -> DatasetDict:
    """Replicates `jsonl_to_datasetdict` from your tokenizer notebook."""
    lines = [json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()]
    random.shuffle(lines)
    n = len(lines)
    train_end = int(n*train_split)
    val_end   = train_end + int(n*val_split)
    splits = {
        'train': lines[:train_end],
        'validation': lines[train_end:val_end],
        'test': lines[val_end:]
    }
    return DatasetDict({k: Dataset.from_list(v) for k,v in splits.items()})


In [8]:
from datasets import concatenate_datasets

train_val_ds = jsonl_to_datasetdict(TRAIN_JSONL)
val_extra    = jsonl_to_datasetdict(VAL_JSONL, 0,1,0)['validation']

train_val_ds['validation'] = concatenate_datasets([train_val_ds['validation'], val_extra])

print(train_val_ds)
print(train_val_ds['train'][0])


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2880
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 660
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 360
    })
})
{'prompt': 'In a turn of events, Penelope Landon emerged as', 'completion': ' the innovative engineer who designed the first underwater train system.'}


In [10]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(batch):
    inputs  = batch['prompt']
    targets = batch['completion']
    model_inputs = tokenizer(inputs, padding=False, truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding=False, truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenised_ds = train_val_ds.map(preprocess, batched=True, remove_columns=['prompt','completion'])
tokenised_ds.set_format('torch')
tokenised_ds


Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2880
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 660
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 360
    })
})

In [None]:

model         = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

train_loader = DataLoader(tokenised_ds['train'], batch_size=10, shuffle=True, collate_fn=data_collator)
val_loader   = DataLoader(tokenised_ds['validation'], batch_size=10, shuffle=False, collate_fn=data_collator)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:

def exact_match(preds: List[str], refs: List[str]) -> float:
    def normalise(txt): return txt.strip().lower()
    return sum(normalise(p)==normalise(r) for p,r in zip(preds,refs)) / len(preds)


In [None]:

def train(model, train_loader, val_loader, epochs=3, lr=5e-5):
    optim = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = epochs*len(train_loader)
    sched = get_linear_schedule_with_warmup(optim, 0.06*total_steps, total_steps)
    device = torch.device('mps' if torch.mps else 'cpu')
    model.to(device)

    for ep in range(1, epochs+1):
        model.train()
        prog = tqdm(train_loader, desc=f'Epoch {ep}/{epochs}')
        running = 0
        for step, batch in enumerate(prog, 1):
            batch = {k:v.to(device) for k,v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss / 4          # gradient accumulation 4 steps
            loss.backward()
            if step % 4 == 0 or step==len(prog):
                optim.step(); sched.step(); optim.zero_grad()
            running += loss.item()*4
            prog.set_postfix(loss=running/step)

        # —— validation —— 
        model.eval()
        gen_kwargs = dict(max_new_tokens=64)
        preds, refs = [], []
        with torch.no_grad():
            for batch in val_loader:
                labels = batch.pop('labels')
                batch = {k: v.to(device) for k, v in batch.items()}
                generated = model.generate(**batch, **gen_kwargs)

                preds.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
                print(f"Generated Text: {tokenizer.batch_decode(generated, skip_special_tokens=True)}")
                # ✅ Correct label before decoding
                labels = torch.where(labels != -100, labels, torch.full_like(labels, tokenizer.pad_token_id))
                refs.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))
                print(f"True Text: {tokenizer.batch_decode(labels, skip_special_tokens=True)}")
        em = exact_match(preds, refs)
        print(f'✅ Epoch {ep} exact‑match: {em:.4f}')


    return model


In [18]:

fine_tuned = train(model, train_loader, val_loader, epochs=3)
fine_tuned.save_pretrained('qa_finetuned_model')
tokenizer.save_pretrained('qa_finetuned_model')


Epoch 1/3:   0%|          | 0/360 [00:00<?, ?it/s]

✅ Epoch 1 exact‑match: 0.1152


Epoch 2/3:   0%|          | 0/360 [00:00<?, ?it/s]

✅ Epoch 2 exact‑match: 0.1000


Epoch 3/3:   0%|          | 0/360 [00:00<?, ?it/s]

✅ Epoch 3 exact‑match: 0.0909


('qa_finetuned_model/tokenizer_config.json',
 'qa_finetuned_model/special_tokens_map.json',
 'qa_finetuned_model/tokenizer.json')

In [20]:

if 'test' in tokenised_ds:
    test_loader = DataLoader(tokenised_ds['test'], batch_size=8, shuffle=False, collate_fn=data_collator)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    fine_tuned.to(device).eval()
    preds, refs = [], []
    for batch in tqdm(test_loader, desc='Test inference'):
        labels = batch.pop('labels')
        batch = {k:v.to(device) for k,v in batch.items()}
        generated = fine_tuned.generate(**batch, max_new_tokens=64)
        preds.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

        labels = torch.where(labels != -100, labels, torch.full_like(labels, tokenizer.pad_token_id))
        refs.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))
    em = exact_match(preds, refs)
    print(f'🎯 Test exact‑match: {em:.4f}')


Test inference:   0%|          | 0/45 [00:00<?, ?it/s]

🎯 Test exact‑match: 0.1500


In [None]:

import datetime, json, uuid
results = {
    'timestamp': datetime.datetime.now().isoformat(),
    'model': MODEL_NAME,
    'seed': SEED,
    'val_exact_match': None,  # filled during training loop printouts
}
Path('results').mkdir(exist_ok=True)
json.dump(results, open(f'results/{uuid.uuid4().hex}.json', 'w'), indent=2)
