
# Endâ€‘toâ€‘End QA Fineâ€‘Tuning & Evaluation PipelineÂ ðŸ“š

This notebook mirrors the **stepâ€‘byâ€‘step logic** you used in your personal `1.tokenizer.ipynb` and `2.finetune.ipynb` notebooks while adding a few ergonomic tweaks:

1. **Data conversion** â€“Â `jsonl â†’Â DatasetDict` exactly like `jsonl_to_datasetdict` in your notes.  
2. **Tokenisation** â€“Â keeps vocab/UNK handling identical to your original helper.  
3. **Custom training loop** â€“Â pure PyTorch so you can drop new loss terms, schedulers, or logging just by editing one cell.  
4. **Exactâ€‘match metric stub** â€“Â plug in your own metrics later.  
5. **Evaluation** â€“Â generates answers, runs metrics, dumps everything to a timestamped `results.json` for quick comparison.

The placeholder model is Î»â€‘sized **`google/flanâ€‘t5â€‘small`** (â‰ˆ80â€¯MB) so the whole pipeline is lightweight but APIâ€‘compatible with any seqâ€‘toâ€‘seq checkpoint youâ€™ll swap in later.


In [None]:

import json, random, time, math, os, itertools
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Tuple

import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup,
)

from datasets import DatasetDict, Dataset, load_metric
from tqdm.auto import tqdm

# Reâ€‘run this cell if you move the notebook
DATA_DIR = Path('.')
TRAIN_JSONL = DATA_DIR/'all_prompts_train.jsonl'
VAL_JSONL   = DATA_DIR/'validation_prompts.jsonl'
TEST_JSONL  = DATA_DIR/'test_prompts.jsonl'  # <- add if you have one

MODEL_NAME  = 'google/flan-t5-small'   # ðŸ”„ swap out later
SEED = 42
torch.manual_seed(SEED)


<torch._C.Generator at 0x117d04070>

In [3]:

def jsonl_to_datasetdict(jsonl_path: Path,
                         train_split=0.8,
                         val_split=0.1,
                         test_split=0.1) -> DatasetDict:
    """Replicates `jsonl_to_datasetdict` from your tokenizer notebook."""
    lines = [json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()]
    random.shuffle(lines)
    n = len(lines)
    train_end = int(n*train_split)
    val_end   = train_end + int(n*val_split)
    splits = {
        'train': lines[:train_end],
        'validation': lines[train_end:val_end],
        'test': lines[val_end:]
    }
    return DatasetDict({k: Dataset.from_list(v) for k,v in splits.items()})


In [5]:

# If you already supplied HFâ€‘ready arrow files you can just load them here.
train_val_ds = jsonl_to_datasetdict(TRAIN_JSONL)
val_extra    = jsonl_to_datasetdict(VAL_JSONL, 0,1,0)['validation']
train_val_ds['validation'] = train_val_ds['validation'].add_items(val_extra)

print(train_val_ds)
train_val_ds['train'][0]


AttributeError: 'str' object has no attribute 'read_text'

In [None]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(batch):
    inputs  = batch['prompt']
    targets = batch['completion']
    model_inputs = tokenizer(inputs, padding=False, truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding=False, truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenised_ds = train_val_ds.map(preprocess, batched=True, remove_columns=['prompt','completion'])
tokenised_ds.set_format('torch')


In [None]:

model         = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

train_loader = DataLoader(tokenised_ds['train'], batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader   = DataLoader(tokenised_ds['validation'], batch_size=8, shuffle=False, collate_fn=data_collator)


In [None]:

def exact_match(preds: List[str], refs: List[str]) -> float:
    def normalise(txt): return txt.strip().lower()
    return sum(normalise(p)==normalise(r) for p,r in zip(preds,refs)) / len(preds)


In [None]:

def train(model, train_loader, val_loader, epochs=3, lr=5e-5):
    optim = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = epochs*len(train_loader)
    sched = get_linear_schedule_with_warmup(optim, 0.06*total_steps, total_steps)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for ep in range(1, epochs+1):
        model.train()
        prog = tqdm(train_loader, desc=f'Epoch {ep}/{epochs}')
        running = 0
        for step, batch in enumerate(prog, 1):
            batch = {k:v.to(device) for k,v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss / 4          # gradient accumulation 4 steps
            loss.backward()
            if step % 4 == 0 or step==len(prog):
                optim.step(); sched.step(); optim.zero_grad()
            running += loss.item()*4
            prog.set_postfix(loss=running/step)

        # â€”â€” validation â€”â€”
        model.eval()
        gen_kwargs = dict(max_new_tokens=64)
        preds, refs = [], []
        with torch.no_grad():
            for batch in val_loader:
                labels = batch.pop('labels')
                batch = {k:v.to(device) for k,v in batch.items()}
                generated = model.generate(**batch, **gen_kwargs)
                preds.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
                refs.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))
        em = exact_match(preds, refs)
        print(f'âœ… Epoch {ep} exactâ€‘match: {em:.4f}')

    return model


In [None]:

fine_tuned = train(model, train_loader, val_loader, epochs=3)
fine_tuned.save_pretrained('qa_finetuned_model')
tokenizer.save_pretrained('qa_finetuned_model')


In [None]:

if 'test' in tokenised_ds:
    test_loader = DataLoader(tokenised_ds['test'], batch_size=8, shuffle=False, collate_fn=data_collator)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    fine_tuned.to(device).eval()
    preds, refs = [], []
    for batch in tqdm(test_loader, desc='Test inference'):
        labels = batch.pop('labels')
        batch = {k:v.to(device) for k,v in batch.items()}
        generated = fine_tuned.generate(**batch, max_new_tokens=64)
        preds.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
        refs.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))
    em = exact_match(preds, refs)
    print(f'ðŸŽ¯ Test exactâ€‘match: {em:.4f}')


In [None]:

import datetime, json, uuid
results = {
    'timestamp': datetime.datetime.now().isoformat(),
    'model': MODEL_NAME,
    'seed': SEED,
    'val_exact_match': None,  # filled during training loop printouts
}
Path('results').mkdir(exist_ok=True)
json.dump(results, open(f'results/{uuid.uuid4().hex}.json', 'w'), indent=2)
