In [1]:
# === One-cell SQuAD v1.1 QA training (DistilBERT) with token-level IoU + no-op "custom processing" ===
# - Uses Kaggle mounted dataset if present, else downloads official SQuAD JSONs.
# - Includes a processing hook (identity by default) so you don't have to edit anything.
# - Trains, evaluates (token_iou), saves model, and provides an inference helper.

import os, json, re, numpy as np, pandas as pd, requests, transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DataCollatorWithPadding

# ----------------------------
# Config (no edits needed)
# ----------------------------
MODEL_NAME     = "distilbert-base-uncased"
MAX_LENGTH     = 512
OUTPUT_DIR     = "./my-qa-model"
TRAIN_BS       = 16
VALID_BS       = 16
LR             = 3e-5
EPOCHS         = 2
SEED           = 1123
FP16           = True

# Dataset paths
KAGGLE_SQUAD   = "/kaggle/input/stanford-question-answering-dataset"
TRAIN_JSON_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
DEV_JSON_URL   = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"

os.makedirs(OUTPUT_DIR, exist_ok=True)
np.random.seed(SEED)

# ----------------------------
# Data resolution (Kaggle mount first, else download)
# ----------------------------
def _download(url, dest):
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(dest, "wb") as f:
            for ch in r.iter_content(1<<20):
                if ch: f.write(ch)

def get_train_json_path():
    if os.path.exists(os.path.join(KAGGLE_SQUAD, "train-v1.1.json")):
        return os.path.join(KAGGLE_SQUAD, "train-v1.1.json")
    out = os.path.abspath("./downloads/squad/train-v1.1.json")
    if not os.path.exists(out):
        _download(TRAIN_JSON_URL, out)
    return out

def get_dev_json_path():
    if os.path.exists(os.path.join(KAGGLE_SQUAD, "dev-v1.1.json")):
        return os.path.join(KAGGLE_SQUAD, "dev-v1.1.json")
    out = os.path.abspath("./downloads/squad/dev-v1.1.json")
    if not os.path.exists(out):
        _download(DEV_JSON_URL, out)
    return out

train_json = get_train_json_path()
dev_json   = get_dev_json_path()
print("[info] train json:", train_json)
print("[info] dev json  :", dev_json)

# ----------------------------
# Load + flatten SQuAD v1.1
# ----------------------------
def flatten_squad(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    rows = []
    for art in raw["data"]:
        for para in art["paragraphs"]:
            ctx = para["context"]
            for qa in para["qas"]:
                if not qa.get("answers"):
                    continue
                a = qa["answers"][0]
                rows.append({
                    "id": qa["id"],
                    "context": ctx,
                    "question": qa["question"],
                    "answer": a["text"],
                    "answer_start": a["answer_start"],
                })
    df = pd.DataFrame(rows)
    df["answer_end"] = df["answer_start"] + df["answer"].str.len()
    return df

full_df  = flatten_squad(train_json)   # we’ll just split this
print("[info] flattened:", full_df.shape)

# ----------------------------
# No-op "custom processing" (identity)
# You don't need to edit anything; this is ready to run.
# ----------------------------
def custom_process_identity(text: str):
    """
    Returns:
      processed_text (str)    -> same as input
      proc2orig (List[int])   -> identity mapping: proc index -> original index
    """
    return text, list(range(len(text)))

def map_orig_span_to_processed(orig_start: int, orig_end: int, proc2orig: list[int]):
    """Map original [start, end) char span to processed indices (inclusive)."""
    if not proc2orig:
        return None, None
    # first processed idx with source >= start
    p_start = next((i for i, o in enumerate(proc2orig) if o >= orig_start), None)
    # last processed idx with source <= end-1
    tgt_end = orig_end - 1
    p_end = None
    for i in range(len(proc2orig) - 1, -1, -1):
        if proc2orig[i] <= tgt_end:
            p_end = i
            break
    if p_start is None or p_end is None or p_start > p_end:
        return None, None
    return p_start, p_end

# ----------------------------
# Split
# ----------------------------
train_df, valid_df = train_test_split(full_df, test_size=0.25, random_state=SEED, shuffle=True)

# ----------------------------
# Tokenizer
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ----------------------------
# Preprocessing to token spans (uses identity processing internally)
# ----------------------------
def preprocess_row(sample):
    # 1) process question & context (identity)
    pq, _ = custom_process_identity(sample["question"])
    pc, c_map = custom_process_identity(sample["context"])

    # 2) map original char span -> processed char span
    p_start, p_end = map_orig_span_to_processed(int(sample["answer_start"]), int(sample["answer_end"]), c_map)

    # 3) tokenize with offsets
    enc = tokenizer(
        pq,
        pc,
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",  # (switch to dynamic padding for a bit more speed)
    )
    offsets = enc.pop("offset_mapping")
    seq_ids = enc.sequence_ids()

    # 4) find context token window
    ctx_token_idx = [i for i, sid in enumerate(seq_ids) if sid == 1]
    cls_idx = enc["input_ids"].index(tokenizer.cls_token_id)
    if not ctx_token_idx or p_start is None or p_end is None:
        enc["start_positions"] = cls_idx
        enc["end_positions"]   = cls_idx
        enc["offset_mapping"]  = offsets
        enc["context_proc"]    = pc
        return enc

    ctx_start = ctx_token_idx[0]
    ctx_end   = ctx_token_idx[-1]

    # 5) map processed char span -> token span
    def find_token_for_char(char_pos, left=True):
        rng = range(ctx_start, ctx_end + 1) if left else range(ctx_end, ctx_start - 1, -1)
        for ti in rng:
            if seq_ids[ti] != 1: 
                continue
            s, e = offsets[ti]
            if s is None or e is None: 
                continue
            if left:
                if s <= char_pos < e or s >= char_pos:
                    return ti
            else:
                if s <= char_pos < e or (e - 1) <= char_pos:
                    return ti
        return None

    tok_s = find_token_for_char(p_start, left=True)
    tok_e = find_token_for_char(p_end,   left=False)

    if tok_s is None or tok_e is None or tok_e < tok_s:
        enc["start_positions"] = cls_idx
        enc["end_positions"]   = cls_idx
    else:
        enc["start_positions"] = tok_s
        enc["end_positions"]   = tok_e

    # keep for metrics/reconstruction
    enc["offset_mapping"] = offsets
    enc["context_proc"]   = pc
    return enc

# Build HF datasets
train_hf = Dataset.from_pandas(train_df.reset_index(drop=True))
valid_hf = Dataset.from_pandas(valid_df.reset_index(drop=True))

train_ds = train_hf.map(preprocess_row, remove_columns=train_hf.column_names, desc="Preprocess train")
valid_ds = valid_hf.map(preprocess_row, remove_columns=valid_hf.column_names, desc="Preprocess valid")

# Cache evaluation helpers BEFORE setting torch format
valid_offsets = valid_ds["offset_mapping"]
valid_context = valid_ds["context_proc"]
G_STARTS      = np.array(valid_ds["start_positions"])
G_ENDS        = np.array(valid_ds["end_positions"])

# set format for model inputs
train_ds.set_format(type="torch", columns=["input_ids","attention_mask","start_positions","end_positions"])
valid_ds.set_format(type="torch", columns=["input_ids","attention_mask","start_positions","end_positions"])

# ----------------------------
# Model + Trainer
# ----------------------------
model    = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
collator = DataCollatorWithPadding(tokenizer)

# token-level IoU on token index spans
def span_iou(ps, pe, gs, ge):
    if ps is None or pe is None or gs is None or ge is None: return 0.0
    if ps > pe or gs > ge: return 0.0
    P = set(range(int(ps), int(pe) + 1))
    G = set(range(int(gs), int(ge) + 1))
    if not P and not G: return 1.0
    if not P or not G:  return 0.0
    return len(P & G) / len(P | G)

def compute_metrics(eval_pred):
    start_logits, end_logits = eval_pred.predictions
    ps = np.argmax(start_logits, axis=-1)
    pe = np.argmax(end_logits,   axis=-1)
    ious = [span_iou(a, b, c, d) for a, b, c, d in zip(ps, pe, G_STARTS, G_ENDS)]
    return {"token_iou": float(np.mean(ious))}

args = transformers.TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=VALID_BS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    dataloader_num_workers=2,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="token_iou",
    greater_is_better=True,
    fp16=FP16,
)

trainer = transformers.Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

print("[info] Training …")
trainer.train()
print("[info] Eval …")
print(trainer.evaluate())

# ----------------------------
# Inference helper (no edits)
# ----------------------------
qa = transformers.pipeline("question-answering", model=model, tokenizer=tokenizer, device=-1)

def answer(question: str, context: str):
    # apply same (identity) processing
    pq, _ = custom_process_identity(question)
    pc, _ = custom_process_identity(context)
    return qa({"question": pq, "context": pc})

# quick demo
_demo_ctx = "Albert Einstein developed the theory of relativity. It revolutionized physics."
print(answer("Who developed the theory of relativity?", _demo_ctx))


2025-09-07 12:45:35.153031: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757249135.321851      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757249135.379921      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[info] train json: /kaggle/input/stanford-question-answering-dataset/train-v1.1.json
[info] dev json  : /kaggle/input/stanford-question-answering-dataset/dev-v1.1.json
[info] flattened: (87599, 6)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Preprocess train:   0%|          | 0/65699 [00:00<?, ? examples/s]

Preprocess valid:   0%|          | 0/21900 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = transformers.Trainer(


[info] Training …


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Token Iou
1,1.2273,1.126031,0.689469
2,0.9087,1.088474,0.708147


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[info] Eval …


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 1.0884735584259033, 'eval_token_iou': 0.7081472962720519, 'eval_runtime': 171.2183, 'eval_samples_per_second': 127.907, 'eval_steps_per_second': 7.996, 'epoch': 2.0}


Device set to use cuda:0


{'score': 0.9991446137428284, 'start': 0, 'end': 15, 'answer': 'Albert Einstein'}


