In [4]:
import torch
import transformers
import pandas as pd
import numpy as np
import json

from sklearn import model_selection
from datasets import Dataset
from tqdm.notebook import tqdm

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
config = {
    "max_length": 384,
    "model_path": "distilbert-base-uncased-distilled-squad",
    "output_dir": "./my-qa-model",
    "train_batch_size": 16,
    "valid_batch_size": 16,
    "learning_rate": 3e-5,
    "epochs": 2,
    "debug": False,
}

print(config)

{'max_length': 384, 'model_path': 'distilbert-base-uncased-distilled-squad', 'output_dir': './my-qa-model', 'train_batch_size': 16, 'valid_batch_size': 16, 'learning_rate': 3e-05, 'epochs': 2, 'debug': False}


In [None]:
data = json.load(open("/kaggle/input/stanford-question-answering-dataset/train-v1.1.json"))

flattened_data = []
for sample in data["data"]:
    for para in sample["paragraphs"]:
        ctx = para["context"]
        for qas in para["qas"]:
            ans = qas["answers"][0]
            flattened_data.append({
                "context": ctx,
                "question": qas["question"],
                "answer": ans["text"],
                "answer_start": ans["answer_start"],
            })

df = pd.DataFrame(flattened_data)
df["answer_end"] = df["answer_start"] + df["answer"].str.len()
print("SQuAD rows:", df.shape[0])
display(df.head(3))

if config["debug"]:
    df = df.sample(10_000, random_state=1123).reset_index(drop=True)

SQuAD rows: 87599


Unnamed: 0,context,question,answer,answer_start,answer_end
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296


In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"], use_fast=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def _find_context_token_bounds(sequence_ids):
    context_start, context_end = None, None
    for i, sid in enumerate(sequence_ids):
        if sid == 1 and context_start is None:
            context_start = i
        if sid == 1:
            context_end = i
    if context_start is None:
        return -1, -1
    return context_start, context_end + 1 

def _char_to_token_map(context_offsets):
    char2tok = {}
    last_tok = None
    for tok_idx, (c0, c1) in enumerate(context_offsets):
        if c0 == c1:  
            continue
        for c in range(c0, c1):
            char2tok[c] = tok_idx
        last_tok = tok_idx

    if last_tok is not None and len(context_offsets):
        first_nonempty = next((i for i,(a,b) in enumerate(context_offsets) if a != b), None)
        if first_nonempty is not None:
            first_a, first_b = context_offsets[first_nonempty]
            for c in range(0, first_a):
                char2tok[c] = first_nonempty
    return char2tok

In [None]:
def preprocess_function(sample):
    inputs = tokenizer(
        sample["question"],
        sample["context"],
        max_length=config["max_length"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    sequence_ids = inputs.sequence_ids()
    offsets = inputs.pop("offset_mapping")

    c_start, c_end = _find_context_token_bounds(sequence_ids)

    if c_start == -1:
        inputs["start_positions"] = 0
        inputs["end_positions"] = 0
        return inputs

    context_offsets = offsets[c_start:c_end]
    char2tok = _char_to_token_map(context_offsets)

    ans_char_start = int(sample["answer_start"])
    ans_char_end_excl = int(sample["answer_end"]) 
    ans_char_end_incl = max(ans_char_start, ans_char_end_excl - 1)

    def _safe_lookup(char_idx):
        if not char2tok:
            return None
        if char_idx in char2tok:
            return char2tok[char_idx]
        keys = sorted(char2tok.keys())
        if not keys:
            return None
        if char_idx < keys[0]:
            return char2tok[keys[0]]
        if char_idx > keys[-1]:
            return char2tok[keys[-1]]
        import bisect
        pos = bisect.bisect_left(keys, char_idx)
        return char2tok[keys[pos-1]] if pos > 0 else char2tok[keys[0]]

    start_tok_in_context = _safe_lookup(ans_char_start)
    end_tok_in_context = _safe_lookup(ans_char_end_incl)

    if start_tok_in_context is None or end_tok_in_context is None:
        start_pos = end_pos = 0
    else:
        start_pos = c_start + start_tok_in_context
        end_pos = c_start + end_tok_in_context

        if start_pos > end_pos or sequence_ids[start_pos] != 1 or sequence_ids[end_pos] != 1:
            start_pos = end_pos = 0

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos
    return inputs

In [None]:
train_df, valid_df = model_selection.train_test_split(
    df, test_size=0.20, random_state=1123, shuffle=True
)
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
valid_ds = Dataset.from_pandas(valid_df.reset_index(drop=True))

train_ds = train_ds.map(preprocess_function, remove_columns=train_ds.column_names)
valid_ds = valid_ds.map(preprocess_function, remove_columns=valid_ds.column_names)

Map:   0%|          | 0/70079 [00:00<?, ? examples/s]

Map:   0%|          | 0/17520 [00:00<?, ? examples/s]

In [None]:
model = transformers.AutoModelForQuestionAnswering.from_pretrained(config["model_path"])

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    preds = eval_pred.predictions
    if isinstance(preds, (tuple, list)) and len(preds) == 2:
        start_logits, end_logits = preds
    else:
        start_logits, end_logits = preds[0], preds[1]

    start_logits = np.asarray(start_logits)
    end_logits   = np.asarray(end_logits)
    pred_start = np.argmax(start_logits, axis=-1)
    pred_end   = np.argmax(end_logits,   axis=-1)

    labels = eval_pred.label_ids
    if isinstance(labels, dict):
        gold_start = np.asarray(labels.get("start_positions"))
        gold_end   = np.asarray(labels.get("end_positions"))
    elif isinstance(labels, (tuple, list)) and len(labels) == 2:
        gold_start = np.asarray(labels[0])
        gold_end   = np.asarray(labels[1])
    else:
        labels = np.asarray(labels)
        if labels.ndim == 2 and labels.shape[1] == 2:
            gold_start, gold_end = labels[:, 0], labels[:, 1]
        else:
            half = labels.shape[-1] // 2
            gold_start, gold_end = labels[..., :half], labels[..., half:]

    def _span_iou(ps, pe, gs, ge):
        if ps > pe or gs > ge:
            return 0.0
        p = set(range(int(ps), int(pe) + 1))
        g = set(range(int(gs), int(ge) + 1))
        u = len(p | g)
        return len(p & g) / u if u else 0.0

    ious = [_span_iou(ps, pe, gs, ge) for ps, pe, gs, ge in zip(pred_start, pred_end, gold_start, gold_end)]
    return {"iou": float(np.mean(ious))}


In [None]:
import torch
from transformers import TrainingArguments, Trainer, __version__ as hf_version

training_args = TrainingArguments(
    output_dir=config["output_dir"],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["valid_batch_size"],
    learning_rate=config["learning_rate"],
    num_train_epochs=config["epochs"],
    dataloader_num_workers=4,
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="iou",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,   
    eval_dataset=valid_ds,   
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

2025-09-07 16:57:45.652834: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757264266.000738      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757264266.099783      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Transformers version: 4.43.3


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [14]:
trainer.train()
print(trainer.evaluate())
trainer.save_state()
trainer.save_model()



Epoch,Training Loss,Validation Loss,Iou
1,0.7435,0.636641,0.824738
2,0.5071,0.634161,0.829595




{'eval_loss': 0.6341614723205566, 'eval_iou': 0.8295945956259319, 'eval_runtime': 137.8043, 'eval_samples_per_second': 127.137, 'eval_steps_per_second': 3.977, 'epoch': 2.0}


In [None]:
from transformers import pipeline

device_arg = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline(
    task="question-answering",
    model=config["output_dir"],
    tokenizer=tokenizer,
    device=device_arg,
)

preds = []
for idx, row in valid_df.reset_index(drop=True).head(11).iterrows():
    pred = qa_pipeline(question=row["question"], context=row["context"])
    preds.append({
        "pred_answer": pred.get("answer", ""),
        "pred_score": pred.get("score", np.nan),
        "gold_answer": row["answer"],
    })

pred_df = pd.DataFrame(preds)
display(pred_df)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,pred_answer,pred_score,gold_answer
0,1861,0.996929,1861
1,potential political and economic effects,0.11573,federal assistance
2,Anglo-Burmese,0.233552,Anglo-Burmese
3,"the ""off"" output is limited to leakage current...",0.033016,"the ""off"" output is limited to leakage current..."
4,Cathal Coughlan and Sean O'Hagan,0.775352,Cathal Coughlan and Sean O'Hagan
5,Federal Reserve Note,0.527285,Federal Reserve Note
6,U.S. News & World Report,0.804999,U.S. News & World Report
7,islands,0.746229,islands
8,Second World Congress of Lay Apostolate,0.726166,Second World Congress of Lay Apostolate
9,one third,0.958794,one third
