In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the SQuAD question-answering dataset through the `datasets` library.
squad = load_dataset('squad')

In [3]:
# Peek at one training record to see its fields (id, title, context, question, answers).
squad['train'][24566]

{'id': '5705ea7975f01819005e775c',
 'title': 'New_Delhi',
 'context': "The first major extension of New Delhi outside of Lutyens' Delhi came in the 1950s when the Central Public Works Department (CPWD) developed a large area of land southwest of Lutyens' Delhi to create the diplomatic enclave of Chanakyapuri, where land was allotted for embassies, chanceries, high commissions and residences of ambassadors, around wide central vista, Shanti Path.",
 'question': 'What was the name of the enclave created by the Central Public Works Department?',
 'answers': {'text': ['Chanakyapuri'], 'answer_start': [226]}}

In [4]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForQuestionAnswering

In [5]:
# Tokenizer for the same checkpoint name that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [6]:
# DistilBERT with a question-answering head. The head weights (qa_outputs.*) are not part of
# the checkpoint and are newly initialized, which is what the warning printed below reports.
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
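# NOTE: earlier draft, kept for reference only. It tokenizes the inputs but does not
# compute start/end answer positions, so its output carries no training labels.
# def preprocess(example):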
#     return tokenizer(
#         example["question"],
#         example["context"],
#         truncation="only_second",
#         max_length=384,
#         stride=128,
#         padding="max_length"
#     )


# tokenized_squad = squad.map(preprocess, batched=True)

In [8]:
def preprocess_training_examples(examples):
    """Tokenize a batch of SQuAD examples and attach answer-span labels.

    Every question/context pair is encoded with the module-level ``tokenizer``
    (only the context is truncated; sequences are padded to 384 tokens). The
    character span of the first listed answer is then mapped onto token
    indices with the tokenizer's offset mapping.

    Args:
        examples: A column-oriented batch providing ``"question"``,
            ``"context"`` and ``"answers"``. Each answers entry is a dict
            with ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output with ``offset_mapping`` removed and two added
        columns, ``start_positions`` and ``end_positions`` (one token index
        per example). Both are 0 when the example has no answer, or when the
        answer is not fully covered by the context tokens that survived
        truncation.
    """
    # NOTE(review): `stride` presumably only matters when overflowing tokens are
    # returned; none are requested here, so each example yields exactly one
    # feature and context beyond max_length is dropped -- confirm against the
    # tokenizer documentation before relying on it.
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions, end_positions = [], []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]

        # Handle empty answers (SQuAD v2 or broken entries)
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Only context tokens (sequence id 1) may be matched: the offsets of
        # question tokens are relative to the question string, so comparing
        # them with context character positions could label a question token
        # as part of the answer. Skipping them also copes with an encoding
        # that contains no context token at all.
        idx_start, idx_end = None, None
        for idx, seq_id in enumerate(sequence_ids):
            if seq_id != 1:
                continue
            s, e = offsets[idx]
            if s <= start_char < e:
                idx_start = idx
            if s < end_char <= e:
                idx_end = idx

        # If either boundary is missing from the context window, label the
        # example with position 0 (the original code treats it as the CLS token).
        if idx_start is None or idx_end is None:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(idx_start)
            end_positions.append(idx_end)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [None]:
# Fine-tuning configuration: evaluation, checkpointing and logging all use the "epoch" strategy.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Per-epoch bookkeeping
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)


In [10]:
# Run the preprocessing over every split in batches and drop the raw SQuAD
# columns so that only the tokenizer outputs and the span labels remain.
tokenized_squad = squad.map(
    function=preprocess_training_examples,
    remove_columns=squad["train"].column_names,
    batched=True,
)

print("✅ Tokenization complete!", tokenized_squad, sep="\n")


✅ Tokenization complete!
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10570
    })
})


In [11]:
import torch

# Report the installed PyTorch build and whether it can see a CUDA device.
print("CUDA available:", torch.cuda.is_available())
print("Torch version:", torch.__version__)
print("Built with CUDA:", torch.version.cuda)
if not torch.cuda.is_available():
    print("Running on CPU")
else:
    print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
Torch version: 2.7.1+cu118
Built with CUDA: 11.8
GPU: NVIDIA GeForce GTX 1650


In [12]:
# NOTE(review): torch is already imported and CUDA availability is already printed
# by the previous cell; the only new information here is the device count.
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA devices:", torch.cuda.device_count())


CUDA available: True
CUDA devices: 1


In [13]:
# Wire the model, the training configuration and the tokenized splits into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer` keyword of
# `Trainer.__init__` is deprecated in recent transformers releases and triggers a
# FutureWarning at this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    processing_class=tokenizer,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.4523,1.123431
2,0.8835,1.096142


TrainOutput(global_step=10950, training_loss=1.1679125196204339, metrics={'train_runtime': 11590.4708, 'train_samples_per_second': 15.116, 'train_steps_per_second': 0.945, 'total_flos': 1.7167621364554752e+16, 'train_loss': 1.1679125196204339, 'epoch': 2.0})

In [14]:
# Write the fine-tuned model and the tokenizer files into the same directory.
final_model_dir = "./final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


('./final_model\\tokenizer_config.json',
 './final_model\\special_tokens_map.json',
 './final_model\\vocab.txt',
 './final_model\\added_tokens.json',
 './final_model\\tokenizer.json')