In [None]:
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
import os
import dotenv

dotenv.load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

login(HF_TOKEN)

squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [5]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [7]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.302172
2,2.749100,1.631831
3,2.749100,1.572548


'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/54/2d/542d6d6c211d5f19920153c852547a742042ce802f5700875d0ea5155afb412f/ac9deb2985330d1e5983664c81fafd1563537822bb4d700bfa23013da94faac3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250608%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250608T185132Z&X-Amz-Expires=86400&X-Amz-Signature=92b56db8c5486b898e9d9f8824449d35076ae6a87acd2e2d3cf92f85aaf5a061&X-Amz-SignedHeaders=host&partNumber=1&uploadId=YkutSxr3g8hOejQlQTcuplc1mlgxatjFgAkRqa3lTFL95Nb1NKkN3qT5BMbXYramHLIWSqnWYwLbExy.Ug_xi6azgBaLVXW9tP3LKaF8hMJJmTRGvk1zAPQRC0zR5DAT&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2406)')))"), '(Request ID: 208e685c-7582-4fd7-8e55-65a88a2e998b)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/54/2d/5

TrainOutput(global_step=750, training_loss=2.297464111328125, metrics={'train_runtime': 42564.8138, 'train_samples_per_second': 0.282, 'train_steps_per_second': 0.018, 'total_flos': 1175877900288000.0, 'train_loss': 2.297464111328125, 'epoch': 3.0})

In [None]:
import os

os.makedirs("model", exist_ok=True)

tokenizer.save_pretrained("model/tokenizer")
model.save_pretrained("model/model")
squad.save_to_disk("model/squad_dataset")
tokenized_squad.save_to_disk("model/tokenized_squad_dataset")
trainer.save_model("model/trainer_model")

print("All components saved successfully in the 'model' folder!")

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

All components saved successfully in the 'model' folder!


In [None]:
eval_results = trainer.evaluate()

print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")

Evaluation Results:
eval_loss: 1.5725481510162354
eval_runtime: 402.3968
eval_samples_per_second: 2.485
eval_steps_per_second: 0.157
epoch: 3.0


In [None]:
import torch

sample_data = squad["test"][0]
question = sample_data["question"]
context = sample_data["context"]

inputs = tokenizer(question, context, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
predicted_answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

print(f"Question: {question}")
print(f"Context: {context[:200]}...")
print(f"Predicted Answer: {predicted_answer}")
print(f"Ground Truth Answer: {sample_data['answers']['text'][0]}")

Question: What interface feature did Apple unsuccessfully try to patent?
Context: Apple's application to the United States Patent and Trademark Office for a patent on "rotational user inputs", as used on the iPod interface, received a third "non-final rejection" (NFR) in August 200...
Predicted Answer: 
Ground Truth Answer: rotational user inputs


In [None]:
original_model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")


print("BEFORE FINE-TUNING (Original DistilBERT):")
with torch.no_grad():
    original_outputs = original_model(**inputs)

original_answer_start = original_outputs.start_logits.argmax()
original_answer_end = original_outputs.end_logits.argmax()

if original_answer_start > original_answer_end:
    original_answer_start, original_answer_end = original_answer_end, original_answer_start

original_answer_tokens = inputs.input_ids[0, original_answer_start : original_answer_end + 1]
original_predicted_answer = tokenizer.decode(original_answer_tokens, skip_special_tokens=True)

print(f"Predicted Answer: '{original_predicted_answer}'")
print(f"Start/End positions: {original_answer_start}/{original_answer_end}")

print("AFTER FINE-TUNING (Our trained model):")
fine_tuned_start = answer_start_index
fine_tuned_end = answer_end_index

if fine_tuned_start > fine_tuned_end:
    fine_tuned_start, fine_tuned_end = fine_tuned_end, fine_tuned_start

fine_tuned_tokens = inputs.input_ids[0, fine_tuned_start : fine_tuned_end + 1]
fine_tuned_answer = tokenizer.decode(fine_tuned_tokens, skip_special_tokens=True)

print(f"Predicted Answer: '{fine_tuned_answer}'")
print(f"Start/End positions: {fine_tuned_start}/{fine_tuned_end}")

print("GROUND TRUTH:")
print(f"Correct Answer: '{sample_data['answers']['text'][0]}'")
print(f"Question: {question}")
print(f"Context snippet: ...{context[200:400]}...")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== COMPARISON: BEFORE vs AFTER FINE-TUNING ===

BEFORE FINE-TUNING (Original DistilBERT):
Predicted Answer: 'the ipod line, which creative technology dubbed the " zen patent ", granted on august 9, 2005. on may 15, 2006, creative filed another suit against apple with the united states district court'
Start/End positions: 94/132


AFTER FINE-TUNING (Our trained model):
Predicted Answer: 'inputs ", as used on the ipod interface, received a third " non - final rejection " ( nfr ) in august 2005. also in august 2005, creative technology, one of apple ' s main rivals in the mp3 player market, announced that it held a patent on part of the music'
Start/End positions: 31/89


GROUND TRUTH:
Correct Answer: 'rotational user inputs'
Question: What interface feature did Apple unsuccessfully try to patent?
Context snippet: ...5. Also in August 2005, Creative Technology, one of Apple's main rivals in the MP3 player market, announced that it held a patent on part of the music selection interface us