# Finetuning DistillBERT on Squad dataset for Question-Answering
by: Team 6

In [1]:
!pip install -U transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    

In [2]:
import collections
from functools import partial
import evaluate
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [28]:
model_name = "distilbert/distilbert-base-uncased"
dataset_name = "squad"

# Data Processing

In [29]:
# load the SQuAD dataset
data = load_dataset(dataset_name)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [30]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name, clean_up_tokenization_spaces=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [31]:
def preprocess_train_examples(examples, tokenizer=tokenizer,max_length=384, stride=128):
    """Process the training split of the SQuAD dataset.

    Process the training split of the SQuAD dataset to include tokenized questions
    and context, as well as the start and end positions of the answer within the context.

    Args:
        examples: A row from the dataset containing an example.
        tokenizer: The BERT tokenizer to be used.
        max_length: The maximum length of the input sequence. If exceeded, truncate the second
            sentence of a pair (or a batch of pairs) to fit within the limit.
        stride: The number of tokens to retain from the end of a truncated sequence, allowing
            for overlap between truncated and overflowing sequences.

    Returns:
        The processed example.
    """
    # Tokenize the questions and context sequences
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
      questions,
      examples["context"],
      truncation="only_second",
      padding="max_length",
      stride=stride,
      max_length=max_length,
      return_offsets_mapping=True,
      return_overflowing_tokens=True,
    )

    answers = examples["answers"]
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    # find the start and end positions of the answer within the context
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [32]:
processed_train_data = data["train"].map(preprocess_train_examples, batched=True, remove_columns=data["train"].column_names)
processed_train_data

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 88524
})

In [33]:
def preprocess_valid_examples(examples, tokenizer=tokenizer, max_length=384, stride=128):
    """Process the validation split of the SQuAD dataset.

    Process the training split of the SQuAD dataset to include the unique ID of each row,
    the tokenized questions and context, as well as the start and end positions of the answer
    within the context.

    Args:
        examples: A row from the dataset containing an example.
        tokenizer: The BERT tokenizer to be used.
        max_length: The maximum length of the input sequence. If exceeded, truncate the second
            sentence of a pair (or a batch of pairs) to fit within the limit.
        stride: The number of tokens to retain from the end of a truncated sequence, allowing
            for overlap between truncated and overflowing sequences.

    Returns:
        The processed example.
    """
    # Tokenize the questions and context sequences
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
      questions,
      examples["context"],
      truncation="only_second",
      padding="max_length",
      stride=stride,
      max_length=max_length,
      return_offsets_mapping=True,
      return_overflowing_tokens=True,
    )

    example_ids = []
    answers = examples["answers"]
    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    # find the start and end positions of the answer within the context
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids  # keep the unique ID of the example
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [34]:
processed_valid_data = data["validation"].map(preprocess_valid_examples, batched=True, remove_columns=data["validation"].column_names)
processed_valid_data

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions'],
    num_rows: 10784
})

# Model Fine-Tuning


In [35]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
training_args = TrainingArguments(
    output_dir='./checkpoints',
    logging_dir='./logs',
    eval_strategy="steps",
    logging_steps=500,
    logging_strategy="steps",
    save_steps=2000,
    save_strategy="steps",
    learning_rate=3e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    bf16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    report_to="none"
)

In [37]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_data,
    eval_dataset=processed_valid_data,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [38]:
def compute_metrics(start_logits, end_logits, features, examples, n_best=20, max_answer_length=50):
    """Compute the Exact Match (EM) and F1 score for the model's predictions.

    Reconstruct the actual text of the answer from the model's predictions and compare
    it to the ground truth for the validation dataset.

    Args:
        start_logits: Logits predicting the start position of the answer.
        end_logits: Logits predicting the end position of the answer.
        features: The processed validation dataset.
        examples: The raw validation dataset.
        n_best: The top-k answers to consider.
        max_answer_length: The maximum length of an answer to consider.

    Returns:
        The Exact Match (EM) and F1 score for the validation dataset.
    """

    metric = evaluate.load("squad")

    # keep a dictionary that maps examples to predictions through unique IDs
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # keep a list of the top-k best predictions for the start and end position indexes
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    # reconstruct the answer considering each prediction for the start and end positions
                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

Next, let's get the untrained model's predictions and evaluate its performance:

In [39]:
predictions, _, _ = trainer.predict(processed_valid_data)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, processed_valid_data, data["validation"])

100%|██████████| 10570/10570 [00:16<00:00, 644.84it/s]


{'exact_match': 0.0946073793755913, 'f1': 7.564121301850329}

As expected, the computed scores show that the model is randomly extracting text from the context to formulate answers.

In [15]:
random_indexes = np.random.randint(0, len(data["validation"]), 3)
subdataset = data["validation"].select(random_indexes)
qa_pipe_untrained = pipeline("question-answering", model=model, tokenizer=tokenizer, device='cuda')

for row in subdataset:
    context = row["context"]
    question = row["question"]
    answer = qa_pipe_untrained(question=question, context=context)

    print(f"Context: \n\n {context} \n")
    print(f"Question: \n\n {question} \n")
    print(f"Answer: \n\n {answer['answer']} \n")
    print("--- \n")

Device set to use cuda


Context: 

 During Reconstruction and the Gilded Age, Jacksonville and nearby St. Augustine became popular winter resorts for the rich and famous. Visitors arrived by steamboat and later by railroad. President Grover Cleveland attended the Sub-Tropical Exposition in the city on February 22, 1888 during his trip to Florida. This highlighted the visibility of the state as a worthy place for tourism. The city's tourism, however, was dealt major blows in the late 19th century by yellow fever outbreaks. In addition, extension of the Florida East Coast Railway further south drew visitors to other areas. From 1893 to 1938 Jacksonville was the site of the Florida Old Confederate Soldiers and Sailors Home with a nearby cemetery. 

Question: 

 Which US President visited Jacksonville in 1888? 

Answer: 

 St. Augustine became popular winter resorts 

--- 

Context: 

 Several commemorative events take place every year. Gatherings of thousands of people on the banks of the Vistula on Midsummer’s 

In [40]:
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time
500,2.4439,1.609603,0.0015
1000,1.5746,1.381201,0.0015
1500,1.4047,1.286077,0.0015
2000,1.3325,1.250838,0.0015
2500,1.2523,1.178715,0.0015
3000,1.1128,1.175933,0.0015
3500,0.9758,1.188706,0.0015
4000,0.997,1.130511,0.0015
4500,0.9649,1.144506,0.0015
5000,0.9705,1.120685,0.0015


TrainOutput(global_step=5534, training_loss=1.2687507177200565, metrics={'train_runtime': 10211.6671, 'train_samples_per_second': 17.338, 'train_steps_per_second': 0.542, 'total_flos': 1.7348902540849152e+16, 'train_loss': 1.2687507177200565, 'epoch': 2.0})

# Model Evaluation

Finally, we need to evaluate the model on the `validation` split of the dataset. We will use two metrics to systematically assess its performance:
- **Exact Match (EM)**: Calculate the percentage of predictions that exactly match the ground truth.
- **F1 Score**: Measure partial matches by considering overlapping words between the prediction and the ground truth.

In [41]:
predictions, _, _ = trainer.predict(processed_valid_data)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, processed_valid_data, data["validation"])

100%|██████████| 10570/10570 [00:17<00:00, 613.70it/s]


{'exact_match': 75.20340586565752, 'f1': 84.03478753847165}

Let's also provide an answer for the same random samples:

In [42]:
qa_pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device='cuda')

for row in subdataset:
    context = row["context"]
    question = row["question"]
    answer = qa_pipe(question=question, context=context)

    print(f"Context: \n\n {context} \n")
    print(f"Question: \n\n {question} \n")
    print(f"Answer: \n\n {answer['answer']} \n")
    print("--- \n")

Device set to use cuda


Context: 

 During Reconstruction and the Gilded Age, Jacksonville and nearby St. Augustine became popular winter resorts for the rich and famous. Visitors arrived by steamboat and later by railroad. President Grover Cleveland attended the Sub-Tropical Exposition in the city on February 22, 1888 during his trip to Florida. This highlighted the visibility of the state as a worthy place for tourism. The city's tourism, however, was dealt major blows in the late 19th century by yellow fever outbreaks. In addition, extension of the Florida East Coast Railway further south drew visitors to other areas. From 1893 to 1938 Jacksonville was the site of the Florida Old Confederate Soldiers and Sailors Home with a nearby cemetery. 

Question: 

 Which US President visited Jacksonville in 1888? 

Answer: 

 Grover Cleveland 

--- 

Context: 

 Several commemorative events take place every year. Gatherings of thousands of people on the banks of the Vistula on Midsummer’s Night for a festival called

### Save the model

In [49]:
model.save_pretrained("/content/drive/MyDrive/distilbert-squad-finetuned_model")
tokenizer.save_pretrained("/content/drive/MyDrive/distilbert-squad-finetuned_tokenizer")

('/content/drive/MyDrive/distilbert-squad-finetuned_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/distilbert-squad-finetuned_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/distilbert-squad-finetuned_tokenizer/vocab.txt',
 '/content/drive/MyDrive/distilbert-squad-finetuned_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/distilbert-squad-finetuned_tokenizer/tokenizer.json')