In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/My Drive/Colab Notebooks/task6"

/content/drive/My Drive/Colab Notebooks/task6


In [3]:
!pwd

/content/drive/My Drive/Colab Notebooks/task6


In [17]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
    pipeline,
)
from transformers.data.processors.squad import SquadV2Processor
from transformers.data.metrics.squad_metrics import compute_predictions_logits
from transformers.data.processors.squad import squad_convert_examples_to_features

In [24]:
# 1. Load the SQuAD dataset
# We'll use a small subset for demonstration purposes to make it runnable quickly.
# For full training, you would use the entire 'squad' dataset.
print("Loading dataset...")
raw_datasets = load_dataset("squad", split="train[:5000]")
# Load a small validation set to prevent the Trainer from raising a ValueError
raw_datasets_eval = load_dataset("squad", split="validation[:500]")

Loading dataset...


In [25]:
# 2. Load a pre-trained tokenizer and model for question answering
# We'll use 'distilbert-base-uncased' as it's a smaller, faster alternative to BERT.
# For better performance, consider 'bert-base-uncased' or 'roberta-base'.
model_name = "distilbert-base-uncased"
print(f"Loading tokenizer and model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Loading tokenizer and model: distilbert-base-uncased


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# 3. Preprocessing the data
# This is a critical step for question answering. We need to convert the text examples
# into a format the model can understand, specifically by finding the token indices
# that correspond to the start and end of the answer.
print("Preprocessing the dataset...")
# Max sequence length.
max_length = 384
# Stride for overlapping context chunks when context is too long.
doc_stride = 128


def preprocess_function(examples):
    # Tokenize the question and context together.
    # `truncation="only_second"` ensures we only truncate the context, not the question.
    # `return_overflowing_tokens=True` handles cases where a context is longer than `max_length`.
    # `return_offsets_mapping=True` maps token indices back to character positions in the original text,
    # which is crucial for finding the answer's start and end positions.
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # We need to find the start and end positions of the answer in the tokenized input.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_index = sample_map[i]
        answer = answers[sample_index]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = next(idx for idx, x in enumerate(sequence_ids) if x == 1)
        context_end = len(sequence_ids) - 1 - next(
            idx for idx, x in enumerate(reversed(sequence_ids)) if x == 1
        )

        # If the answer is not fully in this context chunk, label it with start/end of 0.
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise, find the start and end token indices of the answer.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


# Apply the preprocessing function to the entire dataset
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets.column_names,
)
# Preprocess the evaluation dataset as well
tokenized_datasets_eval = raw_datasets_eval.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets_eval.column_names,
)

Preprocessing the dataset...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [27]:
# 4. Set up TrainingArguments
# This class holds all the hyperparameters for the training process.
print("Setting up training arguments...")
output_dir = "./qa_finetuning_output"
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

Setting up training arguments...


In [28]:
# 5. Initialize the Trainer
# The Trainer API simplifies the training loop significantly.
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets_eval,  # Add the evaluation dataset here
    tokenizer=tokenizer,
)

Initializing Trainer...


  trainer = Trainer(


In [29]:
# 6. Start the training
print("Starting training...")
trainer.train()

Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdullahzulfiqar068[0m ([33mabdullahzulfiqar068-abdullahtechdevelops[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,2.213636
2,2.744900,1.882044
3,2.744900,1.848298


TrainOutput(global_step=957, training_loss=2.1235651954961794, metrics={'train_runtime': 485.9245, 'train_samples_per_second': 31.449, 'train_steps_per_second': 1.969, 'total_flos': 1497480506016768.0, 'train_loss': 2.1235651954961794, 'epoch': 3.0})

In [30]:
# 7. Save the model
print("Saving the fine-tuned model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("\n--- Inference with the fine-tuned model ---")

Saving the fine-tuned model...

--- Inference with the fine-tuned model ---


In [31]:
# 8. Load the fine-tuned model and tokenizer
# Use `AutoModelForQuestionAnswering.from_pretrained` to load your saved model.
print("Loading saved model and tokenizer for inference...")
qa_pipeline = pipeline(
    "question-answering",
    model=output_dir,
    tokenizer=output_dir
)

Loading saved model and tokenizer for inference...


Device set to use cuda:0


In [32]:
# 9. Define a new context and question
# This is a new example that the model has not seen during training.
context = "The quick brown fox jumps over the lazy dog. The fox is known for its speed and agility, and the dog is very sleepy."
question = "What is the fox known for?"

print(f"\nContext: {context}")
print(f"Question: {question}")



Context: The quick brown fox jumps over the lazy dog. The fox is known for its speed and agility, and the dog is very sleepy.
Question: What is the fox known for?


In [33]:
# 10. Get the prediction
# The pipeline handles all the tokenization and model inference for you.
result = qa_pipeline(question=question, context=context)

print(f"\nPredicted Answer: '{result['answer']}'")
print(f"Score: {result['score']:.4f}")
print(f"Start: {result['start']}, End: {result['end']}")



Predicted Answer: 'speed and agility'
Score: 0.1345
Start: 70, End: 87
