# Building a Question Answering System with BERT
Task Brief Description: 
 objective is to build a QA system by fine-tuning a pre-trained BERT model on a QA dataset
(such as SQuAD). You will preprocess the data, adapt the model for QA (predicting answer start
and end positions), fine-tune and evaluate the model, and finally demonstrate the system with
sample inputs. This project will assess your understanding of transformer architectures, data
preparation for QA tasks, model training, and evaluation techniques.


In [26]:
import torch
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("Number of GPUs available:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print("GPU", i, ":", torch.cuda.get_device_name(i))
else:
    print("No GPUs available, CPU will be used.")

CUDA available: True
Number of GPUs available: 1
GPU 0 : NVIDIA GeForce RTX 3060 Ti


In [27]:
pip install datasets




You should consider upgrading via the 'd:\projects\env\torch_env\Scripts\python.exe -m pip install --upgrade pip' command.


In [28]:
from datasets import load_dataset


In [29]:
# Load the SQuAD dataset
squad_dataset = load_dataset("squad")

KeyboardInterrupt: 

# Importing Libraries and Dependencies

In [None]:
import torch
from transformers import AutoTokenizer,DistilBertTokenizerFast, AutoModelForQuestionAnswering,TrainingArguments, Trainer
import numpy as np
import pandas as pd
import json


In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

# Preprocessing Function
the function preprocess the dataset by tokenizing questions and contexts, and aligning answer spans with their corresponding token positions.

In [None]:
def preprocess_function(examples):
    questions = examples["question"]
    contexts = examples["context"]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i in range(len(questions)):
        if "answers" in examples and len(examples["answers"][i]["answer_start"]) > 0:
            answer_start = examples["answers"][i]["answer_start"][0]
            answer_text = examples["answers"][i]["text"][0]
            start_char = answer_start
            end_char = start_char + len(answer_text)
            start_pos = None
            end_pos = None

            # Find token positions for start and end
            for j, (start, end) in enumerate(offset_mapping[i]):
                if start_pos is None and start_char >= start and start_char < end:
                    start_pos = j
                if end_pos is None and end_char > start and end_char <= end:
                    end_pos = j

            start_positions.append(start_pos if start_pos is not None else 0)
            end_positions.append(end_pos if end_pos is not None else 0)
        else:
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Use a pre-trained model and tokenizer
# model_name = "distilbert-base-cased-distilled-squad"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Preprocessing Training and Validation Datasets

In [None]:
# Preprocess the training and validation datasets
train_dataset = squad_dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=["id", "title", "context", "question", "answers"]
)

validation_dataset = squad_dataset["validation"].map(
    preprocess_function,
    batched=True,
    remove_columns=["id", "title", "context", "question", "answers"]
)

# Setting Up Training Arguments and Initializing the Trainer

In [None]:
# Define training arguments
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
learning_rate=5e-5,
num_train_epochs=2,
weight_decay=0.01,
save_total_limit=1
)
# Create a Trainer instance
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=validation_dataset
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.9673,1.381666
2,0.6339,1.426022


TrainOutput(global_step=10950, training_loss=0.8320393629378924, metrics={'train_runtime': 2996.537, 'train_samples_per_second': 58.467, 'train_steps_per_second': 3.654, 'total_flos': 1.7167621364554752e+16, 'train_loss': 0.8320393629378924, 'epoch': 2.0})

In [None]:
result = trainer.evaluate()
print(result)

{'eval_loss': 1.4260222911834717, 'eval_runtime': 53.6228, 'eval_samples_per_second': 197.118, 'eval_steps_per_second': 12.327, 'epoch': 2.0}


# Saving the Fine-Tuned Model and Tokenizer

In [None]:
# Save the trained model and tokenizer
model.save_pretrained("./models/model_fast")
tokenizer.save_pretrained("./models/tokenizer_fast")


('./models/tokenizer_fast\\tokenizer_config.json',
 './models/tokenizer_fast\\special_tokens_map.json',
 './models/tokenizer_fast\\vocab.txt',
 './models/tokenizer_fast\\added_tokens.json',
 './models/tokenizer_fast\\tokenizer.json')

# model evaluation using Exact Match (EM) and F1 scores

In [35]:
import evaluate

# Load SQuAD evaluation metric
metric = evaluate.load("squad")

# Function to decode token predictions into text
def decode_predictions(dataset, start_preds, end_preds):
    predictions = []
    references = []

    for i in range(len(start_preds)):
        # Decode predicted answer
        input_ids = dataset[i]["input_ids"]
        pred_answer = tokenizer.decode(input_ids[start_preds[i] : end_preds[i] + 1])

        # Decode true answer
        true_answer = dataset[i]["answers"]["text"][0] if "answers" in dataset[i] and len(dataset[i]["answers"]["text"]) > 0 else ""

        predictions.append({"id": str(i), "prediction_text": pred_answer})
        references.append({"id": str(i), "answers": {"text": [true_answer], "answer_start": [0]}})

    return predictions, references

# Function to compute EM and F1 scores
def compute_metrics(dataset, start_preds, end_preds):
    predictions, references = decode_predictions(dataset, start_preds, end_preds)
    results = metric.compute(predictions=predictions, references=references)
    
    return {
        "exact_match": results["exact_match"],
        "f1": results["f1"]
    }

In [37]:
# Function to get predictions from the model
def get_predictions(dataset):
    predictions = trainer.predict(dataset)
    start_preds = np.argmax(predictions.predictions[0], axis=1)
    end_preds = np.argmax(predictions.predictions[1], axis=1)

    return compute_metrics(dataset, start_preds, end_preds)


In [38]:
# Evaluate model
metrics = get_predictions(validation_dataset)
print("Evaluation Metrics:")
print(metrics)

Evaluation Metrics:
{'exact_match': 2.2232734153263953, 'f1': 0.0}


# Qualitative Testing

In [None]:
context = "Paris is the capital and most populous city of France."
question = "What is the capital of France?"

In [None]:
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    max_length=384,
    truncation=True,
    padding="max_length",
    return_offsets_mapping=True,
)

In [None]:
offset_mapping = inputs.pop("offset_mapping")

In [None]:
import torch

# Define the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Move inputs to the GPU
inputs = {k: v.to(device) for k, v in inputs.items()}

# Get predictions from the model
outputs = model(**inputs)

In [None]:
# Extract start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Find start and end indices with additional validation
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

In [None]:
sep_index = inputs["input_ids"][0].tolist().index(tokenizer.sep_token_id)

# Ensure the start and end indices point to the context section
start_index = max(start_index, sep_index + 1)
end_index = max(end_index, sep_index + 1)

# Ensure start_index isn't greater than end_index
if start_index > end_index:
    start_index, end_index = min(start_index, end_index), max(start_index, end_index)

# Extract the predicted answer from the context
if start_index <= end_index:
    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    predicted_answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
else:
    predicted_answer = ""

print("Predicted Answer:", predicted_answer)