In [2]:
!pip install transformers
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
from transformers import AutoTokenizer, AdamW, BertForQuestionAnswering
from torch.cuda.amp import GradScaler, autocast  # For Automatic Mixed Precision

# Load data function to reuse for both train and validation datasets
def load_squad_data(file_path):
    with open(file_path, 'rb') as f:
        squad_dict = json.load(f)

    contexts, questions, answers = [], [], []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers

# Load train and validation data once
train_texts, train_queries, train_answers = load_squad_data(Path(r"D:\Downloads\archive (2)\train-v1.1.json"))
val_texts, val_queries, val_answers = load_squad_data(Path(r"D:\Downloads\archive (2)\dev-v1.1.json"))

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, train_queries, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, val_queries, truncation=True, padding=True)

# Function to calculate and add token positions
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []

    for i in range(len(answers)):
        start_idx = answers[i]['answer_start']
        end_idx = start_idx + len(answers[i]['text'])  # Calculate end index using answer text length

        start_positions.append(encodings.char_to_token(i, start_idx))
        end_positions.append(encodings.char_to_token(i, end_idx - 1))  # End index should point to the last token

        # Handle truncation cases where token positions are not found
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, end_idx - 2)  # Adjust for truncation
            if end_positions[-1] is None:
                end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Now apply this function to both train and validation encodings
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

# Custom Dataset Class
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Create Dataset and DataLoader with pin_memory and larger batch size
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model setup
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)

# Automatic Mixed Precision setup
scaler = GradScaler()

# Hyperparameters
epochs = 3
print_every = 500
grad_accumulation_steps = 2  # Simulate larger batch sizes

# Training loop
train_losses = []
val_losses = []

whole_train_eval_time = time.time()

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    print(f"\nEpoch {epoch + 1}/{epochs}")

    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optimizer.zero_grad()

        # Automatic Mixed Precision (AMP)
        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss / grad_accumulation_steps  # Normalize loss for gradient accumulation

        scaler.scale(loss).backward()

        # Update model parameters after accumulating gradients
        if (batch_idx + 1) % grad_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        total_train_loss += loss.item() * grad_accumulation_steps

        if (batch_idx + 1) % print_every == 0:
            print(f"Batch {batch_idx + 1}/{len(train_loader)} - Loss: {loss.item():.4f}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
                loss = outputs.loss

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"Validation Loss: {avg_val_loss:.4f}")

total_time = time.time() - whole_train_eval_time
print(f"Total Training Time: {total_time:.2f} seconds")

# Save the model
torch.save(model.state_dict(), "finetunedmodel.pt")




Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():



Epoch 1/3


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batch 500/5475 - Loss: 1.4253
Batch 1000/5475 - Loss: 0.5324
Batch 1500/5475 - Loss: 0.6527
Batch 2000/5475 - Loss: 1.0864
Batch 2500/5475 - Loss: 0.8128
Batch 3000/5475 - Loss: 0.6015
Batch 3500/5475 - Loss: 0.3411
Batch 4000/5475 - Loss: 0.7605
Batch 4500/5475 - Loss: 0.7239
Batch 5000/5475 - Loss: 0.5888
Training Loss: 1.5364


  with autocast():


Validation Loss: 1.1839

Epoch 2/3
Batch 500/5475 - Loss: 0.5269
Batch 1000/5475 - Loss: 0.8885
Batch 1500/5475 - Loss: 0.7134
Batch 2000/5475 - Loss: 0.5960
Batch 2500/5475 - Loss: 0.4270
Batch 3000/5475 - Loss: 0.2735
Batch 3500/5475 - Loss: 0.6517
Batch 4000/5475 - Loss: 0.4618
Batch 4500/5475 - Loss: 0.4078
Batch 5000/5475 - Loss: 0.4785
Training Loss: 0.9940
Validation Loss: 1.1213

Epoch 3/3
Batch 500/5475 - Loss: 0.5703
Batch 1000/5475 - Loss: 0.2953
Batch 1500/5475 - Loss: 0.2911
Batch 2000/5475 - Loss: 0.3003
Batch 2500/5475 - Loss: 0.2909
Batch 3000/5475 - Loss: 0.3565
Batch 3500/5475 - Loss: 0.6252
Batch 4000/5475 - Loss: 0.2483
Batch 4500/5475 - Loss: 0.4970
Batch 5000/5475 - Loss: 0.4957
Training Loss: 0.8243
Validation Loss: 1.1492
Total Training Time: 4533.61 seconds


In [3]:
import torch
from transformers import AutoTokenizer, BertForQuestionAnswering

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Load the fine-tuned model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load('finetunedmodel.pt'))  # Load the model state dict
model.eval()  # Set the model to evaluation mode

# Move model to the appropriate device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to perform inference
def answer_question(context, question):
    # Tokenize input
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt").to(device)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the most likely beginning and end of the answer
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Find the tokens with the highest `start` and `end` scores
    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1  # End index is inclusive

    # Convert token indices back to words
    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))

    return answer

# Example usage
context = "The Eiffel Tower is located in Paris, France. It was constructed in 1889."
question = "Where is the Eiffel Tower located?"

# Perform inference
answer = answer_question(context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('finetunedmodel.pt'))  # Load the model state dict


Question: Where is the Eiffel Tower located?
Answer: paris, france


In [8]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# Save the model
model.save_pretrained("./my-finetuned-bert-model")

# Save the tokenizer
tokenizer.save_pretrained("./my-finetuned-bert-model")


('./my-finetuned-bert-model\\tokenizer_config.json',
 './my-finetuned-bert-model\\special_tokens_map.json',
 './my-finetuned-bert-model\\vocab.txt',
 './my-finetuned-bert-model\\added_tokens.json',
 './my-finetuned-bert-model\\tokenizer.json')

Summary of Building a BERT-based Question Answering Model
This project focuses on fine-tuning a pre-trained BERT (Bidirectional Encoder Representations from Transformers) model for a Question Answering (QA) task using the SQuAD (Stanford Question Answering Dataset). BERT is a transformer-based model that has been pre-trained on large amounts of text data for tasks like masked language modeling and next-sentence prediction. Fine-tuning BERT for a downstream task like QA involves adapting the model’s weights to the specific domain of the task (in this case, answering questions based on passages of text).

Step 1: Data Preprocessing
The dataset used in this project is SQuAD, which contains passages (contexts), questions, and corresponding answers. The data is loaded from JSON files (train-v1.1.json and dev-v1.1.json). The first step involves reading this data and extracting the contexts, questions, and answers. These answers are marked with their starting positions (answer_start), and from this, we also calculate the ending position (answer_end) based on the length of the answer text.

This step faces challenges such as ensuring that tokenization is consistent. BERT uses subword tokenization, meaning some words are split into smaller units. This can cause the start and end indices of the answers to shift, especially when long or unusual words are split. A critical task is to map the original character positions to token positions accurately.

Step 2: Tokenization
BERT requires inputs to be tokenized into subwords using the BERT tokenizer. The model is pre-trained with a specific tokenizer (WordPiece). Here, both the contexts (passages) and questions are tokenized. The tokenizer handles padding and truncation to ensure that each input sequence fits within the model’s maximum input length (usually 512 tokens for BERT). Special tokens like [CLS], which represents the start of a sequence, and [SEP], which separates questions from contexts, are automatically added by the tokenizer.

A key challenge during tokenization is handling cases where the context or question exceeds the maximum length. While BERT can handle up to 512 tokens, contexts may sometimes be longer. In these cases, truncation is applied, but care must be taken not to cut off important parts of the context that contain the answer. An alternative strategy could involve splitting long contexts and feeding them to the model in multiple parts.

Step 3: Model Setup
A pre-trained BERT model is loaded from Hugging Face's transformers library. Specifically, BertForQuestionAnswering is used, which adds additional layers to BERT to handle start and end position predictions for answers. This model has already learned rich language representations from its pre-training phase, but it requires fine-tuning on the specific SQuAD dataset to adapt it to the QA task.

Step 4: Training Process
The model is fine-tuned using AdamW (Adam optimizer with weight decay), a common optimizer for transformer-based models. Training involves passing tokenized questions and contexts into the model, where the model predicts two values for each token: the probability that the token is the start of the answer and the probability that it is the end of the answer.

During training, the key challenge is ensuring that the model learns effectively from the data without overfitting. This is mitigated by splitting the dataset into training and validation sets. The training set is used to update the model weights, while the validation set is used to check the model's performance on unseen data after each epoch. The loss function used is cross-entropy, which calculates the difference between the predicted start and end positions and the true start and end positions of the answers.

Automatic Mixed Precision (AMP) is applied to speed up training, allowing operations to be performed using lower precision (FP16) where possible, reducing memory usage without sacrificing much accuracy. Gradient accumulation is also used to simulate a larger batch size by accumulating gradients over several smaller batches before updating the weights, which is especially helpful on GPUs with limited memory.

Step 5: Challenges in Fine-Tuning
The main challenges in fine-tuning BERT for QA are:

Memory Usage: BERT models are large and require significant computational resources, especially when working with long contexts. This project addresses this by using mixed precision training and gradient accumulation, which reduce memory requirements.

Answer Position Mapping: Accurately mapping the answer's character positions to the correct token positions is crucial. If the start and end positions are not correctly aligned after tokenization, the model will be confused during training. Tokenization can sometimes split words, so adjustments need to be made to handle these shifts.

Overfitting: As BERT has a large capacity, it can overfit to the training data if not regularized properly. Careful use of the validation set, a moderate learning rate (3e-5), and a small number of epochs (typically 2-4) help mitigate overfitting. Weight decay (via AdamW) also helps in regularizing the model.

Long Inputs: Some contexts exceed the token limit for BERT, so handling long sequences by truncating or splitting them into manageable parts is important. This needs to be done carefully to avoid removing parts of the context that contain the answer.

Step 6: Validation and Evaluation
The model is evaluated on a validation set using a similar procedure to the training process, but without updating the model's weights (i.e., in evaluation mode). For each context-question pair, the model predicts the most likely start and end positions of the answer, and the loss is calculated similarly to the training phase.

The performance of the model can be measured using metrics like Exact Match (EM), which checks if the predicted answer exactly matches the true answer, and F1-score, which measures the overlap between the predicted and true answers. These metrics give an indication of how well the model generalizes to unseen data.

Step 7: Saving the Model
Once training is complete, the model's state dictionary (the model’s learned parameters) is saved to a file (finetunemodel.pt). This allows the fine-tuned model to be reloaded later for inference or further fine-tuning without retraining from scratch. The tokenizer is also saved, as it is essential to use the same tokenization strategy during inference as was used during training.

Step 8: Inference
During inference, a user provides a context and a question. The context-question pair is tokenized and passed through the fine-tuned model, which predicts the start and end positions of the answer. These token positions are then converted back into the original words using the tokenizer, giving the final answer.

Conclusion
Building and fine-tuning a BERT-based QA model is a complex process that involves handling large datasets, managing tokenization challenges, and carefully tuning hyperparameters for effective learning. While BERT provides a strong foundation with its pre-trained language representations, fine-tuning it for a specific task like QA requires careful handling of data preprocessing, model setup, and training mechanics to achieve high performance. With the model fine-tuned and validated, it can be deployed for real-world QA tasks or shared with others through platforms like Hugging Face's Model Hub.