In [None]:
# Core Python Libraries
import os  # For creating directories and managing file paths
import numpy as np  # For numerical operations, like working with arrays

# PyTorch Core Libraries
import torch  # Core PyTorch library for tensor operations
import torch.nn as nn  # For building neural network layers (e.g., Linear, Embedding)
import torch.optim as optim  # For optimization algorithms (e.g., Adam, SGD)

# PyTorch Data Utilities
from torch.utils.data import Dataset, DataLoader  # For handling datasets and batching

# PyTorch Transformer Modules
from torch.nn import TransformerEncoder, TransformerEncoderLayer  # Lightweight Transformer components



# Tokenizer Library
from transformers import AutoTokenizer  # For tokenizing text into input IDs for Transformer models

# Progress Bar Utility
from tqdm import tqdm  # For tracking progress in training and evaluation loops


In [None]:

!pip install datasets
!pip install huggingface_hub
# Hugging Face Datasets Library
from datasets import load_dataset  # For loading and preprocessing the SQuAD v2 dataset


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# Explanation of Imports
## 1. Core Python Libraries
os:
Used to create directories (e.g., for saving checkpoints).
Manages file paths during model deployment or saving intermediate files.
numpy:
Assists in mathematical operations, such as calculating evaluation metrics (e.g., F1 Score).
## 2. PyTorch Core Libraries
torch:
The main PyTorch library for creating tensors, managing computations, and handling GPU acceleration.
torch.nn:
Provides building blocks for neural networks (e.g., Linear, Embedding, Dropout).
torch.optim:
Contains optimization algorithms like Adam or SGD, which adjust model weights to minimize the loss function.
## 3. PyTorch Data Utilities
Dataset:
Used to create a custom dataset class for SQuAD v2.
Facilitates the preprocessing and management of training and validation data.
DataLoader:
Handles batching, shuffling, and parallel data loading to improve training efficiency.
## 4. PyTorch Transformer Modules
TransformerEncoder:
Implements the encoder part of the Transformer architecture.
Processes input sequences and outputs contextualized embeddings.
TransformerEncoderLayer:
A single layer of the Transformer encoder, including multi-head self-attention and feedforward sublayers.
## 5. Hugging Face Datasets Library
load_dataset:
Downloads (and caches) the SQuAD v2 dataset from the Hugging Face datasets hub.
Returns the dataset with its predefined training and validation splits; tokenization and other preprocessing are done separately later in this notebook.
## 6. Tokenizer Library
AutoTokenizer:
A tokenizer class from Hugging Face's transformers library.
Converts text (e.g., context and question) into token IDs, segment IDs, and attention masks.
Works seamlessly with Transformer-based architectures.
## 7. Progress Bar Utility
tqdm:
Displays a progress bar during loops (e.g., training and validation).
Helps track time per epoch and overall progress.


In [None]:
# Fetch SQuAD v2 from the Hugging Face Hub
dataset = load_dataset("squad_v2")

# Overview of the available splits, their columns and their sizes
print("Dataset Structure:")
print(dataset)

# Show the first record of the training split, then of the validation split
for split_name, header in (
    ("train", "\nSample from the training set:"),
    ("validation", "\nSample from the validation set:"),
):
    print(header)
    print(dataset[split_name][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

Sample from the training set:
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and fe

In [None]:
# Token budget per example: question + context + special tokens
max_length = 384

# Load the tokenizer of a lightweight model (DistilBERT, uncased vocabulary)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Return (start_token, end_token) covering context chars [start_char, end_char), or None."""
    start_token, end_token = None, None
    for idx, ((tok_start, tok_end), seq_id) in enumerate(zip(offsets, sequence_ids)):
        # Only context tokens (second sequence) carry context-relative offsets.
        # Question tokens have question-relative offsets, and special/padding
        # tokens report (0, 0), so both must be skipped to avoid false matches.
        if seq_id != 1:
            continue
        # Containment (rather than equality) also handles answers whose
        # boundaries fall inside a token.
        if tok_start <= start_char < tok_end:
            start_token = idx
        if tok_start < end_char <= tok_end:
            end_token = idx
    if start_token is None or end_token is None or end_token < start_token:
        return None
    return start_token, end_token


def preprocess_squad(example):
    """
    Preprocess a single example from the SQuAD v2 dataset.

    The question and context are tokenized as a pair, and the first answer's
    character span is converted to token positions within that pair. Examples
    without an answer, or whose answer cannot be located among the context
    tokens (e.g. it was truncated away), are labelled with the position of the
    [CLS] token for both start and end.

    Relies on the notebook-level `tokenizer` and `max_length`.

    Args:
        example: A dictionary containing 'context', 'question', and 'answers'
            (where 'answers' has the lists 'text' and 'answer_start').
    Returns:
        The tokenizer output (tensors keep their leading batch dimension of 1
        because of return_tensors="pt") with 'offset_mapping' removed and the
        integer token indices 'start_positions' and 'end_positions' added.
    """
    # Tokenize question and context as a pair
    tokenized = tokenizer(
        example["question"],
        example["context"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,  # To map token indices back to the original text
        return_tensors="pt",  # Return PyTorch tensors
    )

    # Position (not vocabulary id!) of the [CLS] token, used as the
    # "no answer" label for both start and end.
    input_ids = tokenized["input_ids"][0]
    if hasattr(input_ids, "tolist"):
        input_ids = input_ids.tolist()
    cls_token_id = tokenizer.cls_token_id
    cls_index = input_ids.index(cls_token_id) if cls_token_id in input_ids else 0

    start_positions, end_positions = cls_index, cls_index

    answers = example["answers"]
    if len(answers["text"]) > 0:  # Answerable question
        # Character span of the first answer in the original context
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        offsets = tokenized["offset_mapping"][0]
        if hasattr(offsets, "tolist"):
            offsets = offsets.tolist()

        span = _find_answer_token_span(
            offsets, tokenized.sequence_ids(0), start_char, end_char
        )
        # If the answer is not found, keep the [CLS] label
        if span is not None:
            start_positions, end_positions = span

    # Remove offset mapping (not needed for training)
    tokenized.pop("offset_mapping")

    # Add start and end positions to the tokenized inputs
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized

In [None]:
# Run preprocess_squad over every example (one at a time) of the train split,
# then of the validation split
tokenized_train, tokenized_validation = (
    dataset[split_name].map(preprocess_squad, batched=False)
    for split_name in ("train", "validation")
)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Check a few samples from the tokenized training set
def verify_preprocessing(tokenized_dataset, original_dataset, tokenizer, num_samples=5):
    """
    Print original vs. decoded text for the first few examples so the
    preprocessing step can be checked by eye.

    Args:
        tokenized_dataset: The tokenized dataset; each item must provide
            'input_ids', 'start_positions' and 'end_positions'. 'input_ids'
            may be a flat sequence of ids or nested with a leading batch
            dimension (list or tensor).
        original_dataset: The original dataset; each item must provide
            'question' and 'context'.
        tokenizer: The tokenizer used for preprocessing.
        num_samples: Number of samples to verify. Clamped to the size of the
            datasets so asking for more samples than exist does not raise.

    Returns:
        None
    """
    sample_count = min(num_samples, len(tokenized_dataset), len(original_dataset))

    for idx in range(sample_count):
        # Get original and tokenized examples
        original = original_dataset[idx]
        tokenized = tokenized_dataset[idx]

        # Convert tensors/arrays to plain Python lists before decoding
        input_ids = tokenized["input_ids"]
        if hasattr(input_ids, "tolist"):
            input_ids = input_ids.tolist()

        # Normalize to a list of id rows: nested input keeps its rows, a flat
        # id list becomes a single row. Decoding row by row (instead of
        # iterating over whatever the outer level happens to be) gives the
        # same text for both layouts.
        if len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple)):
            rows = input_ids
        else:
            rows = [input_ids]

        # Decodes the full stored input_ids sequence, not just the context part
        decoded_context = " ".join(
            tokenizer.decode(row, skip_special_tokens=True) for row in rows
        )

        # Round-trip the question through the tokenizer on its own
        question_input_ids = tokenizer(original["question"])["input_ids"]
        decoded_question = tokenizer.decode(question_input_ids, skip_special_tokens=True)

        # Display original and processed data
        print(f"Original Question: {original['question']}")
        print(f"Decoded Question: {decoded_question}")
        print(f"Original Context: {original['context'][:100]}...")  # Truncate for display
        print(f"Decoded Context: {decoded_context[:100]}...")
        print(f"Start Position (Token): {tokenized['start_positions']}")
        print(f"End Position (Token): {tokenized['end_positions']}")
        print("-" * 50)


# Spot-check the first five training examples after tokenization
verify_preprocessing(
    tokenized_dataset=tokenized_train,
    original_dataset=dataset["train"],
    tokenizer=tokenizer,
    num_samples=5,
)


Original Question: When did Beyonce start becoming popular?
Decoded Question: when did beyonce start becoming popular?
Original Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American si...
Decoded Context: when did beyonce start becoming popular? beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon...
Start Position (Token): 75
End Position (Token): 78
--------------------------------------------------
Original Question: What areas did Beyonce compete in when she was growing up?
Decoded Question: what areas did beyonce compete in when she was growing up?
Original Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American si...
Decoded Context: what areas did beyonce compete in when she was growing up? beyonce giselle knowles - carter ( / biːˈ...
Start Position (Token): 68
End Position (Token): 70
--------------------------------------------------
Original Question: When did Beyonc