In [2]:
import os
import json
import numpy as mp

# PyTorch for Model Implementation
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from collections import Counter # Tokenization

In [3]:
!pip install datasets
!pip install huggingface_hub
from datasets import load_dataset


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
# Load the nq_open dataset from Hugging Face
dataset = load_dataset("google-research-datasets/nq_open")

# View the structure of the dataset
print(dataset)

# Check a sample from the training set
print("Sample from the training set:")
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 87925
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 3610
    })
})
Sample from the training set:
{'question': 'where did they film hot tub time machine', 'answer': ['Fernie Alpine Resort']}


In [5]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

In [6]:
# Initialize the WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Train the tokenizer on questions and answers in the dataset
trainer = WordPieceTrainer(
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"],
    vocab_size=30522,  # Standard vocabulary size (can be adjusted)
)
texts = [
    example['question'] + " " + " ".join(example['answer'])
    for example in dataset['train']
]
tokenizer.train_from_iterator(texts, trainer)

# Save and reload the tokenizer (optional)
tokenizer.save("tokenizer.json")
tokenizer = Tokenizer.from_file("tokenizer.json")

# Function to convert text to subword token indices
def convert_text_to_indices_with_tokenizer(text):
    if isinstance(text, list):
        text = " ".join(text)  # Join if text is a list (e.g., answers)
    encoded = tokenizer.encode(text)  # Use the tokenizer for encoding
    return encoded.ids  # Return token IDs

# Pad or truncate sequences to a consistent length
max_length = 100  # Define a maximum sequence length
pad_token_id = tokenizer.token_to_id("[PAD]")  # Get the ID for the [PAD] token


def pad_sequence_with_tokenizer(indices, max_len):
    if len(indices) > max_len:
        return indices[:max_len]
    else:
        return indices + [pad_token_id] * (max_len - len(indices))


# Update Dataset class to use subword tokenization
class NQDatasetWithTokenizer(Dataset):
    def __init__(self, data, max_length):
        """
        Initialize the dataset with raw data and the maximum sequence length.
        """
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """
        Return the number of examples in the dataset.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieve and preprocess an example at a specific index.
        """
        item = self.data[idx]  # Get the example at the given index
        question = convert_text_to_indices_with_tokenizer(item['question'])  # Convert question to token indices
        answer = convert_text_to_indices_with_tokenizer(item['answer'])  # Convert answer to token indices

        # Pad the sequences to ensure consistent lengths
        question = pad_sequence_with_tokenizer(question, self.max_length)
        answer = pad_sequence_with_tokenizer(answer, self.max_length)
        return torch.tensor(question), torch.tensor(answer)  # Return tensors


# Create an instance of the updated Dataset class for training data
train_dataset = NQDatasetWithTokenizer(dataset['train'], max_length)

# Create a DataLoader for batching and shuffling the training data
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Example usage of the DataLoader to fetch a batch of data
for batch in train_dataloader:
    question_batch, answer_batch = batch
    print("Question batch shape:", question_batch.shape)  # Shape: [batch_size, max_length]
    print("Answer batch shape:", answer_batch.shape)  # Shape: [batch_size, max_length]
    break  # Exit after processing the first batch

Question batch shape: torch.Size([32, 100])
Answer batch shape: torch.Size([32, 100])


In [10]:
class QAModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, max_length):
        super(QAModel, self).__init__()

        # Embedding layer to map word indices to dense vectors
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=vocab_size)

        # Bidirectional LSTM for sequence processing
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2)

        # Linear layers to predict start and end indices
        self.fc_start = nn.Linear(hidden_dim * 2, 1)  # BiLSTM output size is hidden_dim * 2
        self.fc_end = nn.Linear(hidden_dim * 2, 1)

        # Activation (log-softmax for numerical stability)
        self.log_softmax = nn.LogSoftmax(dim=1)

        # Store max sequence length
        self.max_length = max_length

    def forward(self, x):
        # Input shape: [batch_size, max_length]

        # Pass input through embedding layer
        embeddings = self.embedding(x)  # Output shape: [batch_size, max_length, embedding_dim]

        # Pass embeddings through BiLSTM
        lstm_out, _ = self.lstm(embeddings)  # Output shape: [batch_size, max_length, hidden_dim * 2]

        # Predict start indices
        start_logits = self.fc_start(lstm_out).squeeze(-1)  # Output shape: [batch_size, max_length]
        start_probs = self.log_softmax(start_logits)

        # Predict end indices
        end_logits = self.fc_end(lstm_out).squeeze(-1)  # Output shape: [batch_size, max_length]
        end_probs = self.log_softmax(end_logits)

        return start_probs, end_probs


In [11]:
# Model hyperparameters
# Get the vocabulary from the tokenizer
vocabulary = tokenizer.get_vocab()  # Get the vocabulary from the trained tokenizer

vocab_size = len(vocabulary)  # Size of the vocabulary
embedding_dim = 128  # Dimension of word embeddings
hidden_dim = 256  # Dimension of LSTM hidden states
max_length = 100  # Maximum sequence length (same as padding length)

# Initialize the model
model = QAModel(vocab_size, embedding_dim, hidden_dim, max_length)

# Print the model architecture
print(model)

QAModel(
  (embedding): Embedding(30523, 128, padding_idx=30522)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc_start): Linear(in_features=512, out_features=1, bias=True)
  (fc_end): Linear(in_features=512, out_features=1, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)


In [15]:
# Use a subset of the training data for quick debugging
train_subset = dataset['train'].select(range(5000))  # Select first 5000 examples
train_dataloader = DataLoader(NQDatasetWithTokenizer(train_subset, max_length), batch_size=32, shuffle=True)

# Use a subset of validation data
val_subset = dataset['validation'].select(range(1000))  # Select first 1000 examples
val_dataloader = DataLoader(NQDatasetWithTokenizer(val_subset, max_length), batch_size=32)

In [16]:
import os

# Directory to save model checkpoints
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Define training loop with validation and checkpoints
def train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    loss_fn,
    num_epochs=5,
    checkpoint_dir="checkpoints"
):
    best_val_loss = float("inf")  # Track the best validation loss

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()  # Set the model to training mode

        total_loss = 0  # Track training loss for this epoch
        for batch_idx, batch in enumerate(train_dataloader):
            question_batch, answer_batch = batch  # Fetch batch

            # Extract true start and end indices (simulated here, replace with real logic)
            start_indices = torch.argmax(answer_batch, dim=1)
            end_indices = torch.argmax(answer_batch, dim=1)

            # Forward pass
            start_probs, end_probs = model(question_batch)

            # Compute loss
            start_loss = loss_fn(start_probs, start_indices)
            end_loss = loss_fn(end_probs, end_indices)
            loss = start_loss + end_loss

            # Backpropagation
            optimizer.zero_grad()  # Clear gradients
            loss.backward()  # Compute gradients
            optimizer.step()  # Update weights

            total_loss += loss.item()

            # Print progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f"Batch {batch_idx + 1}/{len(train_dataloader)}: Loss = {loss.item():.4f}")

        # Average loss for the epoch
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1} Training Loss: {avg_train_loss:.4f}")

        # Validate the model
        val_loss, _, _ = evaluate_model(model, val_dataloader, loss_fn)
        print(f"Epoch {epoch + 1} Validation Loss: {val_loss:.4f}")

        # Save the model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(checkpoint_dir, f"best_model_epoch_{epoch + 1}.pth"))
            print("Model checkpoint saved.")

    print("Training Complete.")

# Train the model
train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    num_epochs=5,
    checkpoint_dir=checkpoint_dir,
)


Epoch 1/5
Batch 100/157: Loss = 2.6834
Epoch 1 Training Loss: 2.6047


100%|██████████| 32/32 [00:13<00:00,  2.42it/s]


Validation Loss: 4.1057
Exact Match (EM): 37.70%
F1 Score: 37.81%
Epoch 1 Validation Loss: 4.1057
Model checkpoint saved.
Epoch 2/5
Batch 100/157: Loss = 2.3915
Epoch 2 Training Loss: 2.0876


100%|██████████| 32/32 [00:13<00:00,  2.40it/s]


Validation Loss: 4.4948
Exact Match (EM): 33.90%
F1 Score: 34.13%
Epoch 2 Validation Loss: 4.4948
Epoch 3/5
Batch 100/157: Loss = 0.8616
Epoch 3 Training Loss: 1.2946


100%|██████████| 32/32 [00:12<00:00,  2.48it/s]


Validation Loss: 6.0445
Exact Match (EM): 33.00%
F1 Score: 33.10%
Epoch 3 Validation Loss: 6.0445
Epoch 4/5
Batch 100/157: Loss = 0.4604
Epoch 4 Training Loss: 0.5460


100%|██████████| 32/32 [00:13<00:00,  2.46it/s]


Validation Loss: 7.8341
Exact Match (EM): 32.50%
F1 Score: 32.62%
Epoch 4 Validation Loss: 7.8341
Epoch 5/5
Batch 100/157: Loss = 0.1426
Epoch 5 Training Loss: 0.2202


100%|██████████| 32/32 [00:13<00:00,  2.41it/s]

Validation Loss: 9.5762
Exact Match (EM): 30.70%
F1 Score: 30.82%
Epoch 5 Validation Loss: 9.5762
Training Complete.





In [13]:
def evaluate_model(model, dataloader, loss_fn):
    """
    Evaluate the model on validation data and compute metrics.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0  # Track total loss
    exact_match = 0  # Track exact matches
    f1_total = 0  # Track total F1 score
    total_examples = 0  # Count total examples

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch in tqdm(dataloader):
            question_batch, answer_batch = batch  # Get batch data

            # Extract true start and end indices
            start_indices = torch.argmax(answer_batch, dim=1)  # Simulated (replace with real logic)
            end_indices = torch.argmax(answer_batch, dim=1)  # Simulated

            # Forward pass
            start_probs, end_probs = model(question_batch)

            # Compute loss
            start_loss = loss_fn(start_probs, start_indices)
            end_loss = loss_fn(end_probs, end_indices)
            loss = start_loss + end_loss
            total_loss += loss.item()

            # Convert predictions to actual indices
            predicted_start = torch.argmax(start_probs, dim=1)
            predicted_end = torch.argmax(end_probs, dim=1)

            # Calculate Exact Match (EM) and F1
            for i in range(len(question_batch)):
                true_start, true_end = start_indices[i].item(), end_indices[i].item()
                pred_start, pred_end = predicted_start[i].item(), predicted_end[i].item()

                # Check for exact match
                if true_start == pred_start and true_end == pred_end:
                    exact_match += 1

                # Compute F1 score
                true_set = set(range(true_start, true_end + 1))
                pred_set = set(range(pred_start, pred_end + 1))
                overlap = len(true_set & pred_set)
                if overlap == 0:
                    f1 = 0
                else:
                    precision = overlap / len(pred_set)
                    recall = overlap / len(true_set)
                    f1 = 2 * (precision * recall) / (precision + recall)
                f1_total += f1

            total_examples += len(question_batch)

    avg_loss = total_loss / len(dataloader)
    em_score = exact_match / total_examples * 100
    f1_score = f1_total / total_examples * 100

    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Exact Match (EM): {em_score:.2f}%")
    print(f"F1 Score: {f1_score:.2f}%")
    return avg_loss, em_score, f1_score


# Run the evaluation on the validation set
val_dataset = NQDataset(dataset['validation'], max_length)
val_dataloader = DataLoader(val_dataset, batch_size=32)

val_loss, val_em, val_f1 = evaluate_model(model, val_dataloader, loss_fn)


NameError: name 'NQDataset' is not defined