In [1]:
import os
import json
import numpy as mp

# PyTorch for Model Implementation
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from collections import Counter # Tokenization

In [2]:
!pip install datasets
!pip install huggingface_hub
from datasets import load_dataset


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
# Load the nq_open dataset from Hugging Face
dataset = load_dataset("google-research-datasets/nq_open")

# View the structure of the dataset
print(dataset)

# Check a sample from the training set
print("Sample from the training set:")
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 87925
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 3610
    })
})
Sample from the training set:
{'question': 'where did they film hot tub time machine', 'answer': ['Fernie Alpine Resort']}


In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 87925
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 3610
    })
})

In [5]:


# Create a vocabulary from the dataset
word_counts = Counter()

# Iterate through the training data to collect word frequencies
for example in dataset['train']:
    question = example['question']  # Extract the question
    # Extract the first answer (assuming there's only one correct answer per example)
    answer = example['answer'][0] if example['answer'] else ""  # Handle cases with no answers
    word_counts.update(question.lower().split())  # Tokenize and count words in the question
    word_counts.update(answer.lower().split())  # Tokenize and count words in the answer

# Build the vocabulary, filtering out rare words (e.g., those appearing less than 5 times)
vocabulary = [word for word, count in word_counts.items() if count >= 5]  # Minimum word frequency
word_to_index = {word: index for index, word in enumerate(vocabulary)}  # Map words to indices
index_to_word = {index: word for index, word in enumerate(vocabulary)}  # Map indices to words

# Convert a text (question or answer) into a sequence of indices
def convert_text_to_indices(text):
    # If text is a list (like for answers), join it into a single string
    if isinstance(text, list):
        text = ' '.join(text)
    # Tokenize the text, convert tokens to indices, and handle out-of-vocabulary (OOV) tokens
    tokens = text.lower().split()  # Lowercase and split into words
    indices = [word_to_index.get(token, len(vocabulary)) for token in tokens]  # Assign OOV index for unknown words
    return indices

# Example usage: convert a sample question into indices
example_question = dataset['train'][0]['question']
example_indices = convert_text_to_indices(example_question)

print("Example question:", example_question)  # Display the original question
print("Converted indices:", example_indices)  # Display the indices representation of the question


# --- Optional: Pad sequences to ensure consistent lengths ---

# Define a maximum sequence length (adjust based on dataset characteristics)
max_length = 100

# Function to pad or truncate sequences to the desired length
def pad_sequence(indices, max_len):
    if len(indices) > max_len:
        # Truncate the sequence if it's longer than max_len
        return indices[:max_len]
    else:
        # Pad the sequence with a special padding index if it's shorter than max_len
        return indices + [len(vocabulary)] * (max_len - len(indices))

# Example usage: pad a sample sequence
example_padded_indices = pad_sequence(example_indices, max_length)
print("Padded indices:", example_padded_indices)  # Display the padded sequence


# Create a Dataset class to organize and preprocess data for model training
class NQDataset(Dataset):
    def __init__(self, data, max_length):
        """
        Initialize the dataset with raw data and the maximum sequence length.
        """
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """
        Return the number of examples in the dataset.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieve and preprocess an example at a specific index.
        """
        item = self.data[idx]  # Get the example at the given index
        question = convert_text_to_indices(item['question'])  # Convert question to indices
        answer = convert_text_to_indices(item['answer'])  # Convert answer to indices

        # Pad the sequences to ensure consistent lengths
        question = pad_sequence(question, self.max_length)
        answer = pad_sequence(answer, self.max_length)
        return torch.tensor(question), torch.tensor(answer)  # Return the processed question and answer as tensors

# Create an instance of the Dataset class for the training data
train_dataset = NQDataset(dataset['train'], max_length)

# Create a DataLoader for batching and shuffling the training data
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Example usage of the DataLoader to fetch a batch of data
for batch in train_dataloader:
    question_batch, answer_batch = batch
    print("Question batch shape:", question_batch.shape)  # Display the shape of the batch
    print("Answer batch shape:", answer_batch.shape)  # Display the shape of the batch
    break  # Exit after processing the first batch


Example question: where did they film hot tub time machine
Converted indices: [0, 1, 2, 3, 4, 12555, 5, 6]
Padded indices: [0, 1, 2, 3, 4, 12555, 5, 6, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555]
Question batch shape: torch.Size([32, 100])
Answer batch shape: torch.Size([32, 100])


In [6]:
class QAModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, max_length):
        super(QAModel, self).__init__()

        # Embedding layer to map word indices to dense vectors
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=vocab_size)

        # Bidirectional LSTM for sequence processing
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Linear layers to predict start and end indices
        self.fc_start = nn.Linear(hidden_dim * 2, 1)  # BiLSTM output size is hidden_dim * 2
        self.fc_end = nn.Linear(hidden_dim * 2, 1)

        # Activation (log-softmax for numerical stability)
        self.log_softmax = nn.LogSoftmax(dim=1)

        # Store max sequence length
        self.max_length = max_length

    def forward(self, x):
        # Input shape: [batch_size, max_length]

        # Pass input through embedding layer
        embeddings = self.embedding(x)  # Output shape: [batch_size, max_length, embedding_dim]

        # Pass embeddings through BiLSTM
        lstm_out, _ = self.lstm(embeddings)  # Output shape: [batch_size, max_length, hidden_dim * 2]

        # Predict start indices
        start_logits = self.fc_start(lstm_out).squeeze(-1)  # Output shape: [batch_size, max_length]
        start_probs = self.log_softmax(start_logits)

        # Predict end indices
        end_logits = self.fc_end(lstm_out).squeeze(-1)  # Output shape: [batch_size, max_length]
        end_probs = self.log_softmax(end_logits)

        return start_probs, end_probs


In [7]:
# Model hyperparameters
vocab_size = len(vocabulary)  # Size of the vocabulary
embedding_dim = 128  # Dimension of word embeddings
hidden_dim = 256  # Dimension of LSTM hidden states
max_length = 100  # Maximum sequence length (same as padding length)

# Initialize the model
model = QAModel(vocab_size, embedding_dim, hidden_dim, max_length)

# Print the model architecture
print(model)


QAModel(
  (embedding): Embedding(12556, 128, padding_idx=12555)
  (lstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (fc_start): Linear(in_features=512, out_features=1, bias=True)
  (fc_end): Linear(in_features=512, out_features=1, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)


In [8]:
# Define the loss function for start and end index predictions
loss_fn = nn.CrossEntropyLoss()

# Initialize the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 5  # Number of epochs to train
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()  # Set the model to training mode

    total_loss = 0  # Track the total loss for the epoch
    for batch in tqdm(train_dataloader):  # Iterate through batches
        question_batch, answer_batch = batch  # Get the questions and answers

        # Extract the ground truth start and end indices
        start_indices = torch.argmax(answer_batch, dim=1)  # Simulated (use real logic to extract these)
        end_indices = torch.argmax(answer_batch, dim=1)  # Simulated

        # Forward pass through the model
        start_probs, end_probs = model(question_batch)

        # Compute loss for start and end predictions
        start_loss = loss_fn(start_probs, start_indices)
        end_loss = loss_fn(end_probs, end_indices)

        # Combine the losses
        loss = start_loss + end_loss

        # Backpropagation
        optimizer.zero_grad()  # Clear gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update model weights

        # Accumulate loss
        total_loss += loss.item()

    # Print the average loss for the epoch
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_dataloader):.4f}")


Epoch 1/5


100%|██████████| 2748/2748 [25:04<00:00,  1.83it/s]


Epoch 1 Loss: 3.0296
Epoch 2/5


100%|██████████| 2748/2748 [25:18<00:00,  1.81it/s]


Epoch 2 Loss: 2.8382
Epoch 3/5


100%|██████████| 2748/2748 [25:09<00:00,  1.82it/s]


Epoch 3 Loss: 2.6242
Epoch 4/5


100%|██████████| 2748/2748 [25:11<00:00,  1.82it/s]


Epoch 4 Loss: 2.2939
Epoch 5/5


100%|██████████| 2748/2748 [25:15<00:00,  1.81it/s]

Epoch 5 Loss: 1.8442



