In [4]:
import os
import json
import numpy as mp

# PyTorch for Model Implementation
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from collections import Counter # Tokenization

In [5]:
!pip install datasets
!pip install huggingface_hub
from datasets import load_dataset


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [6]:
# Download (or reuse the local cache of) the nq_open dataset from the Hugging Face Hub.
dataset = load_dataset("google-research-datasets/nq_open")

# Show the split layout, then a heading and the first raw training record.
# sep="\n" places each item on its own line, which yields exactly the same
# text as three consecutive print() calls.
print(dataset, "Sample from the training set:", dataset["train"][0], sep="\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 87925
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 3610
    })
})
Sample from the training set:
{'question': 'where did they film hot tub time machine', 'answer': ['Fernie Alpine Resort']}


In [7]:
# Bare expression: the notebook renders the DatasetDict repr (splits, features, row counts) as this cell's output.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 87925
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 3610
    })
})

In [13]:
# prompt: Next Steps
# Preprocessing:
# Tokenize the questions and answers.
# Build a vocabulary of words/tokens.
# Convert text data into numerical formats for the model (e.g., word indices).

# Tokenization
# Word-level vocabulary: text is lowercased and split on whitespace
# (a pre-trained tokenizer would likely give better results).
word_counts = Counter()
for example in dataset['train']:
    question = example['question']
    # Only the first entry of the 'answer' list is counted; an empty list contributes nothing.
    answer = example['answer'][0] if example['answer'] else ""
    # Single update per example: question tokens first, then answer tokens, so the
    # first-seen order of words (and therefore their index assignment) is preserved.
    word_counts.update(question.lower().split() + answer.lower().split())

# Keep words that occur at least 5 times; list order follows first appearance in the data.
vocabulary = [token for token, freq in word_counts.items() if freq >= 5]
# Forward (word -> id) and reverse (id -> word) lookups over the same ordering.
word_to_index = {token: position for position, token in enumerate(vocabulary)}
index_to_word = dict(enumerate(vocabulary))

# Convert text to numerical format
def convert_text_to_indices(text):
    """Encode text as a list of vocabulary indices.

    Args:
        text: A string, or a list of strings. A list is joined with single
            spaces into one string before tokenizing.

    Returns:
        A list of ints, one per whitespace-separated lowercased token. Tokens
        missing from the module-level `word_to_index` map to
        `len(vocabulary)`, the shared out-of-vocabulary id.
    """
    # Every unknown token shares the first id past the end of the vocabulary.
    unknown_index = len(vocabulary)
    joined = ' '.join(text) if isinstance(text, list) else text
    return [word_to_index.get(word, unknown_index) for word in joined.lower().split()]

# Example usage
# Encode the first training question as a quick sanity check of the vocabulary mapping.
example_question = dataset['train'][0]['question']
example_indices = convert_text_to_indices(text=example_question)

# Show the raw question next to its encoded form.
print("Example question:", example_question)
print("Converted indices:", example_indices)


# --- Further preprocessing steps (optional) ---

# Pad sequences
max_length = 100  # Example value, adjust as needed
def pad_sequence(indices, max_len, pad_index=None):
    """Truncate or right-pad a sequence of token indices to exactly `max_len` items.

    Args:
        indices: Sequence of token ids (a list, or any sliceable sequence).
        max_len: Target length of the returned list.
        pad_index: Id used to fill the tail of short sequences. When None
            (the default) it is `len(vocabulary)`, which keeps the previous
            behaviour.

    Returns:
        A new list: the first `max_len` items when the input is longer than
        `max_len`, otherwise the input followed by `pad_index` repeated up to
        `max_len`.

    Note:
        The default padding id is the same id that `convert_text_to_indices`
        assigns to out-of-vocabulary tokens, so padding and unknown words
        cannot be told apart downstream. Pass a dedicated `pad_index` (and
        size the embedding table accordingly) to keep them distinct.
    """
    if len(indices) > max_len:
        # Too long: keep only the leading `max_len` tokens.
        return list(indices[:max_len])
    if pad_index is None:
        # Looked up at call time so it follows the current module-level `vocabulary`.
        pad_index = len(vocabulary)
    # list(...) copies the input, so the caller's sequence is never aliased or mutated.
    return list(indices) + [pad_index] * (max_len - len(indices))


# Example of Padding
# Bring the encoded sample question up to the fixed `max_length` and show the result.
example_padded_indices = pad_sequence(indices=example_indices, max_len=max_length)
print("Padded indices:", example_padded_indices)


# Create dataset class and dataloader (for efficient batching)
class NQDataset(Dataset):
    """Map-style dataset yielding fixed-length (question, answer) index tensors.

    Each record of `data` is expected to expose a 'question' string and an
    'answer' field (a list of answer strings in the dataset loaded above).
    Text is encoded with the module-level `convert_text_to_indices` and then
    truncated or padded with `pad_sequence`.

    Args:
        data: Indexable collection of records supporting `len()` and `data[idx]`.
        max_length: Length every question and answer sequence is padded or
            truncated to.
    """

    def __init__(self, data, max_length):
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.data)

    @staticmethod
    def _first_answer(answers):
        """Return the first answer string, or "" when the answer list is empty."""
        if isinstance(answers, (list, tuple)):
            return answers[0] if answers else ""
        # Already a plain string (or another scalar): pass it through untouched.
        return answers

    def _encode(self, text):
        """Encode `text` into a `max_length`-long integer tensor."""
        padded = pad_sequence(convert_text_to_indices(text), self.max_length)
        # Explicit dtype: index tensors stay int64 even when the sequence is empty.
        return torch.tensor(padded, dtype=torch.long)

    def __getitem__(self, idx):
        """Return `(question_tensor, answer_tensor)` for record `idx`.

        Only the first answer is encoded, matching how the vocabulary is
        counted (first answer per example). Joining every alias into one
        string would produce a target that is not a real answer.
        """
        item = self.data[idx]
        question = self._encode(item['question'])
        answer = self._encode(self._first_answer(item['answer']))
        return question, answer


# Example of the Dataset Class
# Wrap the training split, then batch it: 32 examples per batch, reshuffled on every pass.
train_dataset = NQDataset(data=dataset['train'], max_length=max_length)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Example usage of dataloader
# Pull a single batch and report the tensor shapes as a smoke test.
for batch in train_dataloader:
    question_batch, answer_batch = batch
    print("Question shape", question_batch.shape)
    print("Answer shape", answer_batch.shape)
    break  # one batch is enough to confirm the shapes


Example question: where did they film hot tub time machine
Converted indices: [0, 1, 2, 3, 4, 12555, 5, 6]
Padded indices: [0, 1, 2, 3, 4, 12555, 5, 6, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555, 12555]
Question shape torch.Size([32, 100])
Answer shape torch.Size([32, 100])
