# Import dependencies

In [2]:
from datasets import load_dataset # Hugging Face library for accessing and streaming large public text datasets
from transformers import AutoTokenizer # Provides transformer-compatible tokenizers
from torch.utils.data import Dataset, DataLoader # PyTorch utilities for building custom datasets and data loaders
from torch.nn.utils.rnn import pad_sequence # Function to pad variable-length sequences in a batch to the same length
from tqdm import tqdm # Progress-bar library for displaying iteration progress in loops
import torch, regex as re, unicodedata # PyTorch core library
# regex: advanced regular-expression library used for text cleaning and normalization
# unicodedata: handles Unicode normalization to ensure consistent text representation

  from .autonotebook import tqdm as notebook_tqdm


# Dataset collection

In [3]:
# English Wikipedia (March 2022 dump), streamed from the Hugging Face Hub
# so that nothing has to be downloaded to disk up front.
# NOTE(review): newer `datasets` releases may handle script-based datasets
# such as this one differently — confirm against the installed version.
wiki = load_dataset(
    "wikipedia",
    "20220301.en",
    split="train",
    streaming=True,
)

# OpenWebText: a large collection of web pages, also opened in streaming mode.
owt = load_dataset(
    "openwebtext",
    split="train",
    streaming=True,
)

# Accumulator for the raw text samples drawn from both sources.
texts = []

# Collect a small sample from each source

In [4]:
# Number of documents to take from each streaming source (demo-sized sample).
N_SAMPLES_PER_SOURCE = 1000

# Reset the accumulator here so that re-running this cell does not append a
# second copy of every document to an already-populated list.
texts = []

for source in (wiki, owt):
    # Each record is a dict; its "text" field holds the article/page body as a
    # plain string. zip() stops as soon as range() is exhausted, so exactly
    # N_SAMPLES_PER_SOURCE records are pulled from the stream and no more.
    texts.extend(
        record["text"]
        for _, record in zip(range(N_SAMPLES_PER_SOURCE), source)
    )

print(f"✅ Loaded {len(texts)} documents (streaming mode, no disk write)")

✅ Loaded 2000 documents (streaming mode, no disk write)


# Cleaning and normalization

In [5]:
#Normalize and clean a raw text string.
def clean_text(t: str) -> str:
    """Normalize a raw document before tokenization.

    Steps, in order:
      1. Unicode NFKC normalization.
      2. Lowercasing.
      3. Removal of HTML comments and tag-shaped markup.
      4. Collapsing every whitespace run to a single space and trimming the ends.
      5. Shortening any character repeated five or more times to three.

    Args:
        t: Raw text.

    Returns:
        The cleaned, lowercase text with no newlines or repeated spaces.
    """
    t = unicodedata.normalize("NFKC", t)
    t = t.lower()
    # Strip only things that look like markup: an HTML comment, or "<" followed
    # (optionally after "/" or "!") by a letter, with no nested angle brackets.
    # A bare "<[^>]+>" would treat any "<" as a tag opener and delete everything
    # up to the next ">", e.g. the middle of "a < 5 and b > 3" or text after "<3".
    t = re.sub(r"<!--.*?-->|<[/!]?[a-z][^<>]*>", " ", t, flags=re.DOTALL)
    # Runs after tag removal so the spaces inserted above are collapsed too.
    t = re.sub(r"\s+", " ", t).strip()
    # Five or more of the same character in a row -> exactly three.
    t = re.sub(r"(.)\1{4,}", r"\1\1\1", t)
    return t

# Minimum number of whitespace-separated words a document must contain
# (after cleaning) to be kept.
MIN_WORDS = 50

# `cleaned` holds the surviving documents in input order; `seen` holds the same
# strings for exact-duplicate detection.
cleaned, seen = [], set()

for txt in texts:
    # Skip missing/empty documents before they reach clean_text().
    if not txt:
        continue
    txt = clean_text(txt)
    # Apply the length filter to the cleaned text: counting words on the raw
    # string would let markup-heavy documents through even though little
    # content is left once the tags have been stripped.
    if len(txt.split()) < MIN_WORDS:
        continue
    # Exact-match deduplication on the normalized text.
    if txt in seen:
        continue
    seen.add(txt)
    cleaned.append(txt)

print(f"✅ After cleaning: {len(cleaned)} unique documents")

✅ After cleaning: 1985 unique documents


# Tokenization and chunking

In [6]:
# Load a pretrained GPT-2 tokenizer from the Hugging Face Transformers library.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Batching sequences of different lengths needs a padding token. If the
# tokenizer does not define one, reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Maximum number of token IDs per training sample.
block_size = 128
# Every entry is one chunk: a list of at most `block_size` token IDs.
tokenized_blocks = []

# tqdm shows a progress bar while the documents are tokenized.
for doc in tqdm(cleaned, desc="Tokenizing"):
    # Encode without special tokens, then mark the end of the document with EOS.
    ids = tokenizer.encode(doc, add_special_tokens=False) + [tokenizer.eos_token_id]

    # Cut the document into consecutive, non-overlapping chunks; only the last
    # chunk of a document can be shorter than block_size.
    tokenized_blocks.extend(
        ids[start:start + block_size]
        for start in range(0, len(ids), block_size)
    )

print(f"✅ Created {len(tokenized_blocks)} tokenized chunks")

Tokenizing:   0%|                                      | 0/1985 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8144 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing: 100%|██████████████████████████| 1985/1985 [00:08<00:00, 229.16it/s]

✅ Created 49302 tokenized chunks





# Custom PyTorch DataLoader

In [7]:
# Define a simple PyTorch Dataset to wrap tokenized text data
# Each item in the dataset corresponds to one tokenized block (list of token IDs).
class TextDataset(Dataset):
    """Map-style dataset over pre-tokenized blocks.

    Each sample corresponds to one block (a list of token IDs) and is returned
    as a dict with two independent LongTensors, "input_ids" and "labels",
    holding the same values.
    """

    def __init__(self, data):
        # Keep a reference to the sequence of tokenized blocks.
        self.data = data

    def __len__(self):
        # Number of samples equals the number of blocks.
        return len(self.data)

    def __getitem__(self, idx):
        # Convert the selected block to a tensor of 64-bit integer IDs.
        token_ids = torch.tensor(self.data[idx], dtype=torch.long)
        sample = {"input_ids": token_ids}
        # Labels are a separate copy so that modifying one does not affect the other.
        sample["labels"] = token_ids.clone()
        return sample

# Define a custom collate function to handle variable-length sequences in each batch
def collate_fn(batch, pad_token_id=None):
    """Pad the variable-length sequences of a batch to a common length.

    Args:
        batch: List of samples, each a dict with a 1-D LongTensor under
            "input_ids".
        pad_token_id: Value used to right-pad "input_ids". Defaults to
            `tokenizer.pad_token_id`, looked up at call time.

    Returns:
        A dict of (batch, max_len) LongTensors:
          - "input_ids": sequences right-padded with `pad_token_id`.
          - "labels": copy of "input_ids" with padded positions set to -100.
          - "attention_mask": 1 for real tokens, 0 for padding.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    # Extract the 'input_ids' tensors from all samples in this batch.
    ids = [b["input_ids"] for b in batch]
    padded = pad_sequence(ids, batch_first=True, padding_value=pad_token_id)

    # Build the mask from the sequence lengths rather than from token values:
    # the pad token is the same as EOS, so a value comparison would also hide
    # the genuine end-of-document tokens.
    attention_mask = pad_sequence(
        [torch.ones_like(seq) for seq in ids], batch_first=True, padding_value=0
    )

    # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
    # padded positions do not contribute to the training loss.
    labels = padded.masked_fill(attention_mask == 0, -100)

    return {"input_ids": padded, "labels": labels, "attention_mask": attention_mask}

# Seed for the shuffling order, so that the previewed batches (and any batch
# saved later from this loader) are the same after Restart & Run All.
SEED = 42

# Wrap the tokenized blocks in the custom Dataset.
dataset = TextDataset(tokenized_blocks)

# Create a DataLoader to generate mini-batches for training. Passing an
# explicitly seeded generator makes the shuffle reproducible.
loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
    generator=torch.Generator().manual_seed(SEED),
)

# Iterate through the DataLoader to visualize the batch shapes.
for i, batch in enumerate(loader):
    # Print the shape of 'input_ids' for each batch.
    print(f"Batch {i} shapes: {batch['input_ids'].shape}")
    # Stop after the first six batches (indices 0-5).
    if i == 5:
        break


Batch 0 shapes: torch.Size([4, 128])
Batch 1 shapes: torch.Size([4, 128])
Batch 2 shapes: torch.Size([4, 128])
Batch 3 shapes: torch.Size([4, 128])
Batch 4 shapes: torch.Size([4, 128])
Batch 5 shapes: torch.Size([4, 128])


# Save sample processed data

In [8]:
# Take the first batch the loader yields (if there is one) and write it to
# disk as a sample of the processed data.
sample_batch = next(iter(loader), None)
if sample_batch is not None:
    torch.save(sample_batch, "sample_dataset.pt")
    print("✅ Saved sample batch to sample_dataset.pt")

print("🎉 All steps completed successfully (streaming version ready for submission)!")

✅ Saved sample batch to sample_dataset.pt
🎉 All steps completed successfully (streaming version ready for submission)!
