# Playground for
* Dataset
* Dataloader


## Dataset
  Preprocesses the entire text into training pairs in `__init__`; the `__getitem__` override (used by `dataset[i]`) just retrieves them by index.

  Process:
  1. Tokenize text → convert to token IDs
  2. Sliding window → extract sequences of context_length + 1 tokens, starting at offset `i`
  3. Split each sequence → input `[i:i+context_length]` and target `[i+1:i+context_length+1]` (shifted by 1)
  4. Store as pairs → `self.input_ids[k]` and `self.target_ids[k]` form the k-th training pair
  5. Advance the window start `i` by `stride` tokens and repeat

  Example:
  sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # context_length 3, stride 5 

  # Split into:
  input_ids  = [1, 2, 3]   # predict next token at each position
  target_ids = [2, 3, 4]   # what should be predicted
  # Jump ahead by stride (5 tokens):
  input_ids  = [6, 7, 8]   # predict next token at each position
  target_ids = [7, 8, 9]   # what should be predicted
  

  Training pairs at same index:
  - input_ids[0] → predict → target_ids[0]
  - input_ids[1] → predict → target_ids[1]
  - etc.

  This preprocessing in __init__ makes __getitem__ very fast since it just returns pre-computed pairs.
  The model learns to predict the next token at each position in the sequence.

In [17]:
import tiktoken # OpenAI's tokenizer library: converts text into token IDs and back; this file uses the "gpt2" encoding
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Abbreviate printed tensors: summarize anything above 10 elements, showing 3 items at each edge.
torch.set_printoptions(threshold=10, edgeitems=3)
# Seed torch's global RNG so random weight initialisation and shuffling are repeatable between runs.
torch.manual_seed(42)

class GPTDataset(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once in ``__init__``. Every window start yields an
    input tensor of ``context_length`` token IDs and a target tensor holding
    the same window shifted one token to the right, so ``__getitem__`` only
    has to look up pre-computed tensors.
    """

    def __init__(self, txt, tokenizer, context_length, stride):
        """Tokenize ``txt`` and pre-compute all (input, target) pairs.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token IDs.
            context_length: Number of tokens in each input/target chunk.
            stride: Distance between the starts of consecutive windows.
        """
        ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

        # The last usable start must leave room for the target, which
        # reaches one token further than the input.
        starts = range(0, len(ids) - context_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + context_length])
            for start in starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + context_length + 1])
            for start in starts
        ]

    def __len__(self):
        """Return the number of stored training pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


## Dataloader
Wrapper function for initialization and data loader creation
* initializes tokenizer with model name (gpt2)
* creates GPTDataset with text
* groups the training pairs into batches (default size 4)

In [18]:
def create_dataloader(txt, tokenizer_model_name="gpt2", batch_size=4, context_length=768, stride=128, shuffle=True):
    """Build a ``DataLoader`` over the sliding-window token pairs of ``txt``.

    Args:
        txt: Raw text to turn into training pairs.
        tokenizer_model_name: Encoding name passed to ``tiktoken.get_encoding``.
        batch_size: Number of (input, target) pairs per batch.
        context_length: Number of tokens per input/target chunk.
        stride: Distance between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader`` as its ``shuffle`` argument.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``txt``.
    """
    encoding = tiktoken.get_encoding(tokenizer_model_name)
    windows = GPTDataset(txt, encoding, context_length=context_length, stride=stride)
    return DataLoader(windows, shuffle=shuffle, batch_size=batch_size)

## Token Embeddings
Returns input_embeddings from token_ids sequences (demonstration implementation)
* GPT-2's tokenizer defines 50257 token IDs; this demo sizes its embedding table with `vocab_size=50252`
* Every ID has a stored embedding vector of length 768
* Creates embedding layers for vocabulary and embedding dimensions (50252 x 768)
* Positional encoding from embedding layer with context_length and embedding dimension (4 x 768)

In [19]:
## This embedder is only for demonstration and test data generation. Not used in the final GPT Model

class Embedder(nn.Module):
    """Demo module that adds learned token and positional embeddings.

    Each token ID is looked up in ``token_embedding_layer`` and each position
    index (0 .. sequence length - 1) in ``pos_embedding_layer``; the two
    results are summed. Only ``context_length`` positions exist, so inputs
    whose last dimension is longer than ``context_length`` are not supported.
    """

    def __init__(self, vocab_size, context_length, embedding_dim, verbose=False):
        """Create the token and positional embedding tables.

        Args:
            vocab_size: Number of rows in the token embedding table.
            context_length: Number of positions (rows of the positional table).
            embedding_dim: Length of every embedding vector.
            verbose: When true, print the configured sizes.
        """
        super().__init__()

        # Registered as a buffer so the index tensor follows the module through
        # .to(device)/.cuda(); non-persistent so state_dict keys stay unchanged.
        # tensor([0, 1, 2, 3]) for context_length 4
        self.register_buffer("positions", torch.arange(context_length), persistent=False)
        self.token_embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding_layer = nn.Embedding(context_length, embedding_dim)

        if verbose:
            print(f"\n=== Embedder Initialization ===")
            print(f'    vocab_size = ', vocab_size)
            print(f'    context_length = ', context_length)
            print(f'    embedding_dim = ', embedding_dim)
            print(f'    Generating token_embeddings ({vocab_size} x {embedding_dim})')
            print(f'    Generating pos_embeddings ({context_length} x {embedding_dim})')
            print(f"=== End Initialization ===\n")

    def forward(self, x, verbose=False):
        """Return token embeddings plus positional embeddings for ``x``.

        Args:
            x: Integer tensor of token IDs; its last dimension is the
                sequence length. The verbose output indexes ``[0]`` and so
                assumes a leading batch dimension.
            verbose: When true, print intermediate tensors for inspection.

        Returns:
            Tensor with the shape of ``x`` plus a trailing ``embedding_dim``
            axis.
        """
        con_len = x.shape[-1]

        embeddings = self.token_embedding_layer(x)
        # Positional rows broadcast over any leading (batch) dimensions.
        pos_embeddings = self.pos_embedding_layer(self.positions[:con_len])
        input_embeddings = embeddings + pos_embeddings

        if verbose:
            print(f"\n=== Embedder Forward Pass ===")
            print(f'\nembeddings[0] for x ({con_len} x {embeddings.shape[-1]}):\n', embeddings[0])
            print(f'\npos_embeddings[0] ({con_len} x {pos_embeddings.shape[-1]}):\n', pos_embeddings)
            print("\ninput_embeddings[0] = embeddings[0] + pos_embeddings[0]:\n", input_embeddings[0])
            print("\nShape for input_embeddings: batch, context, embedding_dim ", input_embeddings.shape)
            print(f"=== End Forward Pass ===\n")

        # Reuse the sum computed above instead of adding the tensors again.
        return input_embeddings

## Generates test run with test data

In [20]:
def get_test_input_embedding(vocab_size=50252, embedding_dim=768, batch_size=8, context_length=4, stride=4, verbose=False, text_path="00. Robins Small Text Sample.txt"):
    """Embed the first batch of a sample text file for demonstration/testing.

    Reads the text, builds a dataloader with the "gpt2" encoding, takes the
    first batch and runs its inputs through a freshly initialised ``Embedder``.

    NOTE(review): tiktoken's "gpt2" encoding is documented with 50257 token
    IDs; with the default ``vocab_size`` of 50252 any ID >= 50252 (for example
    ``<|endoftext|>``) would fall outside the embedding table. Confirm before
    using texts that can contain such tokens.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Length of each embedding vector.
        batch_size: Number of sequences in the batch.
        context_length: Number of tokens per sequence.
        stride: Distance between the starts of consecutive windows.
        verbose: When true, print the first input/target row (IDs and decoded
            text) and the embedder's debug output.
        text_path: Path of the UTF-8 text file to read; defaults to the
            sample file this function originally hard-coded.

    Returns:
        The embedder output for the first batch's inputs.
    """
    with open(text_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    dataloader = create_dataloader(raw_text, "gpt2", batch_size, context_length, stride)
    embedder = Embedder(vocab_size, context_length, embedding_dim, verbose=verbose)

    # Only the first batch is needed; y is shown in the verbose output only.
    x, y = next(iter(dataloader))

    if verbose:
        # The tokenizer is needed solely to decode IDs for the debug output.
        tokenizer = tiktoken.get_encoding("gpt2")
        print("Displaying first row of batch")
        print("\nFirst batch elements Input x:\n", x[0], tokenizer.decode(x[0].tolist()))
        print("\nFirst batch elements Target y:\n", y[0], tokenizer.decode(y[0].tolist()))

    return embedder(x, verbose=verbose)

# Run the demo only where `__file__` is undefined (e.g. a notebook cell),
# so executing this code from a regular .py file skips it.
if '__file__' not in dir():
    _test_run = get_test_input_embedding(verbose=True)


=== Embedder Initialization ===
    vocab_size =  50252
    context_length =  4
    embedding_dim =  768
    Generating token_embeddings (50252 x 768)
    Generating pos_embeddings (4 x 768)
=== End Initialization ===

Displaying first row of batch

First batch elements Input x:
 tensor([15424,   373,   257,  5909])  archive was a vast

First batch elements Target y:
 tensor([  373,   257,  5909, 16099])  was a vast repository

=== Embedder Forward Pass ===

embeddings[0] for x (4 x 768):
 tensor([[-0.9710, -0.7524, -0.8731,  ...,  0.7471, -0.9052, -0.2762],
        [-1.9231, -0.6952, -1.9170,  ..., -1.5696, -0.5434,  0.5664],
        [-0.4960, -1.1091, -0.4747,  ...,  0.8321,  0.0589,  1.5222],
        [-0.7470, -0.4200, -0.0747,  ...,  1.1139,  0.2141, -0.1558]],
       grad_fn=<SelectBackward0>)

pos_embeddings[0] (4 x 768):
 tensor([[ 0.6610, -1.4272,  2.4605,  ...,  1.2418, -1.1110,  1.0747],
        [-1.3963, -0.0800,  1.0716,  ..., -0.6346,  0.0893,  0.6827],
        [-0.2487, 