# Playground for
* Dataset
* Dataloader


## Dataset
  Preprocesses the entire text into training pairs in `__init__`; the `__getitem__` override (used by `dataset[i]`) just retrieves them by index.

  Process:
  1. Tokenize text → convert to token IDs
  2. Sliding window → extract sequences of context_length + 1
  3. Split each sequence → input `[i : i+context_length]` and target `[i+1 : i+context_length+1]` (shifted by 1)
  4. Store as pairs → `self.input_ids[i]` and `self.target_ids[i]` as a training pair
  5. Advance the window start by `stride` tokens and repeat

  Example:
  sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # context_length 3, stride 5 

  # Split into:
  input_ids  = [1, 2, 3]   # predict next token at each position
  target_ids = [2, 3, 4]   # what should be predicted
  # Jump ahead by stride (5 tokens):
  input_ids  = [6, 7, 8]   # predict next token at each position
  target_ids = [7, 8, 9]   # what should be predicted
  

  Training pairs at same index:
  - input_ids[0] → predict → target_ids[0]
  - input_ids[1] → predict → target_ids[1]
  - etc.

  This preprocessing in __init__ makes __getitem__ very fast since it just returns pre-computed pairs.
  The model learns to predict the next token at each position in the sequence.

In [114]:
import tiktoken # converts token into IDs and back. Tiktoken is a library openai we use sheme "gpt2"
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Seed PyTorch's global random number generator so repeated runs of this
# notebook produce the same random values.
torch.manual_seed(42)

class GPTDataset(Dataset):
    """Next-token-prediction pairs cut from one text with a sliding window.

    The text is tokenized once in ``__init__``. Every window of
    ``context_length`` tokens becomes an input tensor, and the same window
    shifted one token to the right becomes its target tensor. Consecutive
    windows start ``stride`` tokens apart.
    """

    def __init__(self, txt, tokenizer, context_length, stride):
        """Tokenize ``txt`` and precompute all (input, target) tensors.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a list of token IDs.
            context_length: Number of tokens in each input/target sequence.
            stride: Distance in tokens between the starts of two windows.
        """
        ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

        # A window needs context_length + 1 tokens (input plus the shifted
        # target), so the last usable start is len(ids) - context_length - 1.
        window_starts = range(0, len(ids) - context_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + context_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + context_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of precomputed training pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


## Dataloader
Wrapper function for initialization and data loader creation
* initializes tokenizer with model name (gpt2)
* creates GPTDataset with text
* groups the samples into batches (default batch size 4)

In [115]:
def create_dataloader(txt, tokenizer_model_name="gpt2", batch_size=4, context_length=256, stride=128, shuffle=True):
    """Build a ``DataLoader`` over sliding-window training pairs of ``txt``.

    Args:
        txt: Raw text to turn into training pairs.
        tokenizer_model_name: Encoding name passed to ``tiktoken.get_encoding``.
        batch_size: Number of (input, target) pairs per batch.
        context_length: Number of tokens per input/target sequence.
        stride: Distance in tokens between the starts of two windows.
        shuffle: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``txt``.
    """
    encoding = tiktoken.get_encoding(tokenizer_model_name)
    return DataLoader(
        GPTDataset(txt, encoding, context_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
    )

## Token Embeddings
Returns input_embeddings from token_ids sequences (demonstration implementation)
* This demo uses a vocabulary size of 50252 token IDs (note: the actual GPT-2 `gpt2` encoding defines 50257 tokens)
* Every ID has a stored embedding vector of length 756 (note: GPT-2 small uses 768)
* Creates embedding layers for vocabulary and embedding dimensions (50252 x 756)
* Positional embeddings come from a second embedding layer sized context_length x embedding dimension (4 x 756)

In [116]:
class Embedder(nn.Module):
    """Map token IDs to input embeddings (token embedding + position embedding).

    Args:
        vocab_size: Number of rows in the token embedding table; every token
            ID passed to ``forward`` must be smaller than this.
        context_length: Maximum supported sequence length; number of rows in
            the positional embedding table.
        embedding_dim: Length of each embedding vector.
        verbose: When true, print the table sizes on construction and the
            intermediate tensors on every forward pass.
    """

    def __init__(self, vocab_size, context_length, embedding_dim, verbose=False):
        super().__init__()
        self.verbose = verbose

        # Position indices, e.g. tensor([0, 1, 2, 3]) for context_length 4.
        # Registered as a buffer so they follow the module across
        # .to(device) / .cuda(); non-persistent so state_dict() still holds
        # only the two embedding tables.
        self.register_buffer("positions", torch.arange(context_length), persistent=False)
        self.token_embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding_layer = nn.Embedding(context_length, embedding_dim)

        if self.verbose:
            print('\nInit test embedder ...')
            print(f'Generating token_embeddings ({vocab_size} x {embedding_dim})')
            print(f'Generating pos_embeddings ({context_length} x {embedding_dim})\n')

    def forward(self, x):
        """Return token embeddings of ``x`` plus positional embeddings.

        Args:
            x: Integer tensor of token IDs; its last dimension is treated as
                the sequence (context) dimension.

        Returns:
            Tensor with the shape of ``x`` plus a trailing ``embedding_dim``
            dimension.

        Raises:
            ValueError: If the sequence is longer than the ``context_length``
                the module was built with.
        """
        con_len = x.shape[-1]
        max_len = self.positions.shape[0]

        # Without this check an over-long input fails with an opaque
        # broadcasting error, or silently broadcasts when context_length == 1.
        if con_len > max_len:
            raise ValueError(
                f"sequence length {con_len} exceeds context_length {max_len}"
            )

        embeddings = self.token_embedding_layer(x)
        # Only the first con_len positions are needed for shorter sequences.
        pos_embeddings = self.pos_embedding_layer(self.positions[:con_len])
        input_embeddings = embeddings + pos_embeddings

        if self.verbose:
            print(f'\nembeddings[0] for x ({con_len} x {embeddings.shape[-1]}):\n', embeddings[0])
            print(f'\npos_embeddings[0] ({con_len} x {pos_embeddings.shape[-1]}):\n', pos_embeddings)
            print("\ninput_embeddings[0] = embeddings[0] + pos_embeddings[0]:\n", input_embeddings[0])
            print("\nShape for input_embeddings: batch, context, embedding_dim ", input_embeddings.shape)

        # Reuse the sum computed above instead of adding the tensors again.
        return input_embeddings

## Generates test run with test data

In [None]:
def get_test_input_embedding(vocab_size=50252, embedding_dim=756, batch_size=8, context_length=4, stride=4, verbose=False):
    """Embed one batch taken from the bundled sample text.

    Reads "Robins Small Text Sample.txt", builds a dataloader and an
    ``Embedder`` from the given sizes, and runs the first batch yielded by
    the dataloader through the embedder.

    Args:
        vocab_size: Vocabulary size passed to ``Embedder``.
        embedding_dim: Embedding vector length passed to ``Embedder``.
        batch_size: Number of sequences per batch.
        context_length: Number of tokens per sequence.
        stride: Distance in tokens between the starts of two windows.
        verbose: When true, print the first input/target row of the batch
            (as IDs and decoded text) and enable the embedder's own output.

    Returns:
        The embedder output for the batch inputs.
    """
    with open("Robins Small Text Sample.txt", "r", encoding="utf-8") as sample_file:
        sample_text = sample_file.read()

    loader = create_dataloader(sample_text, "gpt2", batch_size, context_length, stride)
    embedder = Embedder(vocab_size, context_length, embedding_dim, verbose=verbose)

    # Pull a single batch of (inputs, targets) from the dataloader.
    inputs, targets = next(iter(loader))

    if verbose:
        # The tokenizer is only needed to decode IDs for the debug output.
        decoder = tiktoken.get_encoding("gpt2")
        print("Displaying first row of batch")
        print("\nFirst batch elements Input x:\n", inputs[0], decoder.decode(inputs[0].tolist()))
        print("\nFirst batch elements Target y:\n", targets[0], decoder.decode(targets[0].tolist()))

    return embedder(inputs)

# Run the demo once with the default arguments (verbose=False) and keep the
# resulting embeddings; pass verbose=True to print the batch and tensors.
_test_run = get_test_input_embedding()


Init test embedder ...
Generating token_embeddings (50252 x 756)
Generating pos_embeddings (4 x 756)

Displaying first row of batch

First batch elements Input x:
 tensor([1119, 3706,  262, 2126])  They named the idea

First batch elements Target y:
 tensor([3706,  262, 2126,  564])  named the idea �

embeddings[0] for x (4 x 756):
 tensor([[-1.6556, -0.2670,  1.9718,  ..., -1.8361,  1.9735, -0.9703],
        [-0.5782, -0.3741, -1.6356,  ...,  0.5111,  0.0787,  1.2359],
        [ 0.1340, -0.6718,  0.3508,  ...,  0.9140, -0.9966,  0.4846],
        [-0.2394, -0.3632,  0.1255,  ..., -0.8219,  0.7632, -0.1688]],
       grad_fn=<SelectBackward0>)

pos_embeddings[0] (4 x 756):
 tensor([[ 0.5709, -1.2685,  1.2267,  ..., -0.0676, -0.6672, -0.0866],
        [ 1.5393, -1.5907,  0.1437,  ...,  0.5361, -1.4736, -0.8745],
        [-0.3313, -0.0134,  0.2153,  ...,  0.0583,  0.0872, -1.6946],
        [-1.7298, -0.0249, -2.2558,  ...,  1.6201, -0.2423, -0.0243]],
       grad_fn=<EmbeddingBackward0>)
