# Playground for GPT_Model

## Putting everything together

In [1]:
import torch
import torch.nn as nn

# Keep printed tensors short: tensors with more than `threshold` elements are
# summarized, showing `edgeitems` entries at each end of every dimension.
torch.set_printoptions(threshold=10, edgeitems=3)
# Fixed seed so randomly initialized weights (and the printed values) repeat
# from run to run.
torch.manual_seed(42)

# See these files for details
# NOTE(review): GPTModel below uses `TransformerBlock` and `LayerNorm`, neither
# of which is defined in this notebook -- presumably they are brought into the
# namespace by these %run calls. The relative paths assume the kernel's working
# directory is the folder containing those notebooks.
%run "03. Normalization.ipynb"
%run "05. TransformerBlock.ipynb"

# Self-assignments: no-ops when the names exist, NameError when they do not.
# NOTE(review): presumably here to fail fast if the %run includes did not
# define these names and/or to quiet "undefined name" linter warnings -- confirm.
MultiHeadAttention = MultiHeadAttention
LayerNorm = LayerNorm

## GPT Model
* 12 Transformer blocks (`n_layers`)

In [2]:
class GPTModel(nn.Module):
    """GPT-style language model assembled from the notebook's building blocks.

    Pipeline (see ``forward``): token embedding + positional embedding ->
    dropout -> ``n_layers`` x ``TransformerBlock`` -> ``LayerNorm`` ->
    bias-free linear head projecting back to the vocabulary.

    ``TransformerBlock`` and ``LayerNorm`` are not defined in this notebook;
    they must already exist in the namespace when the class is instantiated.

    Args:
        cfg: Mapping with at least the keys ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``. The same
            mapping is forwarded unchanged to every ``TransformerBlock``,
            which may read additional keys.
        verbose: When true, print a summary of the layers being created.
    """

    def __init__(self, cfg, verbose = False):
        super().__init__()

        # NOTE: the attribute spellings `embbed_dim` and `drob_emb` are kept
        # as-is; renaming them would change the object's public attributes.
        self.vocab_size = cfg["vocab_size"]
        self.embbed_dim = cfg["emb_dim"]
        self.context_length = cfg["context_length"]
        self.drop_rate = cfg["drop_rate"]
        self.n_layers = cfg["n_layers"]

        # Lookup tables: token ID -> vector, and position index -> vector.
        self.tok_emb = nn.Embedding(self.vocab_size, self.embbed_dim)
        self.pos_emb = nn.Embedding(self.context_length, self.embbed_dim)
        self.drob_emb = nn.Dropout(self.drop_rate)

        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg=cfg) for _ in range(self.n_layers)])

        self.final_norm = LayerNorm(self.embbed_dim)
        self.out_head = nn.Linear(self.embbed_dim, self.vocab_size, bias=False)

        if verbose:
            print("\n=== GPTModel Initialization ===")
            print("    vocab_size =", self.vocab_size)
            print("    embbed_dim =", self.embbed_dim)
            print("    context_length =", self.context_length)
            print("    drop_rate =", self.drop_rate)
            print("    n_layers =", self.n_layers, " (number of Tranformer blocks)")
            print(f"    Generating nn.Embedding({self.vocab_size}, {self.embbed_dim}) weights for tokenID to embedding projection")
            print(f"    Generating nn.Embedding({self.context_length}, {self.embbed_dim}) weights for positional encoding")
            print(f"    Generating nn.Dropout({self.drop_rate})")
            print(f"    Generating nn.Sequential(*[TransformerBlock(cfg=cfg) for _ in range({self.n_layers})])")
            print(f"    Generating LayerNorm({self.embbed_dim}) for final normalization")
            print(f"    Generating out_head nn.Linear({self.embbed_dim}, {self.vocab_size}, bias=False) for final output generation")
            print("=== END GPTModel Initialization ===\n")


    def forward(self, token_ids, verbose=False):
        """Map a batch of token IDs to next-token logits.

        Args:
            token_ids: 2-D integer tensor ``(batch, seq_len)``. ``seq_len``
                must not exceed ``context_length``: positions ``0..seq_len-1``
                are looked up in a positional table with exactly
                ``context_length`` rows.
            verbose: When true, print intermediate tensors (first batch item
                only). Only the first transformer block receives the flag, so
                the trace is not repeated for every layer.

        Returns:
            The output of ``out_head``; ``(batch, seq_len, vocab_size)``
            provided the transformer blocks and the final norm preserve the
            shape of their input.
        """
        # Sequence length of *this* input (printed as "context_length" below).
        batch_size, seq_len = token_ids.shape

        if verbose:
            print("\n=== GPTModel Forward Pass ===")
            print(f"Input shape: {token_ids.shape} (batch_size={batch_size}, context_length={seq_len})")

        tok_emb = self.tok_emb(token_ids)
        # Position indices are created on the input's device so the lookup
        # also works when the model lives on an accelerator.
        positions = torch.arange(seq_len, device=token_ids.device)
        pos_emb = self.pos_emb(positions)
        # pos_emb has no batch dimension; broadcasting adds it to every item.
        x = tok_emb + pos_emb

        if verbose:
            print(f'\ntok_emb[0] for x ({seq_len} x {tok_emb.shape[-1]}):\n', tok_emb[0])
            # pos_emb is already 2-D (seq_len x emb_dim), so it is printed whole.
            print(f'\npos_emb[0] ({seq_len} x {pos_emb.shape[-1]}):\n', pos_emb)
            print("\nx[0] = tok_emb[0] + pos_emb[0]:\n", x[0])
            print("\nShape for input_embeddings: batch, context, embedding_dim ", x.shape)

        x = self.drob_emb(x)
        if verbose: print('\nDropout on embedding:\n', x[0])

        # Iterate instead of indexing block 0 directly so that an empty stack
        # (n_layers == 0) is handled; only the first block gets `verbose`.
        for layer_idx, block in enumerate(self.trf_blocks):
            if layer_idx == 0:
                x = block(x, verbose=verbose)
            else:
                x = block(x)
        if verbose: print('\nAfter trf_blocks (only shows output for first one):\n', x[0])

        x = self.final_norm(x)
        if verbose: print('\nAfter final_norm:\n', x[0])

        logits = self.out_head(x)
        # Show the logits themselves (previously the pre-head activations `x`
        # were printed under this label).
        if verbose: print('\nFinal logits from out_head:\n', logits[0])

        return logits





## Helper for context cropping and predicted token concatenation

In [3]:
def generate_tokens(model, token_IDs, max_new_tokens, context_size, verbose=False):
    """Greedily extend a batch of token-ID sequences, one token per step.

    Each step feeds the most recent ``context_size`` tokens to ``model``,
    takes the logits of the last position, picks the highest-scoring token
    (argmax -- no sampling) and appends it to the running sequence.

    Args:
        model: Callable invoked as ``model(token_ids, verbose)`` that returns
            logits of shape ``(batch, n_tokens, vocab_size)``.
        token_IDs: 2-D integer tensor ``(batch, n_tokens)`` with the prompt.
            It is not modified; a new tensor is returned.
        max_new_tokens: Number of tokens to append. ``0`` returns the input
            unchanged.
        context_size: Maximum number of trailing tokens passed to ``model``
            per step. Must be at least 1.
        verbose: Forwarded to ``model`` on the first step only, so the
            detailed trace is printed once rather than per generated token.

    Returns:
        Tensor ``(batch, n_tokens + max_new_tokens)``: the prompt followed by
        the generated tokens.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """
    # `-0:` would select the whole sequence and a negative size would drop
    # leading tokens, so reject those values instead of cropping incorrectly.
    if context_size < 1:
        raise ValueError(f"context_size must be >= 1, got {context_size}")

    for _ in range(max_new_tokens):

        # Keep only the trailing window the model can attend to
        token_ID_context = token_IDs[:, -context_size:]

        # Inference only: no gradients needed
        with torch.no_grad():
            logits = model(token_ID_context, verbose)

        # (batch, n_tokens, vocab_size) -> (batch, vocab_size): last position only
        last_logits = logits[:, -1, :]
        predicted_token = torch.argmax(last_logits, dim=-1, keepdim=True)  # (batch, 1)

        # Append the chosen token to the running sequence
        token_IDs = torch.cat((token_IDs, predicted_token), dim=1)  # (batch, n_tokens+1)

        if verbose: print("\nStopping output after first run!!\n")
        verbose = False # Stop output after first run

    return token_IDs

In [4]:
import tiktoken

def test_run(verbose = False):
    """Smoke test: build an untrained GPT-2-small-sized model and generate text.

    Encodes a fixed prompt with the tiktoken "gpt2" encoding, builds a
    GPTModel from the 124M configuration with a fixed seed, greedily
    generates 5 tokens via generate_tokens and prints the prompt, the
    resulting token IDs and the decoded text.

    Args:
        verbose: Forwarded to the GPTModel constructor and to
            generate_tokens to enable their trace output.

    Returns:
        None; the results are only printed.
    """
    # Seed before the model is built so its random weights are reproducible
    torch.manual_seed(42)

    GPT_CONFIG_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "emb_dim": 768,          # Embedding dimension
        "n_heads": 12,           # Number of attention heads
        "n_layers": 12,          # Number of layers
        "drop_rate": 0.1,        # Dropout rate
        "qkv_bias": False        # Query-Key-Value bias
    }

    # Prompt -> token IDs -> tensor with a leading batch dimension of 1
    start_context = "Hello world! I'm "
    tokenizer = tiktoken.get_encoding("gpt2")
    prompt_ids = tokenizer.encode(start_context)
    prompt_batch = torch.tensor(prompt_ids).unsqueeze(0)

    gpt = GPTModel(GPT_CONFIG_124M, verbose=verbose)
    gpt.eval() # Disable dropout etc.

    print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
    print("\nInput text:", start_context)
    print("Encoded input text:", prompt_ids)
    print("encoded_tensor.shape:", prompt_batch.shape)
    print(f"\n{50*'='}")

    generated = generate_tokens(
        model=gpt,
        token_IDs=prompt_batch,
        max_new_tokens=5,
        context_size=GPT_CONFIG_124M["context_length"],
        verbose = verbose
    )
    # Drop the batch dimension again before decoding back to text
    generated_ids = generated.squeeze(0).tolist()
    decoded_text = tokenizer.decode(generated_ids)

    print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}")
    print("\nOutput:", generated)
    print("Output length:", len(generated[0]))
    print("Output text:", decoded_text)

# Run the demo only when `__file__` is not defined in this namespace.
# NOTE(review): presumably this skips the demo when the notebook is pulled in
# from another notebook via %run (which sets `__file__`) -- confirm.
# test_run() returns nothing, so `_test_run` is always None.
if '__file__' not in dir():
    _test_run = test_run(verbose=True)


=== GPTModel Initialization ===
    vocab_size = 50257
    embbed_dim = 768
    context_length = 1024
    drop_rate = 0.1
    n_layers = 12  (number of Tranformer blocks)
    Generating nn.Embedding(50257, 768) weights for tokenID to embedding projection
    Generating nn.Embedding(1024, 768) weights for positional encoding
    Generating nn.Dropout(0.1)
    Generating nn.Sequential(*[TransformerBlock(cfg=cfg) for _ in range(12)])
    Generating LayerNorm(768) for final normalization
    Generating out_head nn.Linear(768, 50257, bias=False) for final output generation
=== END GPTModel Initialization ===


                      IN

Input text: Hello world! I'm 
Encoded input text: [15496, 995, 0, 314, 1101, 220]
encoded_tensor.shape: torch.Size([1, 6])


=== GPTModel Forward Pass ===
Input shape: torch.Size([1, 6]) (batch_size=1, context_length=6)

tok_emb[0] for x (6 x 768):
 tensor([[ 1.7279,  0.8710,  0.4013,  ..., -0.5229,  0.9800,  0.4260],
        [ 0.5697,  0.0478,  1.5667,  ...