# Playground for TransformerBlock

## Central Component of GPT Model

In [1]:
import torch
import torch.nn as nn

# Keep printed tensors short: summarize tensors with more than 10 elements,
# showing 3 items at each edge of a dimension.
torch.set_printoptions(threshold=10, edgeitems=3)
# Fix the RNG seed before anything random runs so results are repeatable.
torch.manual_seed(42)

# Execute the sibling notebooks in this kernel; see them for implementation details.
# They are expected to define MultiHeadAttention, LayerNorm and FeedForward,
# which the code below relies on (presumably one class per notebook — confirm).
%run "02. MultiHeadAttention.ipynb"
%run "03. Normalization.ipynb"
%run "04. FeedForward.ipynb"

# Self-assignments are runtime no-ops, but each one raises NameError right here
# if the corresponding %run above did not define the name.
# NOTE(review): presumably also meant to make the names visible to linters/IDEs — confirm.
MultiHeadAttention = MultiHeadAttention
LayerNorm = LayerNorm
FeedForward = FeedForward

In [2]:
class TransformerBlock(nn.Module):
    """One transformer block: an attention sub-layer, then a feed-forward sub-layer.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``: the input
    is normalized first and the un-normalized input is added back afterwards
    as a shortcut. One ``nn.Dropout`` instance is shared by both sub-layers.

    ``MultiHeadAttention``, ``LayerNorm`` and ``FeedForward`` are not defined
    in this class; they must already exist in the surrounding namespace.

    Args:
        cfg: Mapping providing the keys ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
        verbose: When true, print progress messages while constructing; the
            flag is also handed to the sub-module constructors.
    """

    def __init__(self, cfg, verbose=False):
        super().__init__()

        if verbose:
            print(f"\n=== Transformer Initialization ===")

        # Keep the hyper-parameters on the instance so they can be inspected later.
        self.embbed_dim = cfg["emb_dim"]
        self.context_length = cfg["context_length"]
        self.num_heads = cfg["n_heads"]
        self.dropout_rate = cfg["drop_rate"]
        self.qkv_bias = cfg["qkv_bias"]

        # Attention maps embbed_dim -> embbed_dim so its output can be added
        # to the shortcut in forward().
        attention_kwargs = dict(
            input_dim=self.embbed_dim,
            output_dim=self.embbed_dim,
            context_length=self.context_length,
            dropout=self.dropout_rate,
            num_heads=self.num_heads,
            qkv_bias=self.qkv_bias,
            verbose=verbose,
        )
        # Construction order is kept stable: it determines parameter
        # registration order and the order in which any random
        # initialization draws from the RNG.
        self.att = MultiHeadAttention(**attention_kwargs)
        self.ffn = FeedForward(self.embbed_dim, verbose=verbose)
        self.norm1 = LayerNorm(self.embbed_dim, verbose=verbose)
        self.norm2 = LayerNorm(self.embbed_dim, verbose=verbose)
        self.dropout = nn.Dropout(self.dropout_rate)

        if verbose:
            print("Dropout rate: ", self.dropout_rate)
            print(f"\n=== End Transformer Initialization ===")

    def forward(self, x, verbose=False):
        """Apply both sub-layers to ``x`` and return the result.

        Args:
            x: Tensor whose shape unpacks into exactly three dimensions,
                reported in verbose output as
                (batch_size, context_length, input_dim).
            verbose: When true, print the first batch element after every
                step; the flag is also forwarded to the attention and
                feed-forward sub-modules.

        Returns:
            The tensor produced by the second shortcut addition.

        Raises:
            ValueError: If ``x.shape`` does not have exactly three entries.
        """
        # The unpacking doubles as a rank check on the input.
        n_batch, n_tokens, n_features = x.shape

        if verbose:
            print(f"\n=== TransformerBlock Forward Pass ===")
            print(f"Input shape: {x.shape} (batch_size={n_batch}, context_length={n_tokens}, input_dim={n_features})")
            print(f"Config: num_heads={self.num_heads}, embbed_dim={self.embbed_dim}")
            print(f"\nInput tensor (=shortcut) (batch 0 with shape {x[0].shape}):")
            print(f"States for first batch ...\n {x[0]}")

        # ---- Sub-layer 1: normalize -> attention -> dropout -> add shortcut ----
        residual = x

        hidden = self.norm1(x)
        if verbose:
            print(f"\n1. Normalization 1:\n {hidden[0]}")

        hidden = self.att(hidden, verbose=verbose)
        if verbose:
            print(f"\n2. Attention:\n {hidden[0]}")

        hidden = self.dropout(hidden)
        if verbose:
            print(f"\n3. Dropout:\n {hidden[0]}")

        hidden = hidden + residual
        if verbose:
            print(f"\n4. Output + Shortcut (= new Shortcut):\n {hidden[0]}")

        # ---- Sub-layer 2: normalize -> feed-forward -> dropout -> add shortcut ----
        # The output of sub-layer 1 becomes the shortcut for sub-layer 2.
        residual = hidden

        hidden = self.norm2(hidden)
        if verbose:
            print(f"\n5. Normalization 2:\n {hidden[0]}")

        hidden = self.ffn(hidden, verbose=verbose)
        if verbose:
            print(f"\n6. FeedForward:\n {hidden[0]}")

        hidden = self.dropout(hidden)
        if verbose:
            print(f"\n7. Dropout:\n {hidden[0]}")

        hidden = hidden + residual
        if verbose:
            print(f"\n8. Output + new Shortcut:\n {hidden[0]}")
            print(f"\n===END TransformerBlock Forward Pass ===\n")

        return hidden


## Test Run

In [3]:
def use_transformer_block(verbose = False):

    GPT_CONFIG_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "emb_dim": 768,          # Embedding dimension
        "n_heads": 12,           # Number of attention heads
        "n_layers": 12,          # Number of layers
        "drop_rate": 0.1,        # Dropout rate
        "qkv_bias": False        # Query-Key-Value bias
    }

    %run "01. DataPreparation.ipynb"
    input = get_test_input_embedding(verbose=verbose)

    block = TransformerBlock(cfg = GPT_CONFIG_124M, verbose=verbose)
    y = block(input, verbose=True)
    
    if verbose:
        print("Output shape: ", y.shape)

# Run the demo only when no `__file__` name is visible in this namespace.
# NOTE(review): presumably `__file__` is defined while this notebook is executed
# via `%run` from another notebook, so the demo is skipped there — confirm.
if '__file__' not in dir(): _test_run = use_transformer_block(True)


=== Embedder Initialization ===
    vocab_size =  50252
    context_length =  4
    embedding_dim =  768
    Generating token_embeddings (50252 x 768)
    Generating pos_embeddings (4 x 768)
=== End Initialization ===

Displaying first row of batch

First batch elements Input x:
 tensor([15424,   373,   257,  5909])  archive was a vast

First batch elements Target y:
 tensor([  373,   257,  5909, 16099])  was a vast repository

=== Embedder Forward Pass ===

embeddings[0] for x (4 x 768):
 tensor([[-0.9710, -0.7524, -0.8731,  ...,  0.7471, -0.9052, -0.2762],
        [-1.9231, -0.6952, -1.9170,  ..., -1.5696, -0.5434,  0.5664],
        [-0.4960, -1.1091, -0.4747,  ...,  0.8321,  0.0589,  1.5222],
        [-0.7470, -0.4200, -0.0747,  ...,  1.1139,  0.2141, -0.1558]],
       grad_fn=<SelectBackward0>)

pos_embeddings[0] (4 x 768):
 tensor([[ 0.6610, -1.4272,  2.4605,  ...,  1.2418, -1.1110,  1.0747],
        [-1.3963, -0.0800,  1.0716,  ..., -0.6346,  0.0893,  0.6827],
        [-0.2487, 