In [None]:
import numpy as np
import pandas as pd
import tiktoken as tk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Transformer

- **Input:** Tensor of shape `(B, T)`  
  - `B` = batch size  
  - `T` = sequence length  
  - Each element is a token ID  

- **Components:**
  1. **EmbeddingLayer (`embed`)**  
     - Converts token IDs into embeddings `(B, T, d_model)`  
     - Adds positional encodings  
  2. **Stack of `TransformerBlock`s (`blocks`)**  
     - Each block outputs `(B, T, d_model)`  
     - Includes Multi-Head Attention + FeedForward with residuals  
  3. **Final LayerNorm (`ln_final`)**  
     - Normalizes hidden states across the last (`d_model`) dimension  
  4. **Output Linear Layer (`head`)**  
     - Maps the normalized hidden states to vocabulary logits `(B, T, vocab_size)`  

- **Output:** Tensor of shape `(B, T, vocab_size)`  
  - Can be used for next-token prediction or language modeling  


In [None]:
class Transformer(nn.Module):
    """Stack of Transformer blocks that turns token IDs into vocabulary logits.

    The forward pass runs, in order: ``embed`` -> every ``TransformerBlock`` in
    ``blocks`` -> ``ln_final`` -> ``head``.

    Args:
        vocab_size: Vocabulary size; passed to ``EmbeddingLayer`` and used as
            the output width of ``head``.
        d_model: Hidden width shared by the embedding layer, every block, the
            final LayerNorm, and the input side of ``head``.
        num_heads: Forwarded unchanged to each ``TransformerBlock``.
        d_ff: Forwarded unchanged to each ``TransformerBlock``.
        num_layers: Number of ``TransformerBlock`` instances in the stack.
        max_len: Forwarded to ``EmbeddingLayer`` as ``max_len``. Defaults to
            512, the value that used to be hard-coded here.
            NOTE(review): presumably the longest sequence the positional
            encoding supports -- confirm against ``EmbeddingLayer``.
    """

    def __init__(self, vocab_size, d_model=256, num_heads=8, d_ff=1024, num_layers=7,
                 max_len=512):
        super().__init__()

        # Kept on the instance so callers can read back the configured value.
        self.max_len = max_len

        self.embed = EmbeddingLayer(vocab_size, d_model, max_len=max_len)

        # nn.ModuleList (rather than a plain list) registers each block as a
        # submodule, so their parameters appear in .parameters()/.state_dict().
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff)
            for _ in range(num_layers)
        ])

        # Normalizes over the trailing dimension of size d_model.
        self.ln_final = nn.LayerNorm(d_model)

        # Bias-free projection from d_model to vocab_size (output logits).
        self.head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, x, mask=None):
        """Run the full model and return the output of ``head``.

        Args:
            x: Input handed to ``embed``. The accompanying notebook text
                describes it as token IDs of shape ``(B, T)``.
            mask: Optional value passed unchanged to every block as ``mask=``.
                Its expected shape and meaning are defined by
                ``TransformerBlock`` (not visible here) -- TODO confirm.

        Returns:
            The result of ``head`` applied to the layer-normalized hidden
            states; described in the notebook text as logits of shape
            ``(B, T, vocab_size)``.
        """
        x = self.embed(x)

        # The same mask is reused for every block in the stack.
        for block in self.blocks:
            x = block(x, mask=mask)

        x = self.ln_final(x)

        return self.head(x)
