In [None]:
import numpy as np
import pandas as pd
import tiktoken as tk
import torch
import torch.nn as nn
import torch.nn.functional as F

### FeedForward Block

- **Input:** Tensor of shape `(B, T, d_model)`  
  - `B` = batch size  
  - `T` = sequence length  
  - `d_model` = embedding / hidden dimension of the model  

- **Operation:**
  1. Linear projection from `d_model` → `d_ff` (`fc1`)  
  2. Nonlinearity using GELU  
  3. Linear projection back from `d_ff` → `d_model` (`fc2`)  
  4. Dropout applied for regularization  

- **Output:** Tensor of same shape `(B, T, d_model)`  
  - The residual connection is applied outside this block (in `TransformerBlock`), so the output can be added directly to that block's input


In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward sub-layer: Linear -> GELU -> Linear -> Dropout.

    Args:
        d_model: Size of the last dimension of both the input and the output.
        d_ff: Size of the hidden (expanded) dimension between the two linears.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Sub-modules are registered in the order fc1, fc2, dropout.
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Transform ``x`` of shape (B, T, d_model) into a tensor of the same shape."""
        hidden = F.gelu(self.fc1(x))      # (B, T, d_ff): expand, then GELU nonlinearity
        projected = self.fc2(hidden)      # (B, T, d_model): project back down
        return self.dropout(projected)    # regularisation; identity in eval mode

### TransformerBlock

- **Input:** Tensor of shape `(B, T, d_model)`  
  - `B` = batch size  
  - `T` = sequence length  
  - `d_model` = embedding / hidden dimension  

- **Components:**
  1. **LayerNorm (`ln1`)** before Multi-Head Attention  
  2. **Multi-Head Attention (`mha`)**  
     - Takes normalized input  
     - Output has shape `(B, T, d_model)`  
     - Residual connection added: `x = x + mha(...)`  
  3. **LayerNorm (`ln2`)** before FeedForward  
  4. **FeedForward (`ffn`)**  
     - Output shape `(B, T, d_model)`  
     - Residual connection added: `x = x + ffn(ln2(x))`

- **Output:** Tensor of shape `(B, T, d_model)`  
  - Can be stacked in `nn.ModuleList` to build a full Transformer  


In [None]:
class TransformerBlock(nn.Module):
    """Attention + feed-forward block with LayerNorm applied before each sub-layer.

    Each sub-layer receives the layer-normalised input, and its output is added
    back onto the un-normalised input (residual connection).

    Args:
        d_model: Feature dimension; passed to both LayerNorms, the attention
            module and the feed-forward module.
        num_heads: Passed to ``MultiHeadAttentionNew``.
        d_ff: Hidden dimension passed to ``FeedForward``.
        dropout: Dropout probability forwarded to ``FeedForward``. Defaults to
            0.1 (``FeedForward``'s own default), so existing three-argument
            calls behave exactly as before.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.ln1 = nn.LayerNorm(d_model)  # normalises the attention input
        self.ln2 = nn.LayerNorm(d_model)  # normalises the feed-forward input

        # MultiHeadAttentionNew is defined elsewhere in the notebook and must be
        # in scope before this class is instantiated.
        # NOTE(review): its signature is not visible here, so `dropout` is not
        # forwarded to it — confirm whether it takes its own dropout argument.
        self.mha = MultiHeadAttentionNew(d_model, num_heads)

        # Previously the dropout rate could not be set from here and was always
        # FeedForward's default; it is now configurable per block.
        self.ffn = FeedForward(d_model, d_ff, dropout=dropout)

    def forward(self, x, mask=None):
        """Run attention then feed-forward, each wrapped in a residual connection.

        Args:
            x: Input tensor; per the notebook text, shape (B, T, d_model).
            mask: Optional mask handed unchanged to the attention module.
                NOTE(review): its expected shape/semantics are defined by
                ``MultiHeadAttentionNew`` — confirm there.

        Returns:
            The result of both residual sub-layers applied to ``x``.
        """
        # Attention sub-layer: normalise, attend, add back onto the input.
        x = x + self.mha(self.ln1(x), mask=mask)

        # Feed-forward sub-layer: normalise, transform, add back onto the input.
        x = x + self.ffn(self.ln2(x))

        return x