In [28]:
GPT_2_CONFIGURATION_124M={
    "vocab_size":50257,
    "context_length":1024,
    "emb_dim":768,
    "n_heads":12,
    "n_layers":12,
    "dropout":0.1,
    "qkv_bias":False
}

# layer normalization

In [2]:
import torch.nn as nn

In [3]:
torch.manual_seed(123)
batch_ex=torch.randn(2,5)
layer=nn.Sequential(nn.Linear(5,6),nn.ReLU())
out=layer(batch_ex)
out


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [4]:
mean=out.mean(dim=-1,keepdim=True)
var=out.var(dim=-1,keepdim=True)

In [5]:
mean

tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)

In [6]:
var

tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)

In [7]:
norm_out=(out-mean)/(var**0.5)

In [8]:
norm_out

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)

# layer norm class

In [9]:
class LayerNorm(nn.Module):
    def __init__(self,emb_dim):
        super(LayerNorm, self).__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
        
    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        var=x.var(dim=-1,keepdim=True)
        norm_x=(x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x+self.shift
    

In [10]:
ln=LayerNorm(emb_dim=5)
out_ln=ln(batch_ex)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
out_ln

tensor([[ 0.4945,  0.9564, -0.0200,  0.2375, -1.6685],
        [ 0.8127, -1.2313, -0.8554,  1.0110,  0.2630]], grad_fn=<AddBackward0>)

In [12]:
out_ln.mean(dim=-1)

tensor([-1.4901e-08,  2.3842e-08], grad_fn=<MeanBackward1>)

# GeLU activation

In [13]:
import torch
import torch.nn as nn
import math

class GELU(nn.Module):
    def __init__(self):
        super().__init__()
        # Precompute constant √(2/π) as a buffer for efficiency
        self.sqrt_2_pi = math.sqrt(2.0 / math.pi)

    def forward(self, x):
        # Ensure dtype and device compatibility
        sqrt_2_pi = torch.tensor(self.sqrt_2_pi, dtype=x.dtype, device=x.device)
        return 0.5 * x * (1 + torch.tanh(sqrt_2_pi * (x + 0.044715 * torch.pow(x, 3))))


In [14]:
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers=nn.Sequential(nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
                                 GELU(),
                                 nn.Linear(4*cfg["emb_dim"],cfg["emb_dim"])
                                 )
        
    def forward(self,x):
        return self.layers(x)
        

In [15]:
ffn=FeedForward(GPT_2_CONFIGURATION_124M)

In [16]:
x=torch.randn(2,3,768)

In [17]:
ffn(x)

tensor([[[-0.3731, -0.2161,  0.1972,  ..., -0.2462,  0.0535,  0.2413],
         [ 0.0069,  0.0609,  0.3952,  ...,  0.1626, -0.0415, -0.1237],
         [ 0.1569, -0.1565, -0.0789,  ..., -0.3007,  0.2389, -0.1702]],

        [[ 0.2887,  0.0783,  0.1038,  ..., -0.2605, -0.0504, -0.2268],
         [-0.0889,  0.2274,  0.0563,  ..., -0.2062,  0.0148, -0.2420],
         [ 0.2520, -0.0005, -0.2848,  ..., -0.0739, -0.0354,  0.0410]]],
       grad_fn=<ViewBackward0>)

In [18]:
ffn(x).shape

torch.Size([2, 3, 768])

# example usage of skip connections

In [19]:
class DNN(nn.Module):
    def __init__(self,layer,use_shortcut):
        super().__init__()
        self.use_shortcut=use_shortcut
        self.layers=nn.ModuleList([
        nn.Sequential(nn.Linear(layer[0],layer[1]),GELU()),
        nn.Sequential(nn.Linear(layer[1],layer[2]),GELU()),
        nn.Sequential(nn.Linear(layer[2],layer[3]),GELU()),
        nn.Sequential(nn.Linear(layer[3],layer[4]),GELU()),
        nn.Sequential(nn.Linear(layer[4],layer[5]),GELU())
        ])
        

        
    def forward(self,x):
        for layer in self.layers:
            layer_output=layer(x)
            if self.use_shortcut and x.shape==layer_output.shape:
                x=x+layer_output
            else:
                x=layer_output
        return x
        
        

In [20]:
layer_size=[3,3,3,3,3,1]
sample_ip=torch.tensor([[1.,0.,-1.]])

op_wo_shortcut=DNN(layer_size,use_shortcut=False)
op=op_wo_shortcut(sample_ip)

In [21]:
op_w_shortcut=DNN(layer_size,use_shortcut=True)
op_=op_w_shortcut(sample_ip)

In [22]:
def print_grad(model, x):
    # Perform forward pass
    output = model(x)
    
    # Define the target tensor
    target = torch.zeros_like(output)
    
    # Compute loss
    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)
    
    # Clear previous gradients
    model.zero_grad()
    
    # Backward pass
    loss.backward()
    
    # Print gradient mean for weights
    for name, param in model.named_parameters():
        if 'weight' in name and param.grad is not None:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

    

In [23]:
print_grad(op_wo_shortcut,sample_ip)

layers.0.0.weight has gradient mean of 4.251542122801766e-06
layers.1.0.weight has gradient mean of 3.5761568142334e-05
layers.2.0.weight has gradient mean of 9.792059427127242e-05
layers.3.0.weight has gradient mean of 0.00019723761943168938
layers.4.0.weight has gradient mean of 0.0018265097169205546


In [24]:
print_grad(op_w_shortcut,sample_ip)

layers.0.0.weight has gradient mean of 0.5702555775642395
layers.1.0.weight has gradient mean of 0.7371833324432373
layers.2.0.weight has gradient mean of 0.8664534687995911
layers.3.0.weight has gradient mean of 0.6517516374588013
layers.4.0.weight has gradient mean of 4.082286834716797


# whole transformer block

In [25]:
class LayerNorm(nn.Module):
    def __init__(self,emb_dim):
        super(LayerNorm, self).__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
        
    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        var=x.var(dim=-1,keepdim=True)
        norm_x=(x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x+self.shift
    
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
        # Precompute constant √(2/π) as a buffer for efficiency
        self.sqrt_2_pi = math.sqrt(2.0 / math.pi)

    def forward(self, x):
        # Ensure dtype and device compatibility
        sqrt_2_pi = torch.tensor(self.sqrt_2_pi, dtype=x.dtype, device=x.device)
        return 0.5 * x * (1 + torch.tanh(sqrt_2_pi * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers=nn.Sequential(nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
                                 GELU(),
                                 nn.Linear(4*cfg["emb_dim"],cfg["emb_dim"])
                                 )
        
    def forward(self,x):
        return self.layers(x)
    
    

In [31]:
class Multihead_Attention_V2(nn.Module):
    def __init__(self,d_in,d_out,context_length,num_heads,dropout,qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"
        self.d_out=d_out
        self.num_heads=num_heads
        self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_keys=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_values=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.head_dim=d_out//num_heads
        self.out_proj=nn.Linear(d_out,d_out)
        self.dropout=nn.Dropout(dropout)
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))
        
    
    def forward(self,x):
        b,num_tokens,d_in=x.shape
        keys=self.W_keys(x)
        queries=self.W_query(x)
        values=self.W_values(x)
        #now change dimensions
        
        keys=keys.view(b,num_tokens,self.num_heads,self.head_dim)
        queries=queries.view(b,num_tokens,self.num_heads,self.head_dim)
        values=values.view(b,num_tokens,self.num_heads,self.head_dim)
        #group as num_heads
        keys=keys.transpose(1,2)
        queries=queries.transpose(1,2)
        values=values.transpose(1,2)
        
        attn_scores=queries @ keys.transpose(2,3)
        
        mask_bool=self.mask.bool()[:num_tokens, :num_tokens]  #if num_tokens is less than specified context length
        
        attn_scores.masked_fill_(mask_bool,-torch.inf)
        
        attn_weights=torch.softmax(attn_scores/keys.shape[-1]**0.5,dim=-1)
        attn_weights=self.dropout(attn_weights)
        
        context_vectors=(attn_weights @ values).transpose(1,2)
        context_vectors=context_vectors.contiguous().view(b,num_tokens,self.d_out)
        context_vectors=self.out_proj(context_vectors)
        
        return context_vectors
        
        
        

In [36]:
class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.att=Multihead_Attention_V2(d_in=cfg["emb_dim"],
                                       d_out=cfg["emb_dim"],
                                       context_length=cfg["context_length"],
                                       num_heads=cfg["n_heads"],
                                       dropout=cfg["dropout"],
                                       qkv_bias=cfg["qkv_bias"])
        self.ff=FeedForward(cfg)
        self.norm1=LayerNorm(cfg["emb_dim"])
        self.norm2=LayerNorm(cfg["emb_dim"])
        self.drop_shortcut=nn.Dropout(cfg["dropout"])
    
    def forward(self,x):
        shortcut=x;
        x=self.norm1(x)
        x=self.att(x)
        x=self.drop_shortcut(x)
        x=x+shortcut
        
        shortcut=x
        x=self.norm2(x)
        x=self.ff(x)
        x=self.drop_shortcut(x)
        x=x+shortcut
        return x

In [37]:
torch.manual_seed(123)
x=torch.rand(2,4,768)
block=TransformerBlock(GPT_2_CONFIGURATION_124M)
output=block(x)


In [38]:
output

tensor([[[ 0.1649,  0.4003, -0.0746,  ...,  1.2644,  0.3327,  0.7242],
         [ 0.0295,  0.0499,  0.2529,  ...,  0.4699,  0.1284,  0.9746],
         [ 0.5534,  0.5785, -0.0309,  ...,  1.1541,  0.3949,  0.7598],
         [ 0.1631,  0.7129,  0.7272,  ...,  0.3312,  0.5731,  0.9255]],

        [[ 0.1788,  1.1680,  0.5809,  ...,  0.1828,  0.0076, -0.5598],
         [-0.2919,  0.6317,  0.2002,  ...,  0.3218,  0.4671, -0.0381],
         [ 0.9273,  0.4202,  0.3183,  ...,  0.3771,  0.7189, -0.1203],
         [ 0.6033,  0.5767,  0.3411,  ...,  1.3796,  1.2681,  0.3915]]],
       grad_fn=<AddBackward0>)

In [39]:
x

tensor([[[0.2961, 0.5166, 0.2517,  ..., 0.9541, 0.8567, 0.4604],
         [0.2238, 0.3047, 0.3019,  ..., 0.5465, 0.4532, 0.7598],
         [0.6945, 0.2478, 0.4111,  ..., 0.8838, 0.4898, 0.5963],
         [0.0890, 0.7804, 0.9223,  ..., 0.4507, 0.6357, 0.5833]],

        [[0.5716, 0.9297, 0.3396,  ..., 0.0477, 0.4564, 0.2797],
         [0.0936, 0.2211, 0.3806,  ..., 0.3948, 0.4545, 0.4536],
         [0.6788, 0.1741, 0.2084,  ..., 0.5557, 0.5930, 0.0959],
         [0.3894, 0.4083, 0.0662,  ..., 0.9861, 0.9341, 0.1319]]])