In [1]:
# Hyperparameters for the model built in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # not consumed by the cells shown here; presumably the tokenizer vocabulary size
    context_length=1024,   # side length of the causal mask registered by the attention layer
    emb_dim=768,           # embedding width; used as both d_in and d_out of attention
    n_heads=12,            # number of attention heads (must divide emb_dim)
    n_layers=12,           # not consumed by the cells shown here; presumably the number of stacked blocks
    drop_rate=0.1,         # dropout probability for attention weights and shortcut dropout
    qkv_bias=False,        # whether the query/key/value projections carry a bias term
)

In [2]:
import torch
import torch.nn as nn

In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``n_heads`` heads of size ``d_out // n_heads``, attended with
    scaled dot-product attention under an upper-triangular (causal) mask,
    re-merged, and passed through a final linear projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width across all heads; must be divisible by
            ``n_heads``.
        context_length: Side length of the square causal mask that is
            registered as a buffer.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self , d_in , d_out , context_length , n_heads , dropout , qkv_bias=False):
        super().__init__()
        assert d_out % n_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.n_heads = n_heads
        self.head_dim = d_out // n_heads

        self.W_query = nn.Linear(d_in , d_out , bias=qkv_bias)
        self.W_key = nn.Linear(d_in , d_out , bias=qkv_bias)
        self.W_value = nn.Linear(d_in , d_out , bias=qkv_bias)
        self.out_proj = nn.Linear(d_out , d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the (query, key) pairs that
        # must be hidden. Registered as a buffer so it moves with the module
        # across devices.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length , context_length), diagonal=1)
        )

    # NOTE: forward must be a method of the class (not nested inside
    # __init__), otherwise nn.Module cannot dispatch to it.
    def forward(self , x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(b, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(b, num_tokens, d_out)``.
        """
        b , num_tokens , _ = x.shape

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Split the projection width into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, n_heads, head_dim)
        keys = keys.view(b , num_tokens , self.n_heads , self.head_dim)
        queries = queries.view(b , num_tokens , self.n_heads , self.head_dim)
        values = values.view(b , num_tokens , self.n_heads , self.head_dim)

        # Move the head axis forward:
        # (b, num_tokens, n_heads, head_dim) -> (b, n_heads, num_tokens, head_dim)
        keys = keys.transpose(1 , 2)
        queries = queries.transpose(1 , 2)
        values = values.transpose(1 , 2)

        # Dot-product scores per head: (b, n_heads, num_tokens, num_tokens)
        attention_scores = queries @ keys.transpose(2 , 3)

        # Truncate the registered mask to the current sequence length.
        mask_bool = self.mask.bool()[:num_tokens , :num_tokens]

        # masked_fill is out-of-place, so its result has to be kept;
        # discarding it would leave the scores unmasked (no causality).
        attention_scores = attention_scores.masked_fill(mask_bool , -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attention_weights = torch.softmax(
            attention_scores / keys.shape[-1] ** 0.5 , dim=-1
        )
        attention_weights = self.dropout(attention_weights)

        # (b, n_heads, num_tokens, head_dim) -> (b, num_tokens, n_heads, head_dim)
        context_vec = (attention_weights @ values).transpose(1 , 2)

        # Merge heads back together, where d_out = n_heads * head_dim.
        # contiguous() is required because transpose returns a strided view.
        context_vec = context_vec.contiguous().view(b , num_tokens , self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Size of the last dimension of the inputs being normalized.
    """

    def __init__(self , emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance to avoid division by zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self , x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        mean = x.mean(dim=-1 , keepdim=True)
        # Layer normalization uses the biased (population) variance, i.e.
        # division by N rather than N - 1; this matches torch.nn.LayerNorm.
        var = x.var(dim=-1 , keepdim=True , unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """

    def __init__(self):
        super().__init__()

    def forward(self , x):
        """Apply the activation element-wise to ``x``."""
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x , 3))
        ))
        
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, GELU, project back.

    Args:
        cfg: Mapping that provides the embedding width under ``'emb_dim'``.
    """

    def __init__(self , cfg):
        super().__init__()
        width = cfg['emb_dim']
        hidden = 4 * width  # inner layer is four times wider than the embedding
        self.layers = nn.Sequential(
            nn.Linear(width , hidden),
            GELU(),
            nn.Linear(hidden , width),
        )

    def forward(self , x):
        """Run ``x`` through the expand -> activation -> contract stack."""
        return self.layers(x)

In [None]:
class TranformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

    Note: the class name is missing an "s"; it is kept so existing callers
    keep working, and a correctly spelled ``TransformerBlock`` alias is
    defined right after the class.

    Args:
        cfg: Mapping with the keys ``'emb_dim'``, ``'context_length'``,
            ``'n_heads'``, ``'drop_rate'`` and ``'qkv_bias'``.
    """

    def __init__(self , cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            # MultiHeadAttention names this parameter `n_heads`; passing
            # `num_heads=` raises a TypeError.
            n_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

    def forward(self , x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        # Attention sub-layer with shortcut connection.
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        # Dropout on the sub-layer output before the residual add, so both
        # branches are regularized the same way.
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward sub-layer with shortcut connection.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x


# Correctly spelled alias; the original (misspelled) name remains valid.
TransformerBlock = TranformerBlock