In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length":1024,
    "emb_dim":768,
    "n_heads":12,
    "n_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False
    
}

In [86]:
import torch
from torch import nn

class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'],cfg['emb_dim'])        
        self.drop_emb = nn.Dropout(cfg['drop_rate'])
        
        #Use a placeholder for transformerBlock
        self.trf_blocks = nn.Sequential(
        *[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )
        
        #Use a Placeholder for layerNorm
        self.final_norm = DummyLayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(
        cfg['emb_dim'],cfg['vocab_size'],bias=False
        )
        
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        X = tok_embeds + pos_embeds
#         print(X.shape)
        x = self.drop_emb(X)
        x = self.trf_blocks(x)
        
        x = self.final_norm(x)
        
        logits = self.out_head(x)
#         print(logits.shahen pe)
        return logits
    
    
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        
    def forward(self, x):
        return x
    
    
class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape,eps=1e-5):
        super().__init__()
        
    def forward(self, x):
        return x

In [88]:
x= DummyGPTModel(GPT_CONFIG_124M)
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch,dim=0)
x(batch)

torch.Size([2, 4, 768])
torch.Size([2, 4, 50257])


tensor([[[-0.0770, -0.5369, -0.6501,  ...,  0.4394,  0.3079,  1.2629],
         [ 1.2624, -0.7648,  0.0168,  ..., -0.3997,  0.2210,  0.2918],
         [ 0.4756,  0.0161, -1.4591,  ...,  1.1067,  0.0520, -1.1703],
         [ 2.1591,  0.5228, -0.0922,  ...,  1.3586, -0.2436, -0.3894]],

        [[-0.9236, -1.1660, -0.2661,  ...,  0.5079,  0.0575,  0.6692],
         [ 0.3894, -0.9877, -0.4502,  ..., -0.4546,  1.0589, -0.2741],
         [ 0.7883, -0.5361, -0.3320,  ...,  1.3431,  0.2862, -0.3871],
         [ 1.1517,  0.9609,  0.8355,  ...,  1.8312, -0.6781,  0.2668]]],
       grad_fn=<UnsafeViewBackward0>)

In [76]:
temp = nn.Embedding(50257,768)

In [79]:
temp(torch.tensor(batch))

  temp(torch.tensor(batch))


tensor([[[-0.8447,  0.9716, -1.1744,  ...,  0.6664,  0.4375,  0.0322],
         [-0.7639, -0.0385, -0.9119,  ..., -0.0111,  0.5014, -0.1699],
         [ 0.4867,  0.3748, -0.1322,  ...,  1.3440, -2.5451,  0.3944],
         [ 0.3643,  0.3740,  0.4665,  ...,  2.4661, -0.5134, -0.2861]],

        [[-0.8447,  0.9716, -1.1744,  ...,  0.6664,  0.4375,  0.0322],
         [-1.2707, -1.7676, -0.1824,  ..., -0.2419, -0.8283,  0.7734],
         [ 0.6793,  0.4326,  0.1375,  ..., -1.6505, -0.1433, -0.1632],
         [ 1.9008,  0.7583,  0.7721,  ..., -0.3160, -1.2656,  0.4164]]],
       grad_fn=<EmbeddingBackward0>)

In [80]:
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [89]:
#LAyer Normalization

In [100]:
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5,6),nn.ReLU())
out = layer(batch_example)

In [102]:
out

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [106]:
out[0].mean()/out[0].std()

tensor(0.8719, grad_fn=<DivBackward0>)

In [112]:
mean = out.mean(dim=-1,keepdim=True)
var = out.var(dim=-1,keepdim=True)
print("Mean: ",mean)
print("Variance: ",var)

Mean:  tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:  tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [145]:
out_norm = (out-mean)/torch.sqrt(var)
mean = out_norm.mean(dim=-1,keepdim=True)
var = out_norm.var(dim=-1,keepdim=True)
print("Normalized layer outputs: ",out_norm)
print("Mean: ",mean)
print("Var: ",var)


Normalized layer outputs:  tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:  tensor([[-1.1921e-07],
        [-3.7750e-07]], grad_fn=<MeanBackward1>)
Var:  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [138]:
out_norm.mean(dim=-1, keepdim=True)

tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)

In [146]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps=1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))        
        
    def forward(self, x):
        mean = x.mean(dim=-1,keepdim=True)
        std = x.std(dim=-1,keepdim=True)        
        norm_x = (x-mean)/(std + self.eps)
        return self.scale * norm_x + self.shift

In [147]:
x = LayerNorm(10)

In [149]:
x(torch.randn(10))

tensor([-0.8988, -1.0755,  0.8301, -1.1406,  0.2874, -1.0125,  0.5258,  1.7338,
        -0.0569,  0.8072], grad_fn=<AddBackward0>)

In [165]:
import torch
from torch import nn

class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'],cfg['emb_dim'])        
        self.drop_emb = nn.Dropout(cfg['drop_rate'])
        
        #Use a placeholder for transformerBlock
        self.trf_blocks = nn.Sequential(
        *[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )
        
        #Use a Placeholder for layerNorm
        self.final_norm = DummyLayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(
        cfg['emb_dim'],cfg['vocab_size'],bias=False
        )
        
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        X = tok_embeds + pos_embeds
#         print(X.shape)
        x = self.drop_emb(X)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        
        logits = self.out_head(x)
#         print(logits.shahen pe)
        return logits
    
    
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        
    def forward(self, x):
        return x
    
    
class DummyLayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [166]:
x = DummyGPTModel(GPT_CONFIG_124M)

In [167]:
x(batch)

tensor([[[ 0.2204,  0.1094, -0.8397,  ..., -0.3039, -0.2998,  0.7156],
         [-0.5685, -1.0897,  0.9066,  ..., -0.0817, -0.9036,  0.8595],
         [-0.8813, -0.1679, -0.1316,  ..., -0.0539, -0.3489,  0.1407],
         [-0.4067, -1.1791,  0.3187,  ...,  0.9100,  0.0186,  0.0661]],

        [[-0.0232, -0.2497, -0.6211,  ..., -0.0812, -0.1655,  0.6932],
         [-0.9943, -1.2283, -0.5148,  ...,  0.4684, -1.3461, -0.5234],
         [ 0.6375, -0.2633, -0.0044,  ..., -0.2200,  0.6147,  0.7000],
         [-0.4192,  0.0057,  0.1095,  ...,  0.0214,  0.1416,  0.0639]]],
       grad_fn=<UnsafeViewBackward0>)