token embedding --> positional embedding --> dropout --> layernorm --> attn --> dropout --> shortcut --> layernorm --> ff --> dropout --> shortcut --> layernorm --> linear


In [35]:
# define data and tokenizer
import tiktoken
import torch
from torch.utils.data import Dataset,DataLoader

import torch 
from torch.utils.data import Dataset,DataLoader

class MyDataset(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt,allowed_special={"<|endoftext>"})
        for i in range(0,len(token_ids)-max_length,stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self,idx):
        return self.input_ids[idx],self.target_ids[idx]

data_path = 'the-verdict.txt'
with open(data_path,'r',encoding='utf-8') as f:
    data = f.read()
tokenizer = tiktoken.get_encoding('gpt2')
max_length= 6
stride = 2
my_data = MyDataset(data,tokenizer,max_length,stride)
my_dataloader = DataLoader(my_data,batch_size = 1,shuffle=False)
data_iter = iter(my_dataloader)
inputs,targets = next(data_iter)
print(inputs)
print(targets)

tensor([[  40,  367, 2885, 1464, 1807, 3619]])
tensor([[ 367, 2885, 1464, 1807, 3619,  402]])


### model config

In [51]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # maximum context length of GPT model
    "emb_dim": 768,         # Word Embedding Vector Dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of decoder blocks
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

### tokenization and positional encoding

In [54]:
import torch.nn as nn
import torch

print("input shape: \n",inputs.shape)   # check input

token_embedding = nn.Embedding(GPT_CONFIG_124M["vocab_size"],GPT_CONFIG_124M["emb_dim"]) # embedding layer
pos_embedding = nn.Embedding(GPT_CONFIG_124M["vocab_size"],GPT_CONFIG_124M["emb_dim"])  # positional encoding layer

word_embedding = token_embedding(inputs) #  calculate word embedding
pos_encoding = pos_embedding(torch.arange(inputs.shape[-1]))    # calculate positional embedding
print("word embedding: \n",word_embedding,word_embedding.shape)
print("positional encoding: \n",pos_encoding,pos_encoding.shape)

model_input_vec = word_embedding + pos_encoding # model input = word embedding + positional embedding
print("model inputs: \n",model_input_vec,model_input_vec.shape)

input shape: 
 torch.Size([1, 6])
word embedding: 
 tensor([[[-1.0707,  0.3267,  1.9081,  ...,  1.0112,  0.8606, -0.5563],
         [ 0.6871,  0.2658,  0.2954,  ..., -2.8304,  1.3686,  0.3000],
         [ 1.1591, -0.6588, -0.1518,  ...,  2.5768,  1.7565, -0.0242],
         [ 0.0810, -0.6320,  0.3807,  ...,  0.4548,  0.9513, -2.0836],
         [-0.3517,  1.1319,  0.0939,  ...,  1.0578, -1.2894,  0.6249],
         [ 1.2559, -0.2192,  1.6321,  ..., -0.4892, -0.3940, -1.0325]]],
       grad_fn=<EmbeddingBackward0>) torch.Size([1, 6, 768])
positional encoding: 
 tensor([[ 0.5427,  1.1450,  0.6127,  ...,  0.0265,  0.0664, -0.3141],
        [-0.2300,  0.0572,  0.4217,  ..., -0.6298,  0.7333,  2.1454],
        [ 1.0172,  0.8188, -0.9856,  ...,  0.6877,  1.6905, -0.5306],
        [ 0.7248, -0.7288, -1.4576,  ...,  1.6022, -0.9895,  0.0201],
        [ 0.0245,  1.0586,  2.1763,  ...,  0.9470,  2.6867, -0.9282],
        [ 0.9627,  0.9783,  2.6095,  ...,  0.5074,  0.1534,  0.1154]],
       grad_fn=

### Dropout

In [81]:
dropout_layer = nn.Dropout(GPT_CONFIG_124M["drop_rate"]) #  dropout layer with rate
dropout_result = dropout_layer(model_input_vec) # perform dropout, randomly set values to zeros and balance others
print("tensor after dropout: ",dropout_result,dropout_result.shape)

tensor after dropout:  tensor([[[-0.5866,  1.6352,  2.8009,  ...,  1.1530,  1.0300, -0.9671],
         [ 0.5079,  0.3588,  0.7967,  ..., -3.8446,  2.3355,  2.7171],
         [ 2.4181,  0.1777, -1.2638,  ...,  3.6271,  3.8300, -0.0000],
         [ 0.8953, -1.5120, -1.1966,  ...,  2.2855, -0.0424, -2.2928],
         [-0.3635,  0.0000,  2.5225,  ...,  2.2275,  1.5525, -0.3370],
         [ 2.4651,  0.8434,  4.7129,  ...,  0.0202, -0.2673, -1.0190]]],
       grad_fn=<MulBackward0>) torch.Size([1, 6, 768])


### LayerNorm

In [103]:
eps = 1e-5
scale = nn.Parameter(torch.ones(GPT_CONFIG_124M["emb_dim"]))
shift = nn.Parameter(torch.zeros(GPT_CONFIG_124M["emb_dim"]))

mean = dropout_result.mean(dim=-1,keepdim=True) # mean before layerNorm
var = dropout_result.var(dim=-1,keepdim=True)   # variance before LayerNorm
print("[before LayerNorm] mean :",mean)
print("[before LayerNorm] var :",torch.sqrt(var))

print("-"*50)

layernorm_result_naive = (dropout_result - mean) / torch.sqrt(var + eps)
mean = layernorm_result_naive.mean(dim=-1,keepdim=True) # mean after layerNorm
var = layernorm_result_naive.var(dim=-1,keepdim=True)   # variance after LayerNorm
print("[after LayerNorm] mean :",mean)
print("[after LayerNorm] var :",torch.sqrt(var))

layernorm_result = scale * layernorm_result_naive + shift
print("LayerNorm result: ",layernorm_result)

[before LayerNorm] mean : tensor([[[ 0.0347],
         [-0.0565],
         [-0.0055],
         [ 0.0346],
         [ 0.0132],
         [-0.0284]]], grad_fn=<MeanBackward1>)
[before LayerNorm] var : tensor([[[1.5070],
         [1.4363],
         [1.5197],
         [1.4435],
         [1.5579],
         [1.5060]]], grad_fn=<SqrtBackward0>)
--------------------------------------------------
[after LayerNorm] mean : tensor([[[-3.7253e-09],
         [ 1.4901e-08],
         [-7.4506e-09],
         [ 2.4835e-09],
         [ 1.8006e-08],
         [-7.4506e-09]]], grad_fn=<MeanBackward1>)
[after LayerNorm] var : tensor([[[1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000]]], grad_fn=<SqrtBackward0>)
LayerNorm result:  tensor([[[-0.4123,  1.0621,  1.8356,  ...,  0.7421,  0.6605, -0.6648],
         [ 0.3929,  0.2892,  0.5940,  ..., -2.6375,  1.6654,  1.9311],
         [ 1.5948,  0.1206, -0.8280,  ...,  2.3903,  2.5238,  0.0036],
         [ 0.5963

### Attn

#### Create QKV Matrix

In [105]:
W_query = nn.Linear(GPT_CONFIG_124M["emb_dim"],GPT_CONFIG_124M["emb_dim"],bias = False)
W_key = nn.Linear(GPT_CONFIG_124M["emb_dim"],GPT_CONFIG_124M["emb_dim"],bias = False)
W_value = nn.Linear(GPT_CONFIG_124M["emb_dim"],GPT_CONFIG_124M["emb_dim"],bias = False)
print("query matrix: ",W_query.weight.shape)
print("key matrix: ",W_key.weight.shape)
print("value matrix: ",W_value.weight.shape)

query matrix:  torch.Size([768, 768])
key matrix:  torch.Size([768, 768])
value matrix:  torch.Size([768, 768])


#### Calculate Queries\Keys\Values

In [112]:
queries = W_query(layernorm_result)
keys = W_key(layernorm_result)
values = W_value(layernorm_result)
print("queries: ",queries, queries.shape)
print("keys: ",keys, keys.shape)
print("values: ",values, values.shape)

queries:  tensor([[[ 0.1639, -0.0128,  1.1365,  ..., -0.4442,  0.6329,  0.2322],
         [ 0.1386, -1.2747,  0.1236,  ...,  0.2472,  0.0772,  0.3138],
         [ 0.2302, -0.3563, -0.0912,  ...,  0.1018, -0.1375,  0.6736],
         [ 0.4807,  0.0962, -0.3384,  ...,  0.3566, -0.1532,  0.7028],
         [ 0.3144, -0.2109, -0.5028,  ...,  0.1917,  0.2838, -0.0147],
         [-0.1928, -0.0655,  1.1370,  ..., -0.7486, -0.2607, -0.0457]]],
       grad_fn=<UnsafeViewBackward0>) torch.Size([1, 6, 768])
keys:  tensor([[[-0.7292, -0.1690, -0.6514,  ...,  0.1623,  1.3654,  0.3343],
         [-0.9370, -0.6115, -0.0836,  ...,  0.8645, -0.8325,  0.5921],
         [ 0.9072, -0.8403,  0.2915,  ...,  0.4051, -0.9864, -0.1795],
         [ 0.3510,  1.0415,  1.2674,  ...,  0.1242,  0.1878, -0.9540],
         [ 0.4275, -0.8959, -0.2941,  ...,  0.4463, -0.3388, -0.1343],
         [-0.1989, -0.0279, -0.1887,  ...,  0.5720,  0.5910, -0.8605]]],
       grad_fn=<UnsafeViewBackward0>) torch.Size([1, 6, 768])
val

#### Break MultiHead

In [154]:
b,num_tokens,d_in = layernorm_result.shape
head_dim = GPT_CONFIG_124M["emb_dim"] // GPT_CONFIG_124M["n_heads"]
print('number of heads: ',head_dim)
multi_head_queries = queries.view(b,num_tokens,GPT_CONFIG_124M["n_heads"],head_dim)
multi_head_keys = keys.view(b,num_tokens,GPT_CONFIG_124M["n_heads"],head_dim)
multi_head_values = values.view(b,num_tokens,GPT_CONFIG_124M["n_heads"],head_dim)
multi_head_queries_result = multi_head_queries.transpose(1,2)
multi_head_keys_result = multi_head_keys.transpose(1,2)
multi_head_values_result = multi_head_values.transpose(1,2)
print("multiheaded queries: ",multi_head_queries_result.shape)
print("multiheaded keys: ",multi_head_keys_result.shape)
print("multiheaded values: ",multi_head_values_result.shape)

number of heads:  64
multiheaded queries:  torch.Size([1, 12, 6, 64])
multiheaded keys:  torch.Size([1, 12, 6, 64])
multiheaded values:  torch.Size([1, 12, 6, 64])


#### Calculate Attention Scores

In [155]:
attn_scores = multi_head_queries_result @ multi_head_keys_result.transpose(2,3)
attn_scores.shape

torch.Size([1, 12, 6, 6])

#### Attention Mask

In [137]:
num_tokens

6

In [156]:
mask = torch.triu(torch.ones(num_tokens,num_tokens)*-torch.inf,diagonal=1)
print("mask: ",mask)
masked_attn_scores = attn_scores + mask
print("masked attn score: ",masked_attn_scores)

mask:  tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]])
masked attn score:  tensor([[[[ 3.0475,    -inf,    -inf,    -inf,    -inf,    -inf],
          [ 3.4754,  2.5189,    -inf,    -inf,    -inf,    -inf],
          [ 4.1919, -2.4694,  2.5810,    -inf,    -inf,    -inf],
          [ 0.1487, -0.0587,  2.1706, -0.3441,    -inf,    -inf],
          [ 2.5241,  0.3745, -1.3315, -0.8973,  2.1271,    -inf],
          [ 0.4392,  4.3637,  3.0011,  2.8425, -1.1724, -2.5302]],

         [[-1.0943,    -inf,    -inf,    -inf,    -inf,    -inf],
          [ 1.1004, -0.2777,    -inf,    -inf,    -inf,    -inf],
          [-0.6079,  1.8770,  3.7414,    -inf,    -inf,    -inf],
          [-0.3304, -1.6215, -0.5770,  3.3490,    -inf,    -inf],
          [-0.4202, -1.1391,  0.8044, -1.0697,  2.0190,    -inf],
          

#### Calculate Attention Weights

In [157]:
masked_attn_weights = torch.softmax(masked_attn_scores / GPT_CONFIG_124M["n_heads"]**0.5,dim=-1)
masked_attn_weights.shape

torch.Size([1, 12, 6, 6])

#### Calculate Weighted Value of each token in the sequence
#### Merging MultiHeads Together 

In [161]:
context_vec_result = (masked_attn_weights @ multi_head_values_result).transpose(1,2).contiguous()
context_vec_result_summing_head = context_vec_result.view(b,num_tokens,GPT_CONFIG_124M["emb_dim"])
context_vec_result_summing_head.shape

torch.Size([1, 6, 768])

In [163]:
output_projection = nn.Linear(GPT_CONFIG_124M["emb_dim"],GPT_CONFIG_124M["emb_dim"])
multihead_output = output_projection(context_vec_result_summing_head)
print("multihead attention output: ",multihead_output,multihead_output.shape)

multihead attention output:  tensor([[[ 0.1952,  0.5433,  0.4261,  ...,  0.8400,  0.1059, -0.1498],
         [ 0.1407,  0.1945,  0.0678,  ...,  0.6771, -0.1707, -0.3266],
         [ 0.2719,  0.2131, -0.1079,  ...,  0.5154, -0.0534, -0.1696],
         [ 0.0587, -0.2183, -0.2088,  ...,  0.0873,  0.0508, -0.0040],
         [ 0.1689, -0.1435, -0.1604,  ..., -0.1684,  0.1032, -0.2286],
         [ 0.1441, -0.0958, -0.0515,  ...,  0.0512,  0.0426,  0.0291]]],
       grad_fn=<ViewBackward0>) torch.Size([1, 6, 768])


#### Dropout

In [164]:
dropout_layer_2 = nn.Dropout(GPT_CONFIG_124M["drop_rate"])
dropout_result_2 = dropout_layer_2(multihead_output)
print("dropout result 2: ",dropout_result_2,dropout_result_2.shape)

dropout result 2:  tensor([[[ 0.2169,  0.6036,  0.4734,  ...,  0.9333,  0.1176, -0.1664],
         [ 0.1563,  0.2161,  0.0753,  ...,  0.7523, -0.1897, -0.3629],
         [ 0.3021,  0.2368, -0.0000,  ...,  0.5727, -0.0000, -0.1885],
         [ 0.0652, -0.2426, -0.2320,  ...,  0.0970,  0.0564, -0.0044],
         [ 0.1876, -0.1594, -0.1783,  ..., -0.1871,  0.1146, -0.2541],
         [ 0.1601, -0.1064, -0.0572,  ...,  0.0568,  0.0473,  0.0324]]],
       grad_fn=<MulBackward0>) torch.Size([1, 6, 768])


#### Shortcut

In [165]:
shortcut_result = dropout_result + dropout_result_2
print("shortcut result:", shortcut_result,shortcut_result.shape)

shortcut result: tensor([[[-0.3697,  2.2389,  3.2743,  ...,  2.0863,  1.1476, -1.1335],
         [ 0.6642,  0.5750,  0.8720,  ..., -3.0923,  2.1458,  2.3542],
         [ 2.7203,  0.4146, -1.2638,  ...,  4.1998,  3.8300, -0.1885],
         [ 0.9605, -1.7545, -1.4286,  ...,  2.3825,  0.0140, -2.2972],
         [-0.1759, -0.1594,  2.3442,  ...,  2.0404,  1.6672, -0.5910],
         [ 2.6251,  0.7370,  4.6556,  ...,  0.0771, -0.2200, -0.9866]]],
       grad_fn=<AddBackward0>) torch.Size([1, 6, 768])


#### LayerNorm 2

In [166]:
eps = 1e-5
scale_2 = nn.Parameter(torch.ones(GPT_CONFIG_124M["emb_dim"]))
shift_2 = nn.Parameter(torch.zeros(GPT_CONFIG_124M["emb_dim"]))

mean = shortcut_result.mean(dim=-1,keepdim=True) # mean before layerNorm
var = shortcut_result.var(dim=-1,keepdim=True)   # variance before LayerNorm
print("[before LayerNorm] mean :",mean)
print("[before LayerNorm] var :",torch.sqrt(var))

print("-"*50)

layernorm_result_naive_2 = (shortcut_result - mean) / torch.sqrt(var + eps)
mean = layernorm_result_naive_2.mean(dim=-1,keepdim=True) # mean after layerNorm
var = layernorm_result_naive_2.var(dim=-1,keepdim=True)   # variance after LayerNorm
print("[after LayerNorm] mean :",mean)
print("[after LayerNorm] var :",torch.sqrt(var))

layernorm_result_2 = scale_2 * layernorm_result_naive_2 + shift_2
print("LayerNorm result: ",layernorm_result_2)

[before LayerNorm] mean : tensor([[[ 0.0311],
         [-0.0570],
         [-0.0058],
         [ 0.0490],
         [ 0.0211],
         [-0.0200]]], grad_fn=<MeanBackward1>)
[before LayerNorm] var : tensor([[[1.5284],
         [1.4403],
         [1.5326],
         [1.4480],
         [1.5775],
         [1.5077]]], grad_fn=<SqrtBackward0>)
--------------------------------------------------
[after LayerNorm] mean : tensor([[[-6.2088e-09],
         [-8.0715e-09],
         [ 8.6923e-09],
         [-1.2418e-09],
         [-6.2088e-10],
         [ 7.4506e-09]]], grad_fn=<MeanBackward1>)
[after LayerNorm] var : tensor([[[1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000]]], grad_fn=<SqrtBackward0>)
LayerNorm result:  tensor([[[-0.2622,  1.4445,  2.1220,  ...,  1.3447,  0.7305, -0.7620],
         [ 0.5007,  0.4388,  0.6450,  ..., -2.1074,  1.5294,  1.6741],
         [ 1.7787,  0.2743, -0.8208,  ...,  2.7441,  2.5028, -0.1192],
         [ 0.6295

In [167]:
def gelu(x):
    return 0.5 * x * (1 + torch.tanh(
                torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
                (x + 0.044715 * torch.pow(x, 3))
            ))
ff_1 = nn.Linear(GPT_CONFIG_124M["emb_dim"],4*GPT_CONFIG_124M["emb_dim"])
ff_2 = nn.Linear(4 * GPT_CONFIG_124M["emb_dim"],GPT_CONFIG_124M["emb_dim"])
ff_1_result = ff_1(layernorm_result_2)
print("first linear layer result: ",ff_1_result,ff_1_result.shape)
gelu_result = gelu(ff_1_result)
print("gelu result: ",gelu_result,gelu_result.shape)
ff_2_result = ff_2(gelu_result)
print("second linear layer result: ",ff_2_result,ff_2_result.shape)

first linear layer result:  tensor([[[ 0.1646, -1.8029,  0.1590,  ..., -1.1246, -0.3815,  0.3156],
         [-0.8553, -0.3036,  0.3285,  ..., -0.5555, -0.3560,  0.2702],
         [ 0.1294, -0.3083, -0.3381,  ..., -0.6389, -0.5909, -0.3548],
         [-0.9150,  0.8642,  0.2736,  ..., -0.1818, -0.1741,  0.7490],
         [ 1.0397,  1.2555, -0.7647,  ..., -0.1143,  0.2238, -0.4018],
         [ 0.5079,  0.7150, -1.6059,  ..., -0.1364,  0.0825, -0.8098]]],
       grad_fn=<ViewBackward0>) torch.Size([1, 6, 3072])
gelu result:  tensor([[[ 0.0931, -0.0644,  0.0895,  ..., -0.1468, -0.1341,  0.1969],
         [-0.1679, -0.1156,  0.2065,  ..., -0.1607, -0.1285,  0.1639],
         [ 0.0714, -0.1168, -0.1243,  ..., -0.1671, -0.1639, -0.1282],
         [-0.1649,  0.6967,  0.1663,  ..., -0.0778, -0.0750,  0.5789],
         [ 0.8843,  1.1239, -0.1700,  ..., -0.0520,  0.1317, -0.1382],
         [ 0.3526,  0.5453, -0.0871,  ..., -0.0608,  0.0440, -0.1694]]],
       grad_fn=<MulBackward0>) torch.Size([1,

#### Dropout

In [168]:
dropout_layer_3 = nn.Dropout(GPT_CONFIG_124M["drop_rate"])
dropout_result_3 = dropout_layer_3(ff_2_result)
print("Dropout3 result: ",dropout_result_3,dropout_result_3.shape)

Dropout3 result:  tensor([[[ 0.6224,  0.0893, -0.1734,  ...,  0.4368, -0.4262, -0.1677],
         [ 0.2062, -0.1888,  0.4231,  ...,  0.1842,  0.3557,  0.1586],
         [ 0.3916, -0.0821,  0.1266,  ...,  0.1601, -0.5914, -0.0166],
         [ 0.0000,  0.0856,  0.0315,  ...,  0.1568, -0.1555, -0.0000],
         [ 0.0000,  0.1601,  0.2183,  ...,  0.1335, -0.2331,  0.0556],
         [ 0.4867, -0.0463,  0.0454,  ..., -0.0857,  0.2614, -0.1990]]],
       grad_fn=<MulBackward0>) torch.Size([1, 6, 768])


#### shortcut

In [169]:
shortcut_result_2 = shortcut_result + dropout_result_3
print("shortcut result 2:",shortcut_result_2,shortcut_result_2.shape)

shortcut result 2: tensor([[[ 2.5274e-01,  2.3281e+00,  3.1009e+00,  ...,  2.5231e+00,
           7.2147e-01, -1.3012e+00],
         [ 8.7043e-01,  3.8618e-01,  1.2952e+00,  ..., -2.9081e+00,
           2.5015e+00,  2.5128e+00],
         [ 3.1119e+00,  3.3247e-01, -1.1371e+00,  ...,  4.3599e+00,
           3.2386e+00, -2.0509e-01],
         [ 9.6054e-01, -1.6690e+00, -1.3971e+00,  ...,  2.5393e+00,
          -1.4153e-01, -2.2972e+00],
         [-1.7592e-01,  6.4646e-04,  2.5625e+00,  ...,  2.1739e+00,
           1.4341e+00, -5.3538e-01],
         [ 3.1118e+00,  6.9061e-01,  4.7010e+00,  ..., -8.6216e-03,
           4.1355e-02, -1.1856e+00]]], grad_fn=<AddBackward0>) torch.Size([1, 6, 768])


### Final Norm

In [171]:
eps = 1e-5
scale_final = nn.Parameter(torch.ones(GPT_CONFIG_124M["emb_dim"]))
shift_final = nn.Parameter(torch.zeros(GPT_CONFIG_124M["emb_dim"]))

mean = shortcut_result_2.mean(dim=-1,keepdim=True) # mean before layerNorm
var = shortcut_result_2.var(dim=-1,keepdim=True)   # variance before LayerNorm
print("[before LayerNorm] mean :",mean)
print("[before LayerNorm] var :",torch.sqrt(var))

print("-"*50)

layernorm_result_naive_final = (shortcut_result_2 - mean) / torch.sqrt(var + eps)
mean = layernorm_result_naive_final.mean(dim=-1,keepdim=True) # mean after layerNorm
var = layernorm_result_naive_final.var(dim=-1,keepdim=True)   # variance after LayerNorm
print("[after LayerNorm] mean :",mean)
print("[after LayerNorm] var :",torch.sqrt(var))

layernorm_result_final = scale_final * layernorm_result_naive_final + shift_final
print("LayerNorm result: ",layernorm_result_final,layernorm_result_final.shape)

[before LayerNorm] mean : tensor([[[ 0.0259],
         [-0.0576],
         [-0.0041],
         [ 0.0461],
         [ 0.0246],
         [-0.0275]]], grad_fn=<MeanBackward1>)
[before LayerNorm] var : tensor([[[1.5469],
         [1.4535],
         [1.5433],
         [1.4591],
         [1.6011],
         [1.5157]]], grad_fn=<SqrtBackward0>)
--------------------------------------------------
[after LayerNorm] mean : tensor([[[-1.2418e-08],
         [-9.9341e-09],
         [ 3.7253e-09],
         [-2.4835e-09],
         [ 4.3462e-09],
         [ 2.4835e-09]]], grad_fn=<MeanBackward1>)
[after LayerNorm] var : tensor([[[1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000],
         [1.0000]]], grad_fn=<SqrtBackward0>)
LayerNorm result:  tensor([[[ 0.1466,  1.4883,  1.9879,  ...,  1.6144,  0.4497, -0.8579],
         [ 0.6385,  0.3053,  0.9307,  ..., -1.9612,  1.7607,  1.7685],
         [ 2.0190,  0.2181, -0.7342,  ...,  2.8277,  2.1011, -0.1303],
         [ 0.6267

### Output Layer: project result to vocab size logits

In [172]:
output_proj = nn.Linear(GPT_CONFIG_124M["emb_dim"],GPT_CONFIG_124M["vocab_size"],bias=False)
model_output = output_proj(layernorm_result_final)
print("model output: ",model_output,model_output.shape)
# up to here the original sequence has been shifted by model and predict one new token. The output has same seq length with input, except the .

model output:  tensor([[[-0.0912, -0.4733,  0.5195,  ..., -0.7332, -1.0550, -0.8636],
         [ 0.4925, -0.0938, -0.1721,  ..., -0.0998,  0.2614,  0.0258],
         [ 0.0129, -0.1933,  0.3204,  ...,  0.1586, -0.4201,  0.3723],
         [ 1.1929, -0.1604,  1.0917,  ...,  0.1996,  0.8766, -1.6754],
         [-0.9656,  0.0564,  0.1322,  ...,  0.4414, -0.0039, -0.1358],
         [ 0.2202, -0.3058,  0.5253,  ...,  0.1822, -0.0514, -0.4342]]],
       grad_fn=<UnsafeViewBackward0>) torch.Size([1, 6, 50257])


In [177]:
logits = model_output[:,-1,:] # get the last token in seq
logits

tensor([[ 0.2202, -0.3058,  0.5253,  ...,  0.1822, -0.0514, -0.4342]],
       grad_fn=<SliceBackward0>)

In [187]:
probas = torch.softmax(logits,dim=-1)
idx_next = torch.argmax(probas,dim=-1,keepdim=True)
tokenizer.decode(idx_next.detach().numpy().tolist()[0])

'zzi'

In [188]:
tokenizer.decode_batch(inputs.detach().numpy().tolist()+idx_next.detach().numpy().tolist())

['I HAD always thought Jack', 'zzi']

In [189]:
tokenizer.decode_batch(targets.detach().numpy().tolist())

[' HAD always thought Jack G']