## RAW TEXT FOR INPUT, AND GPT-2 CONFIG

In [10]:
# Load the raw training corpus. The encoding is given explicitly so decoding
# does not depend on the platform default.
with open("the-verdict.txt","r",encoding='utf-8') as f:
    text_data = f.read()
    
# Sanity check: corpus size plus a preview of the first 1000 characters.
print("Total number of characters:", len(text_data))
print(text_data[:1000])    

# Hyperparameters read via cfg[...] by the model classes defined later in this
# notebook (GPTModel, TransformerBlock, FeedForward).
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}


Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its li

## WHY WE NEED BYTE PAIR ENCODING

In [11]:
import re

# Split on punctuation, double dashes and whitespace. The capture group keeps
# the delimiters in the result, so punctuation becomes its own token.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text_data)
# Drop whitespace-only / empty pieces and trim the rest.
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

# Unique tokens in sorted order; the position in this list becomes the token id.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)


# token string -> integer id
vocab = {token:integer for integer,token in enumerate(all_words)}


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [12]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every
    resulting token must already be present in the vocabulary; unknown tokens
    are not handled (see ``encode``).
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the inverse id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: input string.

        Returns:
            List of integer ids, one per non-empty token.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Args:
            ids: iterable of integer token ids.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space before punctuation. ':' and ';' are included
        # because encode() splits on them as well; without them a round trip
        # turned "a; b" into "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text
    
    
# Round-trip a sentence whose tokens all occur in the training text.
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))

print("============")


# OOV problem: "Hello" is not in the vocabulary, so encode() raises KeyError.
# The error is caught and displayed so that the demonstration does not stop
# "Restart & Run All" at this cell.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


KeyError: 'Hello'

In [13]:
# Rebuild the vocabulary and append two special tokens after the sorted words,
# so they receive the two highest ids:
#   <|endoftext|> - separator between unrelated texts
#   <|unk|>       - placeholder for tokens missing from the vocabulary
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary is expected to contain the ``<|unk|>`` entry; without it an
    unknown token still results in a ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of token ids for ``text``.

        The text is split on punctuation, double dashes and whitespace;
        tokens that are not in the vocabulary are replaced by ``<|unk|>``.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # whitespace-only / empty split results carry no token
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Drop the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)
    

tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join two independent texts with the end-of-text separator token.
text = " <|endoftext|> ".join((text1, text2))

# Words missing from the vocabulary come back as <|unk|> after the round trip.
print(text)
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))    

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [14]:
! pip3 install tiktoken

import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces" "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

strings = tokenizer.decode(integers)
print(strings)

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


## WHY WE NEED DATALOADER

In [15]:
# We could iterate over the token ids with plain for-loops, but every extra feature would then need its own hand-written conditions,
# for example shuffling, dropping the last incomplete batch (drop_last), or multi-process loading (num_workers).

In [18]:

import torch
from torch.utils.data import Dataset,DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Each sample is a window of ``context_size``
    token ids; its target is the same window shifted one token to the right.

    Args:
        text: raw text to tokenize.
        tokenizer: object exposing ``encode(text, allowed_special=...)``.
        context_size: number of token ids per sample.
        stride: distance between the start positions of consecutive windows.
    """

    def __init__(self,text,tokenizer,context_size,stride):
        encoded = tokenizer.encode(text,allowed_special={"<|endoftext|>"})

        # A window may only start where a full, one-token-shifted target
        # window still fits into the encoded text.
        starts = range(0, len(encoded) - context_size, stride)

        self.input_ids = [
            torch.tensor(encoded[s:s + context_size]) for s in starts
        ]
        self.target_ids = [
            torch.tensor(encoded[s + 1:s + context_size + 1]) for s in starts
        ]

    def __len__(self):
        """Number of samples; DataLoader derives the batch count from it."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input_ids, target_ids)`` pair at position ``idx``."""
        return self.input_ids[idx] , self.target_ids[idx]
    
    
def CreateDataloaderV1(text,context_size,batch_size,stride,shuffle,drop_last,num_workers):
    """Build a DataLoader over sliding windows of GPT-2 token ids.

    Args:
        text: raw text to tokenize.
        context_size: number of token ids per sample.
        batch_size: samples per batch.
        stride: step between consecutive window start positions.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``text``.
    """
    encoding = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GPTDatasetV1(text, encoding, context_size, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


#Example usage

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
# Windows of 6 tokens that start every 3 tokens, so neighbouring windows overlap.
# shuffle=True and no seed is set in this cell, so the first batch may differ between runs.
dataloader = CreateDataloaderV1(raw_text, context_size=6, batch_size=2, stride=3, shuffle=True, drop_last=True, num_workers=0)

dataloader_iter = iter(dataloader)

# Each batch is a pair: [input ids, target ids].
first_batch = next(dataloader_iter)
print(first_batch)

# Targets are the inputs shifted by one token.
inputs, targets = first_batch
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)
        

[tensor([[41793,    13,   632,   373,   465,   898],
        [  373,   645,   530,   588,   683,   438]]), tensor([[   13,   632,   373,   465,   898, 41793],
        [  645,   530,   588,   683,   438,  8807]])]
Inputs:
 tensor([[41793,    13,   632,   373,   465,   898],
        [  373,   645,   530,   588,   683,   438]])

Targets:
 tensor([[   13,   632,   373,   465,   898, 41793],
        [  645,   530,   588,   683,   438,  8807]])


## CREATE TOKEN AND POSITIONAL EMBEDDING

In [17]:
# Embedding layers are just lookup tables: embedding_layer(input_ids) returns one row per id, and input_ids must be a tensor.

In [21]:
import torch
input_ids = torch.tensor([1,0,2]) #1,0,2 are token ids

torch.manual_seed(23)
vocab_size = 10 #in gpt2 vocab size is 50257
context_size = 4 #number of tokens in context in gpt2 is 1024
embed_size = 5

embedding_layer = torch.nn.Embedding(vocab_size,embed_size) #after training act as a lookup table for tokens and returns their embeddings
pos_layer = torch.nn.Embedding(context_size,embed_size)

# Positional embeddings are indexed by the position of a token in the
# sequence (0, 1, 2, ...), not by its token id. Looking them up with
# input_ids would pick rows 1, 0, 2 and would raise an IndexError for any
# token id >= context_size.
positions = torch.arange(input_ids.shape[0])

print(f"embedding layer weights are : \n {embedding_layer.weight}\n")

print(f"embedding of input token ids : \n { embedding_layer(input_ids) }\n")

print(f"positional embedding layer weights are : \n {pos_layer.weight}\n")

print(f"positional embedding of input token ids : \n { pos_layer(positions) }\n")
#positional embedding is used to add positional information to the token embeddings


# Final model input: token embedding + embedding of the token's position.
input_embeddings = embedding_layer(input_ids) + pos_layer(positions)
print(f"input embeddings with positional embeddings : \n {input_embeddings}\n")

embedding layer weights are : 
 Parameter containing:
tensor([[-0.9012,  0.5656, -0.4882,  0.7507,  0.5893],
        [-0.4552, -0.8135,  0.2670, -0.5531,  0.6016],
        [-0.9271,  0.5655, -2.4451, -0.1605,  0.1804],
        [ 2.2347, -0.6774,  0.8949,  0.9096,  0.4260],
        [ 1.2886, -0.1708, -0.8564, -0.6576, -0.2041],
        [ 0.1203, -0.6191, -0.6317, -0.5774,  0.5874],
        [ 0.1230,  0.0885, -0.8708,  1.3073, -0.1244],
        [-0.8531,  1.2268, -2.0151,  0.1955, -1.5921],
        [-1.4647, -0.4805, -0.7016,  0.1341,  1.9434],
        [ 1.0825, -1.5422,  0.6945, -0.2041, -0.6442]], requires_grad=True)

embedding of input token ids : 
 tensor([[-0.4552, -0.8135,  0.2670, -0.5531,  0.6016],
        [-0.9012,  0.5656, -0.4882,  0.7507,  0.5893],
        [-0.9271,  0.5655, -2.4451, -0.1605,  0.1804]],
       grad_fn=<EmbeddingBackward0>)

positional embedding layer weights are : 
 Parameter containing:
tensor([[ 1.1093, -0.1230, -0.6921,  1.2534, -1.5720],
        [-1.4571,

## IMPLEMENTING SIMPLIFIED ATTENTION

In [23]:
import torch

# Toy input: six tokens (one per row), each represented by a 3-dimensional vector.
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [24]:

# Explicit-loop version of the score computation, kept for reference:
# attn_scores = torch.empty(6, 6)
# for i, x_i in enumerate(inputs):
#     for j, x_j in enumerate(inputs):
#         attn_scores[i, j] = torch.dot(x_i, x_j)

# print(attn_scores)

# Pairwise dot products of all token vectors in a single matrix multiplication.
attn_scores = inputs @ inputs.T
print(f"attention scores are : \n {attn_scores}\n ")

attn_weights = torch.softmax(attn_scores, dim=-1)  #along the last dimension, i.e. across the columns, so every row sums to 1
print(f"attention weights are :\n {attn_weights}\n ")

# Each context vector is the attention-weighted sum of all input vectors.
all_context_vecs = attn_weights @ inputs
print(f" context vectors are :\n {all_context_vecs}\n")

attention scores are : 
 tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
 
attention weights are :
 tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
 
 context vectors are :
 tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])



## IMPLEMENTING COMPACT SELF ATTENTION CLASS

In [25]:
import torch

# Toy input: six tokens (one per row), each a 3-dimensional vector.
# Identical to the `inputs` tensor defined in the simplified-attention section.
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [28]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):
    """Scaled dot-product self-attention with raw weight matrices.

    The query, key and value projections are plain ``nn.Parameter`` matrices
    of shape ``(d_in, d_out)`` initialised with ``torch.rand``.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Created in the order query, key, value so that a seeded
        # initialisation yields the same weights for each matrix.
        for name in ("W_query", "W_key", "W_value"):
            setattr(self, name, nn.Parameter(torch.rand(d_in, d_out)))

    def forward(self, x):
        """Return the context vectors for a 2-D input ``x``.

        ``x`` is multiplied by the ``(d_in, d_out)`` weight matrices, and
        ``.T`` is used on the keys, so the input is a ``(tokens, d_in)``
        matrix.
        """
        q = x @ self.W_query
        k = x @ self.W_key
        v = x @ self.W_value

        # Scale the scores by sqrt(d_out) before normalising each row.
        d_k = k.shape[-1]
        scaled_scores = (q @ k.T) / d_k**0.5
        weights = torch.softmax(scaled_scores, dim=-1)

        return weights @ v

'''
Additionally, a significant advantage of using nn.Linear instead of manually
implementing nn.Parameter(torch.rand(...)) is that nn.Linear has an optimized weight
initialization scheme, contributing to more stable and effective model training.
'''

class SelfAttention_v2(nn.Module):
    """Scaled dot-product self-attention built from ``nn.Linear`` layers.

    Args:
        d_in: size of each input vector.
        d_out: size of the query/key/value projections.
        qkv_bias: whether the three projections use a bias term.
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()

        def projection():
            return nn.Linear(d_in, d_out, bias=qkv_bias)

        # Order query, key, value is kept so seeded initialisation matches.
        self.W_query = projection()
        self.W_key   = projection()
        self.W_value = projection()

    def forward(self, x):
        """Return the context vectors for a 2-D ``(tokens, d_in)`` input."""
        q = self.W_query(x)
        k = self.W_key(x)
        v = self.W_value(x)

        d_k = k.shape[-1]
        weights = torch.softmax((q @ k.T) / d_k**0.5, dim=-1)
        return weights @ v
    

# Seed once; both modules then draw their initial weights from the same
# generator, v1 first and v2 second.
torch.manual_seed(123)
d_in = 3  #embed dimension
d_out = 2 #output_dim
sa_v1 = SelfAttention_v1(d_in, d_out)
print(f"self attention v1 : \n {sa_v1(inputs)} \n ")

# v2 uses nn.Linear and therefore a different initialisation than v1,
# so the two outputs are not expected to match.
sa_v2 = SelfAttention_v2(d_in, d_out)
print(f"self attention v2 : \n {sa_v2(inputs)} \n ")

self attention v1 : 
 tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>) 
 
self attention v2 : 
 tensor([[0.5085, 0.3508],
        [0.5084, 0.3508],
        [0.5084, 0.3506],
        [0.5074, 0.3471],
        [0.5076, 0.3446],
        [0.5077, 0.3493]], grad_fn=<MmBackward0>) 
 


## IMPLEMENTING CAUSAL ATTENTION AND MULTI HEAD ATTENTION

In [29]:
import torch

# Toy input: six tokens (one per row), each a 3-dimensional vector.
# Identical to the `inputs` tensor defined in the earlier attention sections.
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

class CausalAttention(nn.Module):
    """Single-head causal self-attention over batched input.

    Every token may attend only to itself and to earlier tokens; attention
    to later positions is masked out before the softmax.

    Args:
        d_in: size of each input vector.
        d_out: size of the query/key/value projections.
        context_length: largest sequence length covered by the mask.
        dropout: dropout probability applied to the attention weights.
        qkv_bias: whether the three projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length,
                 dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)

        # Ones above the diagonal mark the "future" positions. Registering
        # the mask as a buffer (rather than a plain attribute) makes it move
        # to the right device together with the module, which avoids device
        # mismatch errors without any manual bookkeeping.
        future_positions = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer('mask', future_positions)

    def forward(self, x):
        """Return context vectors of shape ``(batch, num_tokens, d_out)``.

        ``x`` must be 3-dimensional: ``(batch, num_tokens, d_in)``.
        """
        _, num_tokens, _ = x.shape
        q = self.W_query(x)
        k = self.W_key(x)
        v = self.W_value(x)

        # Transpose only the last two axes so the batch axis stays in front.
        scores = q @ k.transpose(1, 2)

        # The mask is cropped to the actual sequence length, which may be
        # shorter than the supported context length.
        blocked = self.mask.bool()[:num_tokens, :num_tokens]
        scores = scores.masked_fill(blocked, -torch.inf)

        weights = torch.softmax(scores / k.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        return weights @ v
    

# Build a batch of two identical sequences: shape (2, 6, 3).
batch = torch.stack((inputs, inputs), dim=0)
torch.manual_seed(123)
context_length = batch.shape[1]
# d_in and d_out are the values assigned in the earlier self-attention cell.
ca = CausalAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
print("context_vecs.shape:", context_vecs.shape)

context_vecs.shape: torch.Size([2, 6, 2])


In [31]:
class MultiHeadAttentionWrapper(nn.Module):
    """Multi-head attention as a stack of independent ``CausalAttention`` heads.

    Every head has its own projections of size ``d_out``; the head outputs
    are concatenated, so the last output dimension is ``num_heads * d_out``.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        heads = []
        for _ in range(num_heads):
            heads.append(
                CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
            )
        self.heads = nn.ModuleList(heads)

    def forward(self, x):
        """Run every head on ``x`` and concatenate along the last axis."""
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)
    
    
torch.manual_seed(123)
context_length = batch.shape[1] # This is the number of tokens # in the context, which is 6 in this case
d_in, d_out = 3, 2
# Two heads with d_out=2 each are concatenated, giving a last dimension of 4.
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]],

        [[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]]], grad_fn=<CatBackward0>)
context_vecs.shape: torch.Size([2, 6, 4])


In [36]:
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head attention with shared projection matrices.

    One query, key and value projection of width ``d_out`` is computed and
    then split into ``n_heads`` heads of width ``d_out // n_heads``. The
    head outputs are merged again and passed through an output projection.

    Args:
        d_in: size of each input vector.
        d_out: total output size (sum over all heads).
        n_heads: number of attention heads; must divide ``d_out``.
        context_length: largest sequence length covered by the causal mask.
        dropout_rate: dropout probability applied to the attention weights.
        qkv_bias: whether the query/key/value projections use a bias term.
    """

    def __init__(self,d_in, d_out, n_heads , context_length , dropout_rate, qkv_bias = False):
        super().__init__()

        assert (d_out % n_heads ==0),"d_out must be divisible by num_heads"

        self.d_out = d_out
        self.n_heads = n_heads
        self.context_length = context_length
        self.head_dim = d_out // n_heads
        self.qkv_bias = qkv_bias
        # 1 / sqrt(head_dim), multiplied onto the scores before the softmax.
        self.scale = self.head_dim ** -0.5

        self.dropout = torch.nn.Dropout(dropout_rate)
        # The Linear layers are created in the order query, key, value,
        # out_proj so that seeded initialisation gives the same weights.
        self.W_query = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = torch.nn.Linear(d_out, d_out)

        # Buffer, so the mask follows the module to the correct device.
        future_positions = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", future_positions)

    def forward(self,x):
        """Return context vectors of shape ``(batch, num_tokens, d_out)``.

        ``x`` must be 3-dimensional: ``(batch, num_tokens, d_in)``.
        """
        batch_size, num_tokens, _ = x.shape

        def split_heads(projected):
            # (b, tokens, d_out) -> (b, tokens, heads, head_dim)
            #                    -> (b, heads, tokens, head_dim)
            unrolled = projected.view(
                batch_size, num_tokens, self.n_heads, self.head_dim
            )
            return unrolled.transpose(1, 2)

        queries = split_heads(self.W_query(x))
        keys = split_heads(self.W_key(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products: (b, heads, tokens, tokens)
        scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the current sequence length and hide the future.
        blocked = self.mask.bool()[:num_tokens, :num_tokens]
        scores = scores.masked_fill(blocked, -torch.inf)

        weights = self.dropout(torch.softmax(scores * self.scale, dim=-1))

        # (b, heads, tokens, head_dim) -> (b, tokens, heads, head_dim)
        per_head = (weights @ values).transpose(1, 2)
        # Merge the heads back into one vector: (b, tokens, d_out)
        merged = per_head.contiguous().view(batch_size, num_tokens, self.d_out)

        return self.out_proj(merged)
    
    
    
torch.manual_seed(123)
# `batch` is the (2, 6, 3) tensor built in the causal-attention cell.
batch_size, context_length, d_in = batch.shape
d_out = 2
# d_out=2 split across 2 heads gives head_dim=1; the output keeps d_out=2 columns.
mha = MultiHeadAttention(d_in, d_out,n_heads=2,context_length=context_length, dropout_rate=0.0,qkv_bias=False)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]],

        [[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])


## WHY SQRT(HEAD_DIM)

In [37]:
'''
Reason 1: For stability in learning
The softmax function is sensitive to the magnitudes of its inputs. When the inputs are large, the differences between the exponential values of each input become much more pronounced. This causes the softmax output to become "peaky," where the highest value receives almost all the probability mass, and the rest receive very little.
In attention mechanisms, particularly in transformers, if the dot products between query and key vectors become too large (like multiplying by 8 in this example), the attention scores can become very large. This results in a very sharp softmax distribution, making the model overly confident in one particular "key." Such sharp distributions can make learning unstable.

Reason 2: To make the variance of the dot product stable
The dot product of  Q and K increases the variance because multiplying two random numbers increases the variance.
The increase in variance grows with the dimension. 
Dividing by sqrt (dimension) keeps the variance close to 1
'''


import torch

# Define the tensor
tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# Apply softmax without scaling
softmax_result = torch.softmax(tensor, dim=-1)
print("Softmax without scaling:", softmax_result)

# Multiply the tensor by 8 and then apply softmax: larger magnitudes make the
# distribution much more peaked around the largest entry.
scaled_tensor = tensor * 8
softmax_scaled_result = torch.softmax(scaled_tensor, dim=-1)
print("Softmax after scaling (tensor * 8):", softmax_scaled_result)


import numpy as np

# Function to compute variance before and after scaling
def compute_variance(dim, num_trials=1000):
    """Estimate the variance of random dot products, raw and scaled.

    For each trial two standard-normal vectors of length ``dim`` are drawn
    from NumPy's global random state and their dot product is recorded.

    Args:
        dim: length of the random vectors.
        num_trials: number of dot products to sample.

    Returns:
        Tuple ``(variance_before_scaling, variance_after_scaling)``, where
        the second value is the variance of the dot products divided by
        ``sqrt(dim)``.
    """
    # The first randn call is evaluated before the second, i.e. the "query"
    # vector is drawn before the "key" vector in every trial.
    dots = [
        np.dot(np.random.randn(dim), np.random.randn(dim))
        for _ in range(num_trials)
    ]

    root_dim = np.sqrt(dim)
    scaled = [value / root_dim for value in dots]

    return np.var(dots), np.var(scaled)

# For dimension 5
# No NumPy seed is set, so the printed variances vary slightly between runs.
variance_before_5, variance_after_5 = compute_variance(5)
print(f"Variance before scaling (dim=5): {variance_before_5}")
print(f"Variance after scaling (dim=5): {variance_after_5}")

# For dimension 100
variance_before_100, variance_after_100 = compute_variance(100)
print(f"Variance before scaling (dim=100): {variance_before_100}")
print(f"Variance after scaling (dim=100): {variance_after_100}")



Softmax without scaling: tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
Softmax after scaling (tensor * 8): tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])
Variance before scaling (dim=5): 4.682539985279139
Variance after scaling (dim=5): 0.9365079970558279
Variance before scaling (dim=100): 105.07271679692255
Variance after scaling (dim=100): 1.0507271679692258


## WHY GELU

In [None]:
class ExampleDeepNeuralNetwork(nn.Module):
    """Five-layer network used to compare training with and without shortcuts.

    Each layer is a ``Linear`` followed by ``GELU``. ``GELU`` is not defined
    in this cell and has to be available in the notebook namespace.

    Args:
        layer_sizes: sequence of at least six sizes; layer ``i`` maps
            ``layer_sizes[i]`` features to ``layer_sizes[i + 1]`` features.
        use_shortcut: add a layer's input to its output when shapes match.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        blocks = [
            nn.Sequential(nn.Linear(layer_sizes[i], layer_sizes[i + 1]), GELU())
            for i in range(5)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Apply the layers in order, with optional shortcut connections."""
        for layer in self.layers:
            out = layer(x)
            # A shortcut is only possible when input and output shapes agree.
            residual_ok = self.use_shortcut and x.shape == out.shape
            x = x + out if residual_ok else out
        return x
    
    
# Four 3 -> 3 layers followed by a final 3 -> 1 layer.
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123) # specify random seed for the initial weights for reproducibility
# NOTE(review): GELU is not defined in the visible cells; it must exist in the
# kernel namespace before this cell is run.
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)


## COUNTING NUMBER OF PARAMETERS IN GPT MODEL

In [41]:
                   
class LayerNorm(torch.nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Args:
        emb_dim: size of the last (normalised) dimension.
    """

    def __init__(self,emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        """Normalise ``x`` along its last axis, then apply scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by n rather than n - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)

        x_hat = (x - mu) / torch.sqrt(sigma_sq + self.eps)

        return self.scale * x_hat + self.shift
    
       
class FeedForward(torch.nn.Module):
    """Position-wise feed-forward block: expand to 4x width, GELU, project back.

    ``GELU`` is not defined in this cell and has to be available in the
    notebook namespace.

    Args:
        cfg: configuration dict; only ``cfg["emb_dim"]`` is read.
    """

    def __init__(self,cfg):
        super().__init__()
        width = cfg["emb_dim"]
        expand = torch.nn.Linear(width, 4 * width)
        activation = GELU()
        contract = torch.nn.Linear(4 * width, width)
        self.layers = torch.nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply the two linear layers with the activation in between."""
        return self.layers(x)
    
          
class TransformerBlock(torch.nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is preceded by layer normalisation, followed by dropout
    and wrapped in a shortcut (residual) connection.

    Args:
        cfg: configuration dict with the keys ``emb_dim``, ``n_heads``,
            ``context_length``, ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.attention = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            n_heads=cfg["n_heads"],
            context_length=cfg["context_length"],
            dropout_rate=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # A single dropout module is shared by both sub-layers.
        self.dropout = torch.nn.Dropout(cfg["drop_rate"])

    def forward(self,x):
        """Return the block output; the shape of ``x`` is preserved."""
        # Attention sub-layer with shortcut connection.
        attended = self.dropout(self.attention(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer with shortcut connection.
        transformed = self.dropout(self.ff(self.norm2(x)))
        return transformed + x
        

class GPTModel(torch.nn.Module):
    """GPT-style decoder: embeddings, transformer blocks, norm, output head.

    Args:
        cfg: configuration dict with the keys ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers`` (plus the keys
            required by ``TransformerBlock``).
    """

    def __init__(self,cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.token_embedding = torch.nn.Embedding(cfg["vocab_size"], emb_dim)
        # The attribute name (including its spelling) is kept unchanged
        # because it determines the parameter names of the module.
        self.postion_embedding = torch.nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_embedding = torch.nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.transformer_blocks = torch.nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = torch.nn.Linear(emb_dim, cfg["vocab_size"], bias=False)

    def forward(self,in_idx):
        """Return logits of shape ``(batch, num_tokens, vocab_size)``.

        ``in_idx`` must be a 2-D tensor of token ids ``(batch, num_tokens)``.
        """
        _, seq_len = in_idx.shape

        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        x = self.token_embedding(in_idx) + self.postion_embedding(positions)

        x = self.drop_embedding(x)
        hidden = self.final_norm(self.transformer_blocks(x))
        return self.out_head(hidden)
    


In [43]:
# Build a batch of two token-id sequences.
# NOTE(review): `tokenizer` is presumably the tiktoken GPT-2 encoding assigned
# in the tiktoken cell - confirm it has not been rebound in between.
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
# torch.stack requires both encoded sequences to have the same length.
batch = torch.stack(batch, dim=0)
print(batch)

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# One logit vector of size vocab_size per input token.
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4223, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


In [44]:
# Count every element of every parameter tensor of the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


In [45]:
# Compare the shapes of the token-embedding matrix and the output-layer weight matrix.
print("Token embedding layer shape:", model.token_embedding.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])


In [46]:
# Parameter count if the output head reused the token-embedding matrix
# (weight tying): the out_head parameters are subtracted from the total.
# GPTModel as defined in this notebook keeps two separate matrices.
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Number of trainable parameters considering weight tying: 124,412,160


In [47]:
# 4 bytes per parameter, which corresponds to 32-bit floating-point storage.
total_size_bytes = total_params * 4 
# Convert bytes to megabytes (1 MB = 1024 * 1024 bytes here).
total_size_mb = total_size_bytes / (1024 * 1024) 
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


## CALCULATING TEXT GENERATION LOSS, CROSS ENTROPY AND PERPLEXITY

In [48]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

# Targets are the inputs shifted one token to the right.
targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

# NOTE(review): model.eval() is not called in the visible cells, so dropout
# would still be active here unless it was disabled elsewhere - confirm.
with torch.no_grad():
    logits = model(inputs)

# Logits have shape (batch_size, num_tokens, vocab_size)
print("Logits shape:", logits.shape)

# Targets have shape (batch_size, num_tokens)
print("Targets shape:", targets.shape)

# cross_entropy expects (N, vocab_size) logits and (N,) class indices, so the
# batch and token dimensions are merged.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

perplexity = torch.exp(loss)    #perplexity is simply the exponential of the loss
print(perplexity)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])
Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])
tensor(10.8083)
tensor(49429.9805)


## WHY TEMPERATURE AND TOP K DECODING STRATEGIES ARE USED --TO CONTROL RANDOMNESS