In [2]:
# Rotary position embedding 
# instead of calculating the pe, rotate the key and query vectors 
#This is done by splitting the embedding dimension into pairs, treating each pair as a complex number, and multiplying it by a rotation factor e imθ
#The angle of rotation depends on the token's position m. 
# this is done to handle the relative positions in the  embeddings, so that the context length can be increased more than the max seq length 
import torch 
dim = 20
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
inv_freq

tensor([1.0000e+00, 3.9811e-01, 1.5849e-01, 6.3096e-02, 2.5119e-02, 1.0000e-02,
        3.9811e-03, 1.5849e-03, 6.3096e-04, 2.5119e-04])

In [53]:
# Parameter-Efficient Fine-Tuning (LoRA)
import torch 
import torch.nn as nn
in_features = 128 
out_features = 256 
original_linear = nn.Linear(in_features=128, out_features=256)
rank = 4 
alpha = 8 
scale  = alpha/rank
# we define two layers for A and B
A = nn.Linear(in_features, rank)
B = nn.Linear(rank, out_features)
# B has zero weights 
nn.init.zeros_(B.weight)
x = torch.randn(1,128)
output = original_linear(x) + B(A(x))*scale
# full finetuning layer weights 
finetuning_parameters = 0 
for x in original_linear.parameters():
    finetuning_parameters+= x.numel()
print("If fullfinetuing layer number of parameters: ", finetuning_parameters)
new_param =0 
for x in B.parameters():
    new_param+= x.numel()
for x in A.parameters():
    new_param+= x.numel()
print("For Lora layer number of parameters: ", new_param)


If fullfinetuing layer number of parameters:  33024
For Lora layer number of parameters:  1796


In [140]:
# Fast Attention 
# imporve the for loops in the Multihrad attention and imporve the for loops in the n_layers 
# A for loop in the forward pass is a performance bottleneck because it processes each head sequentially instead of in parallel on the GPU.
import torch 
import math 
def scaled_dot_product_attention(Q, K, V, mask ):
    # the shape of QKV is batch_size, num_heads, seq_len, dk 
    dot_product = torch.matmul(Q, K.transpose(-1, -2)) 
    # dot product shape is batch_size, num_heads, seq_len, seq_len
    dk = Q.shape[-1]
    scaled_dot_product = dot_product/ math.sqrt(dk)
    
    if mask is not None:
        scaled_dot_product = torch.masked_fill(scaled_dot_product,mask==0, value=float("-inf"))
    scaled_dot_product = torch.softmax(scaled_dot_product, dim=-1)
    attention_scores = torch.matmul(scaled_dot_product, V)
    return attention_scores, scaled_dot_product

class FastMultiHeadAttention(torch.nn.Module):
    def __init__(self, num_heads, embed_size):
        super().__init__()
        self.num_heads = num_heads 
        self.embed_size = embed_size 
        self.dk = embed_size// num_heads
        # for parallel ocmputaion we create a. single layer of qkv matrices instead of 3 
        self.qkv = nn.Linear(embed_size, 3*embed_size)# we usually dont have a bias here 
        self.output_layer = nn.Linear(embed_size, embed_size)
    def forward(self,x, mask):
        # x shape is batch_size, seq_len, embed_size
        qkv = self.qkv(x) 
        batch_size = x.shape[0]
        # so this computation is getting parallely executed on gpu 
        # earlier we used to do q = nn.linear(x) v = nn.linear(x) whihc brings the series computations
        # qkv will be  batch_size, seq_len, 3*embed_size 
        Q, K , V = torch.chunk(qkv, chunks=3, dim = -1)
        # now we can proceed wiht normal scaled dot product attention 
        # make parallel across heads 
        Q = Q.view(batch_size, -1,self.num_heads , self.dk ).transpose(1,2)
        V = V.view(batch_size, -1,self.num_heads , self.dk ).transpose(1,2)
        K = K.view(batch_size, -1,self.num_heads , self.dk ).transpose(1,2)

        attention_output,_ = scaled_dot_product_attention(Q, K, V, mask)
        # shape is batch_size, num_heads, seq_len, dk 
        attention_output = attention_output.transpose(1,2)
        attention_output = attention_output.contiguous().view(batch_size,-1, self.embed_size )
        # we need conitguous because the transpose makes the memory non contigous??
        output = self.output_layer(attention_output)
        
        return output
batch_size = 4; seq_len = 10 ; embedding_dim = 20
model = FastMultiHeadAttention(num_heads= 4, embed_size=embedding_dim)
input_tensor = torch.rand(batch_size, seq_len, embedding_dim)
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)
print(mask)
print(f"Mask shape: {mask.shape}")
output = model(input_tensor, mask=mask)
print(output.shape)
#decoder = DecoderOnlyBlock(embedding_dim, num_heads, 4*embedding_dim, 0.9)
# batch_size = 4; seq_len = 10 ; embedding_dim = 20; num_heads = 4 


tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
Mask shape: torch.Size([1, 1, 10, 10])
torch.Size([4, 10, 20])


In [141]:
# we can make the decoder only attention block usign this attention module 
class DecoderOnlyBlock(nn.Module):

    def __init__(self, embedding_dim, num_heads, dff, dropout):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dff = dff 
        self.dropout = dropout 
        self.ffn = nn.Sequential(nn.Linear(embedding_dim, dff), nn.ReLU(), nn.Linear(dff,embedding_dim))
        self.attention = FastMultiHeadAttention(num_heads= 4, embed_size=embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.layernorm1 = nn.LayerNorm(embedding_dim)
        self.layernorm2 = nn.LayerNorm(embedding_dim)
    def forward(self , x, mask ):
        # assuming x is hte embedding here, batch_size, seq_len, embed_dim 
        # mask is 1, 1,, seq_len, seq_len
        residual = x 
        attention_output = self.attention(x, mask)
        
        x = residual+ self.layernorm1(attention_output)
        residual = x 
        FFN_output = self.ffn(x)
        FFN_output = self.layernorm2(FFN_output)
        x = residual + self.dropout(FFN_output)
        return x 
batch_size = 4; seq_len = 10 ; embedding_dim = 20
input_tensor = torch.rand(batch_size, seq_len, embedding_dim)
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)
print(mask)
num_heads = 4 
print(f"Mask shape: {mask.shape}")
decoder = DecoderOnlyBlock(embedding_dim,num_heads , 4*embedding_dim, 0.9)
batch_size = 4; seq_len = 10 ; embedding_dim = 20; num_heads = 4 
attention_block_output = decoder(input_tensor, mask )
print(attention_block_output.shape)


tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
Mask shape: torch.Size([1, 1, 10, 10])
torch.Size([4, 10, 20])


In [166]:
# make the full gpt module now 
class GPT(nn.Module): 
    def __init__(self, embed_size, max_seq_len, vocab_size, num_heads, num_layers ):
        super().__init__()
        self.embed_size = embed_size
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_layers = num_layers 
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.Embedding needs a int 64 inputs Because nn.Embedding is a lookup table, not a linear layer.
        self.positional_embedding = nn.Embedding(self.max_seq_len, self.embed_size) #lets make this learnable as well
        self.attention_blocks = nn.ModuleList([DecoderOnlyBlock(embedding_dim=embed_size, num_heads=num_heads, 
                                                                dff = 4*embed_size, dropout=0.9) 
                                                                for _ in range(self.num_layers)])
        self.lm_head  = nn.Linear(self.embed_size, self.vocab_size)
        self.layernorm = nn.LayerNorm(self.embed_size)
        
    def forward(self, x):
        # expect x to eb the pretraining data wiht batch size, seq_len 
        batch_size, seq_len = x.shape
        # call the embedding and ocmibine with positional embedding 
        token_embedding = self.embedding(x) # batch_size, seq_len, embed_size
    
        pos_indices = torch.arange(seq_len)
        positional_embedding = self.positional_embedding(pos_indices)
        x = token_embedding+ positional_embedding
        
        # shape batch_size, seq_len,embed_size
        # now we can pass this through the attention modules with the mask 
        mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)
         
        # 1, 1, max_seq, max_seq
        # make the causal maask 
        for layer in self.attention_blocks:
            x = layer(x, mask)
            # x same as output batch_size, max_seq, embed_size
        output = self.layernorm(x)
        # otuput shape batch, seq
        y = self.lm_head(output)
        return y # here y is the last token prob
    # lets add a generate call for this 
    # this is the decoding step which calls the froward and samples the token from this recursively 
    # this is the inference code 
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature = 1.0):
        self.eval()
        #idx shape is previous tokens processsed so (batch_size x seq_len )
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.max_seq_len:]
            logits = self.forward(idx_cond) # batch_size, seq_len, vocab_size
            
            # get logits from the last time step last seq token basically 
            last_token_logits = logits[:, -1, :] # b , vocab size 
            
            last_token_logits = last_token_logits / temperature
            last_token_probs = torch.softmax(last_token_logits, dim= -1) # the softmax is over all the vocab 
            # sampple the next token torch.multinomial samples an index based on the weights (probabilities)
            last_token = torch.multinomial(last_token_probs, num_samples=1) # B x1 
            # print("logits", logits.shape)
            # print(last_token.shape)
            idx = torch.cat((idx, last_token), dim = 1)
            # B, S+1
        return idx
vocab_size = 1000
embedding_dim = 20
max_seq_len = 100
num_heads = 4
num_layers = 6 
batch_size = 4; seq_len = 10 ; embedding_dim = 20
model = GPT(embed_size= embedding_dim, max_seq_len=max_seq_len, vocab_size=vocab_size,num_heads=num_heads,num_layers = num_layers)
embed_size = embedding_dim
input_tensor = torch.randint(0, vocab_size, (batch_size,10)) # (batch, seq_len)
# # pass the input as ranint rather than rand 
output = model.generate(input_tensor, 20 )
print(input_tensor.shape)
print(output.shape)

torch.Size([4, 10])
torch.Size([4, 30])


In [None]:
# add the loss to calculate cross entropy in this case 
class GPT(nn.Module): 
    def __init__(self, embed_size, max_seq_len, vocab_size, num_heads, num_layers ):
        super().__init__()
        self.embed_size = embed_size
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_layers = num_layers 
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.Embedding needs a int 64 inputs Because nn.Embedding is a lookup table, not a linear layer.
        self.positional_embedding = nn.Embedding(self.max_seq_len, self.embed_size) #lets make this learnable as well
        self.attention_blocks = nn.ModuleList([DecoderOnlyBlock(embedding_dim=embed_size, num_heads=num_heads, 
                                                                dff = 4*embed_size, dropout=0.9) 
                                                                for _ in range(self.num_layers)])
        self.lm_head  = nn.Linear(self.embed_size, self.vocab_size)
        self.layernorm = nn.LayerNorm(self.embed_size)
        
    def forward(self, x, targets=None):
        ## FORWARD SHOULD RETURN THE LOSS HERE SO THAT WE CAN DO BACKPROP 
        # expect x to eb the pretraining data wiht batch size, seq_len 
        batch_size, seq_len = x.shape
        # call the embedding and ocmibine with positional embedding 
        token_embedding = self.embedding(x) # batch_size, seq_len, embed_size
    
        pos_indices = torch.arange(seq_len)
        positional_embedding = self.positional_embedding(pos_indices)
        x = token_embedding+ positional_embedding
        
        # shape batch_size, seq_len,embed_size
        # now we can pass this through the attention modules with the mask 
        mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)
         
        # 1, 1, max_seq, max_seq
        # make the causal maask 
        for layer in self.attention_blocks:
            x = layer(x, mask)
            # x same as output batch_size, max_seq, embed_size
        output = self.layernorm(x)
        # otuput shape batch, seq
        logits = self.lm_head(output)

        if targets is None: 
            ce_loss = None
        else:
            # lets say the targets are next tokens ie the tokens shifted by 1 
            # batch, seq_len 
            # and we get the logits as batch, seq_len, vocab 
            targets = targets.view(-1)
            predictions = logits.view(-1,vocab_size) 
            loss = nn.CrossEntropyLoss()
            ce_loss = loss(predictions, targets)
            # this goes in the backward loss.backward()

        return logits,ce_loss# here y is the last token prob
    # lets add a generate call for this 
    # this is the decoding step which calls the froward and samples the token from this recursively 
    # this is the inference code 
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature = 1.0):
        self.eval()
        #idx shape is previous tokens processsed so (batch_size x seq_len )
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.max_seq_len:]
            logits ,_ = self.forward(idx_cond) # batch_size, seq_len, vocab_size
            
            # get logits from the last time step last seq token basically 
            last_token_logits = logits[:, -1, :] # b , vocab size 
            
            last_token_logits = last_token_logits / temperature
            last_token_probs = torch.softmax(last_token_logits, dim= -1) # the softmax is over all the vocab 
            # sampple the next token torch.multinomial samples an index based on the weights (probabilities)
            last_token = torch.multinomial(last_token_probs, num_samples=1) # B x1 
            # print("logits", logits.shape)
            # print(last_token.shape)
            idx = torch.cat((idx, last_token), dim = 1)
            # B, S+1
        return idx
vocab_size = 1000
embed_size = 64
max_seq_len = 128
num_heads = 4
num_layers = 4
batch_size = 4; seq_len = 10 ; embedding_dim = 20
model = GPT(embed_size, max_seq_len, vocab_size,num_heads,num_layers)

input_tensor = torch.randint(0, vocab_size, (batch_size,seq_len)) # (batch, seq_len)
targets = torch.randint(0, vocab_size, (batch_size, seq_len)) # The "correct" next tokens
# # pass the input as ranint rather than rand 
output = model.forward(input_tensor, targets )
print(input_tensor.shape)

print(output[1])
generated_seq = model.generate(input_tensor, 3)
print(generated_seq.shape)  #batch size x seq len 

torch.Size([4, 10])
tensor(6.9699, grad_fn=<NllLossBackward0>)
torch.Size([4, 13])


In [225]:
optimizer = torch.optim.Adam(model.parameters(), lr=.0001)
import torch
for _ in range(20): # these are 10 steps
    
    target_layer = model.attention_blocks[0].ffn[0].weight
    weights_before = target_layer.clone().detach()

    # --- Your Standard Training Step ---
    optimizer.zero_grad()
    logits, loss = model(input_tensor, targets=targets)
    loss.backward()
    optimizer.step()
    print(loss)

    # weights_after = target_layer

    # # 4. Compare a slice of the weights
    # print("--- Weight Inspection ---")
    # print(f"Loss: {loss.item():.4f}\n")
    # print(f"Slice of weights BEFORE update:\n{weights_before[0, :5]}\n")
    # print(f"Slice of weights AFTER update:\n{weights_after[0, :5]}\n")

    # # 5. Verify that a change occurred
    # difference = torch.sum(torch.abs(weights_before - weights_after))
    # print(f"Sum of absolute difference: {difference.item()}")
    # if difference > 0:
    #     print("\n✅ The weights have been successfully updated!")
    # else:
    #     print("\n❌ The weights did NOT change.")

tensor(30.8835, grad_fn=<NllLossBackward0>)
tensor(30.8662, grad_fn=<NllLossBackward0>)
tensor(30.8490, grad_fn=<NllLossBackward0>)
tensor(30.8318, grad_fn=<NllLossBackward0>)
tensor(30.8146, grad_fn=<NllLossBackward0>)
tensor(30.7975, grad_fn=<NllLossBackward0>)
tensor(30.7803, grad_fn=<NllLossBackward0>)
tensor(30.7632, grad_fn=<NllLossBackward0>)
tensor(30.7462, grad_fn=<NllLossBackward0>)
tensor(30.7291, grad_fn=<NllLossBackward0>)
tensor(30.7121, grad_fn=<NllLossBackward0>)
tensor(30.6951, grad_fn=<NllLossBackward0>)
tensor(30.6781, grad_fn=<NllLossBackward0>)
tensor(30.6612, grad_fn=<NllLossBackward0>)
tensor(30.6443, grad_fn=<NllLossBackward0>)
tensor(30.6274, grad_fn=<NllLossBackward0>)
tensor(30.6105, grad_fn=<NllLossBackward0>)
tensor(30.5937, grad_fn=<NllLossBackward0>)
tensor(30.5769, grad_fn=<NllLossBackward0>)
tensor(30.5601, grad_fn=<NllLossBackward0>)


In [None]:
# Character tokenizer 
import torch 
# This is not a torch.nn.Module because it doesn't have any learnable parameters.
class CharacterTokenizer: 
    def __init__(self, corpus):
        # find all the unique characters in the corpus 
        # map them to a dictionary 
        # reverse map it to the tokens 
        self.vocab = sorted(list(set(corpus.strip())))
        self.vocab_size = len(self.vocab)
        self.char_to_index = {char:i for i,char in enumerate(self.vocab)}
        self.index_to_char  = {i:char for i,char in enumerate(self.vocab)}
    def encode(self, text):
        indices = [self.char_to_index[char] for char in text]
        return torch.tensor(indices)
    def decode(self, indices):
        text = [self.index_to_char[idx.item()] for idx in indices]
        return "".join(text)


corpus = "piebfrhfbrfhchellols374t3842/,',v" 
tokenizer = CharacterTokenizer(corpus)
input_ids = tokenizer.encode("ello")
tokenizer.decode(input_ids)
# It's good practice to specify dtype=torch.long when creating index tensors. 
# PyTorch's nn.Embedding layer, which you'd use next, expects its input to be of type LongTensor.


'ello'