In [31]:
import torch 
import torch.nn.functional as F
import math
def scaled_dot_product_attention(query, key, value, mask=None):
    """
    Calculates the scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query: Query tensor; shape (batch_size, num_heads, seq_len_q, d_k)
        key: Key tensor; shape (batch_size, num_heads, seq_len_k, d_k)
        value: Value tensor; shape (batch_size, num_heads, seq_len_v, d_v)
               Note: seq_len_k and seq_len_v must be the same.
        mask: Optional boolean mask tensor; shape must be broadcastable to
              (batch_size, num_heads, seq_len_q, seq_len_k).
              True marks positions that must NOT be attended to (this is the
              convention of `torch.triu(..., diagonal=1).bool()` for a causal mask).

    Returns:
        A tuple containing:
        - output: The attention-weighted value tensor;
                  shape (batch_size, num_heads, seq_len_q, d_v)
        - attention_weights: The attention weights (each row sums to 1);
                             shape (batch_size, num_heads, seq_len_q, seq_len_k)

    Raises:
        AssertionError: if `value` does not have as many positions as `key`.
    """
    d_k = key.shape[-1]
    # Raw similarity scores; shape (batch_size, num_heads, seq_len_q, seq_len_k).
    # Dividing by sqrt(d_k) keeps the score variance near 1 so softmax does not saturate.
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)

    if mask is not None:
        # Fill with the most negative finite value of the score dtype. A hard-coded
        # -1e9 does not fit in float16 and makes masked_fill raise an overflow error.
        scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)
    assert value.shape[-2] == scores.shape[-1], "key and value must have the same sequence length"

    # Softmax over the last dimension (seq_len_k) gives, for each query, a
    # probability distribution over the keys.
    attention_weights = torch.softmax(scores, dim=-1)

    # Weighted sum of the values; shape (batch_size, num_heads, seq_len_q, d_v).
    output = torch.matmul(attention_weights, value)
    return output, attention_weights
    
#Complexity: What is the time and space complexity of this function with respect to the sequence length, N (assuming seq_len_q = seq_len_k = N)?
#The Scaling Factor: Why do we divide by sqrt(d_k)? What would happen if we didn't?

#Masking: In the context of a vanilla Transformer decoder (like in GPT), what is the specific name for the mask you would use here, and what is its purpose?
# Time complexity is O(N^2 * d): both matmuls (Q K^T and weights @ V) cost O(N^2 * d). Space complexity is O(N^2) for the score / attention-weight matrix.
# We scale by sqrt(d_k) to get stable training: for unit-variance q and k the dot product has variance d_k, so dividing by sqrt(d_k) brings the variance back to 1.
# A causal mask (also called a look-ahead mask) is used in the decoder, so that attention is only applied to the current and previous tokens when processing the current token.
# This preserves the causal (autoregressive) nature of the model: a prediction can never depend on future tokens.


# Detailed Breakdown:
# The key is to understand what happens to the softmax function with large inputs.
# The Problem: Let's assume the components of the query and key vectors are independent random variables with a mean of 0 and a variance of 1. The dot product of two such vectors, q·k, will have a mean of 0 but a variance of d_k.
# The Consequence: If d_k is large (e.g., 512), the dot product scores can have a very large variance, meaning some values will be very large and others very small. When you feed these large-magnitude numbers into a softmax function, it pushes the probabilities to either 0 or 1.
# Vanishing Gradients: When the softmax output is saturated at 0 or 1, its gradient becomes extremely close to zero. This is the "vanishing gradient" problem. It means that very little signal flows back during backpropagation, and the model struggles to learn.
# The Solution: By dividing the scores by sqrt(d_k) (the standard deviation), we scale the variance back down to 1, keeping the inputs to the softmax in a more reasonable range. This leads to healthier gradients and more stable training.



In [32]:
# Multihead attention
#Instead of performing a single attention calculation, Multi-Head Attention runs the scaled dot-product attention
# mechanism multiple times in parallel. Each parallel run is called a "head." This allows the model to jointly attend 
# to information from different representation subspaces at different positions. It's like having a committee of 
# experts; each expert (head) focuses on a different aspect of the input, and their insights are combined for a final
# decision.

import torch
import torch.nn as nn
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on `scaled_dot_product_attention`.

    The inputs are projected with one `d_model x d_model` linear layer each,
    split into `num_heads` heads of size `d_k = d_model // num_heads`,
    attended per head, concatenated again and passed through an output projection.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: The dimensionality of the input and output.
            num_heads: The number of attention heads.

        Raises:
            AssertionError: if `d_model` is not divisible by `num_heads`.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.w_q = nn.Linear(self.d_model, self.d_model)
        self.w_k = nn.Linear(self.d_model, self.d_model)
        self.w_v = nn.Linear(self.d_model, self.d_model)
        # Output projection: mixes the concatenated heads back into d_model features.
        self.w_o = nn.Linear(self.d_model, self.d_model)

    def _split_heads(self, x, batch_size):
        """Reshape (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)."""
        # The feature axis must be split FIRST (seq_len stays where it is) and only
        # then swapped with the head axis; viewing directly as
        # (batch, num_heads, seq_len, d_k) would interleave tokens and heads.
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query: shape (batch_size, seq_len_q, d_model)
            key: shape (batch_size, seq_len_k, d_model)
            value: shape (batch_size, seq_len_k, d_model)
            mask: Optional mask forwarded unchanged to `scaled_dot_product_attention`;
                  it must be broadcastable to (batch_size, num_heads, seq_len_q, seq_len_k).

        Returns:
            A tuple (output, attention_weights):
            - output: shape (batch_size, seq_len_q, d_model)
            - attention_weights: second value returned by `scaled_dot_product_attention`
        """
        batch_size = query.size(0)

        Q = self._split_heads(self.w_q(query), batch_size)
        K = self._split_heads(self.w_k(key), batch_size)
        V = self._split_heads(self.w_v(value), batch_size)

        # Each head attends independently; the caller's mask is passed through.
        output, attention_weights = scaled_dot_product_attention(Q, K, V, mask=mask)

        # (batch, num_heads, seq_len_q, d_k) -> (batch, seq_len_q, d_model).
        # transpose() returns a non-contiguous tensor, so contiguous() is needed before view().
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.w_o(output)
        return output, attention_weights
'''
torch.view() requires the tensor to be contiguous in memory.
 A contiguous tensor is one where all its elements are stored sequentially in a single block of memory.
   It doesn't create a new copy of the data; it just changes how PyTorch "views" the existing memory block, 
   making it very fast and memory-efficient.

torch.reshape() is more flexible. If the tensor is already contiguous, it acts just like view(). 
However, if the tensor is not contiguous (e.g., after a transpose() operation), reshape() will implicitly
 create a copy of the tensor with a contiguous layout before reshaping it.
'''
'''
Follow ups:
Question: Why do we use one big nn.Linear(d_model, d_model) and then reshape, instead of creating num_heads smaller nn.Linear(d_k, d_k) layers and running them in a loop?
one large matrix multiplication is faster and efficient than num_heads small matrix multiplications
'''


'\nFollow ups:\nQuestion: Why do we use one big nn.Linear(d_model, d_model) and then reshape, instead of creating num_heads smaller nn.Linear(d_k, d_k) layers and running them in a loop?\none large matrix multiplication is faster and efficient than num_heads small matrix multiplications\n'

In [33]:
# building the whole transformer block 
import torch
import torch.nn as nn
import torch.nn.functional as F
# Assume the MultiHeadAttention class we built is available
class TransformerEncoderBlock(nn.Module):
    """One pre-norm Transformer encoder block: self-attention followed by a feed-forward network.

    Each sub-layer is applied as `x = x + dropout(sublayer(LayerNorm(x)))`.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Args:
            d_model: The dimensionality of the input and output (must be the same).
            num_heads: The number of attention heads.
            d_ff: The dimensionality of the inner layer of the FFN.
            dropout: The dropout rate.
        """
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # Position-wise feed-forward network; the hidden width is the documented
        # `d_ff` argument rather than a hard-coded 4 * d_model.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        # nn.Dropout's keyword is `p`; `dropout=` raises a TypeError.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input embeddings; shape (batch_size, seq_len, d_model).
            mask: Optional attention mask, forwarded unchanged to the attention layer.

        Returns:
            Tensor with the same shape as `x`.
        """
        # Sub-layer 1: norm -> self-attention -> dropout -> add residual.
        x_norm = self.norm1(x)
        attention_output, _ = self.attention(x_norm, x_norm, x_norm, mask)
        x = x + self.dropout(attention_output)

        # Sub-layer 2: norm -> feed-forward -> dropout -> add residual.
        ffn_output = self.ffn(self.norm2(x))
        x = x + self.dropout(ffn_output)
        return x


# Which is more stable: Pre-LN or Post-LN?
# The key problem Pre-LN solves is the unstable (badly scaled or exploding) gradient issue in very deep Transformers.
# In the Post-LN architecture, LayerNorm is applied after the residual addition, x = LayerNorm(x + sublayer(x)), so every block places a normalization on the main residual path. As you stack many blocks, the gradients flowing backward have to pass through all of these LayerNorms and can become very large near the output layers, leading to unstable training.

# In the Pre-LN architecture (used in the block above), the normalization layer is moved inside the residual branch: x = x + sublayer(LayerNorm(x)).
# This means the main residual path is a pure identity, so gradients can flow straight back through the additions, while every sub-layer still receives normalized inputs.
# This keeps the gradient magnitudes well-behaved throughout the entire network, allowing for much more stable training,
# especially for models with dozens or hundreds of layers. It often removes the need for learning rate warm-up schedules. 📈


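# What is the role of the FFN?
# It adds position-wise non-linearity to the attention block: attention only mixes information across tokens (a weighted average of value vectors), while the FFN transforms each token's representation independently.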


In [34]:
# Positional encoding
# shape: max_seq_len x d_model
# Uses self.register_buffer('pe', pe) to tell torch not to include this in the trainable parameters; these are fixed values defined for each index in the sequence length.
# Let's build the original (sinusoidal) positional encoding matrix now.
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    For position `pos` and feature pair index `i`:
        pe[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        pe[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    """

    def __init__(self, d_model, max_seq_len= 100000):
        """
        Args:
            d_model: The dimensionality of the embeddings (odd values are supported).
            max_seq_len: The longest sequence length that can be encoded.
        """
        super().__init__()
        # Column vector of positions: (max_seq_len, 1).
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        # One frequency per (sin, cos) pair: (ceil(d_model / 2),).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_len, ceil(d_model / 2))

        # The table is (max_seq_len, d_model) and starts from zeros; every entry is
        # overwritten below, so random initialisation would only hide bugs.
        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Store with a leading batch axis once, here, so forward() never has to
        # modify the buffer. A buffer moves with .to()/.cuda() but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_seq_len, d_model)

    def forward(self, x):
        """
        Args:
            x: Embeddings; shape (batch_size, seq_len, d_model).

        Returns:
            `x` with the positional encoding of the first `seq_len` positions added;
            same shape as `x`.

        Raises:
            ValueError: if `seq_len` exceeds the `max_seq_len` given at construction.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.pe.size(1)}"
            )
        # (1, seq_len, d_model) broadcasts over the batch dimension.
        return x + self.pe[:, :seq_len]





In [35]:
import torch
import torch.nn as nn

# Assume TransformerEncoderBlock and PositionalEncoding are defined from previous exercises

class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of `TransformerEncoderBlock`s."""

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout=0.1, max_seq_len=5000):
        """
        Args:
            vocab_size: The size of the vocabulary.
            d_model: The dimensionality of the embeddings.
            num_heads: The number of attention heads.
            d_ff: The dimensionality of the inner layer of the FFN.
            num_layers: The number of TransformerEncoderBlocks to stack.
            dropout: The dropout rate.
            max_seq_len: The maximum possible sequence length.
        """
        super().__init__()
        self.d_model = d_model

        # 1. An embedding layer for the input tokens.
        self.embedding = nn.Embedding(vocab_size, d_model)

        # 2. A positional encoding layer.
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len)

        # 3. A stack of N encoder blocks. nn.ModuleList registers every block's parameters.
        self.encoder_layers = nn.ModuleList(
            [TransformerEncoderBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        )

        # 4. A dropout layer for the embeddings.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src, src_mask=None):
        """
        Args:
            src: The input token IDs; shape (batch_size, seq_len)
            src_mask: The attention mask for the source sequence.

        Returns:
            The output of the final encoder block; shape (batch_size, seq_len, d_model)
        """
        # 1-2. Embed the tokens and scale by sqrt(d_model). d_model is a Python int,
        #      so math.sqrt is required; torch.sqrt only accepts tensors.
        src_emb = self.embedding(src) * math.sqrt(self.d_model)
        # 3. Add the positional encodings.
        src_emb = self.pos_encoder(src_emb)
        # 4. Apply dropout.
        output = self.dropout(src_emb)
        # 5. Pass the result through the stack of encoder layers.
        for layer in self.encoder_layers:
            output = layer(output, mask=src_mask)
        return output
# Embedding scaling 
# The scaling is done to adjust the relative importance of the token embedding compared to the positional encoding.
# Token Embedding: The nn.Embedding layer is initialized by default with weights from a standard normal distribution (mean 0, variance 1). (In the original Transformer the embedding matrix is shared with the output projection and is typically initialized with much smaller values.)
# Positional Encoding: The values from our sine/cosine functions are always between -1 and 1, and their variance is around 0.5.
# Without scaling, when we add these two together, a small-magnitude token embedding could be overshadowed by the positional information.
# By scaling the embedding by sqrt(d_model), we increase its magnitude. This makes the positional encoding a smaller, more subtle signal that is added to the much stronger token signal.



In [36]:
# decoder only block 
class DecoderOnlyBlock(nn.Module):
    """One pre-norm decoder-only (GPT-style) block: masked self-attention followed by a feed-forward network.

    Each sub-layer is applied as `x = x + dropout(sublayer(LayerNorm(x)))`.
    The block does not build a mask itself; the caller supplies it, e.g. a causal
    mask such as `torch.triu(torch.ones(T, T), diagonal=1).bool()`.
    """

    def __init__(self, d_model, num_heads, dropout):
        """
        Args:
            d_model: The dimensionality of the input and output.
            num_heads: The number of attention heads.
            dropout: The dropout rate.
        """
        super().__init__()
        self.d_model = d_model
        # Integer division: a per-head size is used as a tensor dimension and must be an int.
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model),
        )
        # nn.Dropout's keyword is `p`; `dropout=` raises a TypeError.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input embeddings; shape (batch_size, seq_len, d_model).
            mask: Attention mask forwarded unchanged to the attention layer.
                  NOTE(review): with `mask=None` nothing is masked, so a causal
                  mask must be passed for autoregressive use.

        Returns:
            Tensor with the same shape as `x`.
        """
        # Sub-layer 1: norm -> masked self-attention -> dropout -> add residual.
        x_norm = self.norm1(x)
        x_attention, _ = self.attention(x_norm, x_norm, x_norm, mask)
        x = x + self.dropout(x_attention)

        # Sub-layer 2: norm -> feed-forward -> dropout -> add residual.
        ffn_output = self.ffn(self.norm2(x))
        x = x + self.dropout(ffn_output)
        return x


In [37]:
# Causal (look-ahead) mask for a length-10 sequence: entry [i, j] is True when
# j > i, i.e. when position j lies in the future of position i.
mask = torch.ones(10, 10).triu(diagonal=1).to(torch.bool)
mask

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False,  True,  True,  True,  True],
        [False, False, False, False, False, False, False,  True,  True,  True],
        [False, False, False, False, False, False, False, False,  True,  True],
        [False, False, False, False, False, False, False, False, False,  True],
        [False, False, False, False, False, False, False, False, False, False]])

In [38]:
# gpt problem 
import torch
import torch.nn as nn
import math

# Assume DecoderOnlyBlock and PositionalEncoding are defined from previous exercises
class GPT(nn.Module):
    """Decoder-only language model: embeddings -> N `DecoderOnlyBlock`s -> LayerNorm -> vocabulary logits."""

    def __init__(self, vocab_size, d_model, num_heads, num_layers, dropout, max_seq_len):
        """
        Args:
            vocab_size: The size of the vocabulary.
            d_model: The dimensionality of the embeddings.
            num_heads: The number of attention heads.
            num_layers: The number of decoder blocks to stack.
            dropout: The dropout rate.
            max_seq_len: The maximum possible sequence length.
        """
        super().__init__()  # properly initialise the nn.Module base class
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        # Integer division so the per-head size is an int, not a float.
        self.dk = d_model // num_heads
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len)
        self.decoder_blocks = nn.ModuleList(
            [DecoderOnlyBlock(d_model, num_heads, dropout) for _ in range(num_layers)]
        )
        self.dropout = nn.Dropout(dropout)

        # Pre-norm blocks leave the residual stream un-normalised, so one final
        # LayerNorm is applied before projecting to the vocabulary.
        self.final_norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        """
        Args:
            x: Input token IDs; shape (batch_size, seq_len).
            mask: Attention mask passed to every decoder block (a causal mask for
                  autoregressive language modelling).

        Returns:
            Logits over the vocabulary; shape (batch_size, seq_len, vocab_size).
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, d_model)
        x = self.token_embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.dropout(x)
        # Pass through the stack of decoder blocks.
        for layer in self.decoder_blocks:
            x = layer(x, mask=mask)
        # Final LayerNorm, then project to vocabulary logits. The hidden state is
        # `x`; no variable called `output` exists before this point.
        output = self.final_norm(x)
        output = self.lm_head(output)
        return output

In [54]:
# LLM inference and optimisations
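# KV cache: optimises the token-generation pipeline. Without a cache, generating N tokens recomputes attention over the whole prefix at every step: O(n^2) per token, summed over N tokens = O(N^3).
# With a cache, each new token only attends to the cached keys/values: O(n) per token, so O(N^2) for all N tokens.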
class MultiHeadAttentionWithKVCache(nn.Module):
    """Multi-head attention that can reuse previously computed keys and values.

    On every call the new keys/values are appended (along the sequence axis) to
    the cache passed in, so during generation only the new token has to be
    projected instead of the whole prefix.
    """

    def __init__(self,  d_model, num_heads):
        """
        Args:
            d_model: The dimensionality of the input and output.
            num_heads: The number of attention heads.

        Raises:
            AssertionError: if `d_model` is not divisible by `num_heads`.
        """
        super().__init__()
        # Without this check view(batch, -1, num_heads, dk) can silently fold
        # leftover features into the sequence axis instead of failing.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Q, K, V and output projections.
        self.w_q = nn.Linear(d_model, d_model )
        self.w_k = nn.Linear(d_model, d_model )
        self.w_v = nn.Linear(d_model, d_model )
        self.w_o = nn.Linear(d_model, d_model )
        self.d_model = d_model
        self.num_heads = num_heads
        self.dk = self.d_model//num_heads

    def forward(self,query, key, value, mask=None, kv_cache=None):
        """
        Args:
            query: shape (batch_size, seq_len_q, d_model)
            key: shape (batch_size, seq_len_new, d_model); only the NEW positions.
            value: shape (batch_size, seq_len_new, d_model); only the NEW positions.
            mask: Optional mask forwarded unchanged to `scaled_dot_product_attention`.
            kv_cache: Optional tuple (key_cache, value_cache), each of shape
                      (batch_size, num_heads, seq_len_cached, dk), as returned by
                      a previous call.

        Returns:
            A tuple (output, updated_kv_cache):
            - output: shape (batch_size, seq_len_q, d_model)
            - updated_kv_cache: (K, V), each of shape
              (batch_size, num_heads, seq_len_cached + seq_len_new, dk)
        """
        batch_size= key.size(0)
        # Project and split into heads: (batch, seq, d_model) -> (batch, num_heads, seq, dk).
        Q = self.w_q(query).view(batch_size, -1, self.num_heads,self.dk ).transpose(1,2)
        key_new = self.w_k(key).view(batch_size, -1, self.num_heads,self.dk ).transpose(1,2)
        value_new = self.w_v(value).view(batch_size, -1, self.num_heads,self.dk ).transpose(1,2)
        # Shape trace kept for the walkthrough cell that follows this class.
        print("key_new", key_new.shape)
        if kv_cache is not None:
            # Append the new keys/values after the cached ones along the sequence axis (dim=2).
            key_cache, value_cache = kv_cache
            print(key_cache.shape)
            K = torch.cat([key_cache, key_new],dim=2)
            V = torch.cat([value_cache, value_new],dim=2)
            print(K.shape)
            print(V.shape)
        else:
            # First call (e.g. the prompt): the cache starts from the new projections.
            K,V = key_new, value_new
        updated_kv_cache = (K, V)
        context, _ = scaled_dot_product_attention(Q, K, V, mask)
        # Merge heads back: (batch, num_heads, seq_q, dk) -> (batch, seq_q, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.w_o(context)
        return output, updated_kv_cache



        


In [57]:
# Walkthrough of incremental decoding with the KV cache: one prompt pass
# followed by two single-token steps, printing how the cache grows.
batch_size, d_model, num_heads, seq_len_prompt = 1, 12, 4, 5
mha_kv = MultiHeadAttentionWithKVCache(d_model=d_model, num_heads=num_heads)

# Step 1: process the whole prompt; there is no cache yet.
prompt = torch.randn(batch_size, seq_len_prompt, d_model)
print(f"Input shape: {prompt.shape}")
output, kv_cache = mha_kv(query=prompt, key=prompt, value=prompt, kv_cache=None)
print(output.shape)
print(kv_cache[0].shape)
print(kv_cache[1].shape)

# Step 2: feed a single new token together with the cache from step 1.
print("--- Step 2: Generating the 1st New Token ---")
new_token_1 = torch.randn(batch_size, 1, d_model)
print(f"Input shape: {new_token_1.shape}")
output, kv_cache = mha_kv(
    query=new_token_1, key=new_token_1, value=new_token_1, kv_cache=kv_cache
)
print(kv_cache[0].shape)

# Step 3: another single token, feeding the cache updated in step 2.
new_token_2 = torch.randn(batch_size, 1, d_model)
print(f"Input shape: {new_token_2.shape}")
output, kv_cache = mha_kv(
    query=new_token_2, key=new_token_2, value=new_token_2, kv_cache=kv_cache
)
print(f"Output shape: {output.shape}")
print(f"Returned Key Cache shape: {kv_cache[0].shape}\n")


Input shape: torch.Size([1, 5, 12])
key_new torch.Size([1, 4, 5, 3])
torch.Size([1, 5, 12])
torch.Size([1, 4, 5, 3])
torch.Size([1, 4, 5, 3])
--- Step 2: Generating the 1st New Token ---
Input shape: torch.Size([1, 1, 12])
key_new torch.Size([1, 4, 1, 3])
torch.Size([1, 4, 5, 3])
torch.Size([1, 4, 6, 3])
torch.Size([1, 4, 6, 3])
torch.Size([1, 4, 6, 3])
Input shape: torch.Size([1, 1, 12])
key_new torch.Size([1, 4, 1, 3])
torch.Size([1, 4, 6, 3])
torch.Size([1, 4, 7, 3])
torch.Size([1, 4, 7, 3])
Output shape: torch.Size([1, 1, 12])
Returned Key Cache shape: torch.Size([1, 4, 7, 3])



In [None]:
# Temperature and Top-K Sampling
import torch
import torch.nn.functional as F

def sample_top_k_with_temperature(logits, temperature=1.0, k=50):
    """Sample one token id from the `k` most likely tokens after temperature scaling.

    Args:
        logits: Unnormalised scores for a single position; shape (1, V) or (V,).
        temperature: Softmax temperature. Values below 1 sharpen the distribution,
            values above 1 flatten it, and 0 means greedy decoding (argmax).
        k: Number of highest-scoring tokens to sample from; clamped to V.

    Returns:
        The sampled token id as a Python int.

    Raises:
        ValueError: if `logits` holds more than one position, `temperature` is
            negative, or `k` is smaller than 1.
    """
    vocab_size = logits.size(-1)
    if logits.numel() != vocab_size:
        raise ValueError("logits must hold a single position: shape (1, V) or (V,)")
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    if k < 1:
        raise ValueError("k must be at least 1")

    flat_logits = logits.reshape(-1)  # (V,)

    if temperature == 0:
        # Greedy decoding: dividing by zero is undefined, so take the argmax directly.
        return torch.argmax(flat_logits, dim=-1).item()

    # torch.topk requires k explicitly, and k may not exceed the vocabulary size.
    k = min(k, vocab_size)
    top_k_logits, top_k_indices = torch.topk(flat_logits / temperature, k=k, dim=-1)

    # Renormalise over the k survivors only, then draw one of them.
    top_k_probs = F.softmax(top_k_logits, dim=-1)
    sampled_index_in_top_k = torch.multinomial(top_k_probs, num_samples=1)
    # Map the position inside the top-k list back to the vocabulary id.
    final_token_id = top_k_indices[sampled_index_in_top_k.item()].item()
    return final_token_id



In [61]:
# torch.sort returns a (values, indices) named tuple, sorted ascending along
# the last dimension by default.
logits = torch.sort(torch.randn(1, 10))
logits

torch.return_types.sort(
values=tensor([[-1.4046, -1.1718, -1.1229, -0.8169, -0.7663, -0.7543, -0.6873, -0.4600,
          1.1832,  1.4680]]),
indices=tensor([[3, 8, 9, 2, 5, 6, 0, 7, 4, 1]]))

In [None]:
# Temperature and nucleus Sampling where k is not fixed so allows for more creativity 
import torch
import torch.nn.functional as F
logits = torch.randn(1, 10)
# Nucleus threshold: keep the smallest set of most-likely tokens whose cumulative
# probability reaches p. It was never defined before, so the comparison below
# raised a NameError.
p = 0.9

# 1. Sort the logits from most to least likely.
sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
print(sorted_indices)
print(sorted_logits)
# 2. Probabilities in that sorted order.
sorted_probs = torch.softmax(sorted_logits, dim=-1)
print(sorted_probs)
# 3. Running total of the sorted probabilities.
cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
# 4. Create a boolean mask for tokens to remove.
# We want to remove all tokens that appear *after* the cumulative
# probability has exceeded the threshold 'p'.
# We shift the cumulative probabilities right by one and compare to 'p'.
# This ensures that the token that pushes the sum *over* p is included,
# and that the most likely token is always kept.
shifted_cumulative_probs = F.pad(cumulative_probs[:, :-1], (1, 0), "constant", 0)
indices_to_remove = shifted_cumulative_probs > p

# 5. Set the logits of the tokens to be removed to negative infinity.
# This effectively removes them from the running for the next softmax.
# indices_to_remove is in sorted order; scatter maps it back to the original
# vocabulary positions through sorted_indices.
removal_mask = torch.zeros_like(indices_to_remove).scatter(1, sorted_indices, indices_to_remove)
# masked_fill returns a new tensor, so `logits` stays intact and the cell can be re-run.
filtered_logits = logits.masked_fill(removal_mask, -float('Inf'))

# 6. Renormalise over the surviving tokens.
final_probs = F.softmax(filtered_logits, dim=-1)

# 7. Sample one token from this final, filtered distribution.
sampled_token_id = torch.multinomial(final_probs, num_samples=1)

print(sampled_token_id)

tensor([[0, 9, 2, 7, 3, 5, 1, 8, 4, 6]])
tensor([[ 0.9664,  0.8910,  0.4614,  0.2874,  0.1262, -0.1792, -0.3994, -0.8468,
         -1.2610, -1.5057]])
tensor([[0.2274, 0.2109, 0.1372, 0.1153, 0.0981, 0.0723, 0.0580, 0.0371, 0.0245,
         0.0192]])
