In [13]:
import torch 
# Toy 3-dimensional query and key vectors reused by the next cell.
query = torch.tensor((1.0, 0.0, 0.0))
key = torch.tensor((1.0, 0.0, 1.0))
# Number of features in the query vector.
query.size(0)

3

In [22]:
import math

# Scaled dot-product score between a single query and a single key vector.
d_k = key.shape[-1]                   # this is an int
scale = math.sqrt(d_k)
# torch.matmul of two 1-D tensors is already their dot product. The previous
# `key.T` was a no-op on a 1-D tensor and is the deprecated use of `Tensor.T`
# (dimensions other than 2), so it is dropped.
attention_score = torch.matmul(query, key) / scale
print(attention_score)
attention_score

tensor(0.5774)


tensor(0.5774)

In [23]:
# Causal attention block implementation
import torch 
import torch.nn as nn
class CausalAttentionBlock(torch.nn.Module):
    """Multi-head self-attention in which each position only attends to itself and earlier positions.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        n_heads: Number of attention heads; must divide ``d_model``.
        block_size: Largest sequence length the stored causal mask supports.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, block_size, dropout = 0.1):
        # Runs nn.Module's own constructor, which creates the registries that
        # track parameters, buffers and sub-modules. It has to run before any
        # layer or buffer is assigned on `self`.
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_head"
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.projection = nn.Linear(d_model, d_model, bias=False)
        self.attn_dropout = nn.Dropout(dropout)
        # block_size is the max seq_length here.
        # Build the mask in a local variable: assigning it to `self.mask` first
        # makes register_buffer raise KeyError("attribute 'mask' already exists").
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        # Registered as a buffer so it follows the module across .to()/.cuda()
        # without being a trainable parameter.
        self.register_buffer("mask", causal_mask.view(1, 1, block_size, block_size))

    def forward(self, x):
        """Apply causal multi-head self-attention.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If seq_len is larger than the mask built in __init__.
        """
        batch, seq_len, d_model = x.shape
        max_len = self.mask.size(-1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {max_len}"
            )

        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        # Split the feature dimension into heads: (B, T, d_model) -> (B, H, T, d_head).
        q = q.view(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        k = k.view(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)

        # matmul works on the last two dimensions, so all heads are handled at once.
        attention_score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)  # (B, H, T, T)
        # Positions above the diagonal (the future) get -inf so softmax gives them weight 0.
        attention_score = attention_score.masked_fill(
            self.mask[:, :, :seq_len, :seq_len] == 0, float('-inf')
        )

        attention_score = torch.softmax(attention_score, dim=-1)
        attn = self.attn_dropout(attention_score)
        out = torch.matmul(attn, v)  # (B, H, T, d_head)
        # Merge the heads back: (B, H, T, d_head) -> (B, T, d_model).
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, d_model)
        out = self.projection(out)
        return out

In [27]:
### RMS NORM 
x = torch.randn(2, 3, 4)
print(x.shape)


class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with one learnable scalar gain."""

    def __init__(self, epsilon= 1e-15):
        super().__init__()
        # Added to the mean square before the square root.
        self.epsilon = epsilon
        # A single gain shared by every feature (shape (1,)), initialised to 1.
        self.scale = nn.Parameter(torch.ones(1))

    def forward(self,x):
        """Divide ``x`` by the RMS of its last dimension, then multiply by ``scale``."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        rms_value = (mean_square + self.epsilon).sqrt()
        return x.div(rms_value).mul(self.scale)


rms = RMSNorm()
y = rms(x)
y.shape

torch.Size([2, 3, 4])


torch.Size([2, 3, 4])

In [58]:
# multi-head attention
def multihead_attention(q,k,v, d_model, n_heads, mask=None,
                        w_q=None, w_k=None, w_v=None, w_out=None):
    """
    Implements multi-head attention.

    Args:
        q (Tensor): Query tensor of shape (batch_size, q_len, d_model)
        k (Tensor): Key tensor of shape (batch_size, kv_len, d_model)
        v (Tensor): Value tensor of shape (batch_size, kv_len, d_model)
        d_model (int): Total embedding dimension
        n_heads (int): Number of attention heads; must divide d_model
        mask (Tensor, optional): Entries equal to 0 are masked out. Either
            broadcastable to (batch_size, n_heads, q_len, kv_len), or of shape
            (batch_size, q_len, kv_len), in which case it is shared by all heads.
        w_q, w_k, w_v, w_out (Tensor, optional): Projection weights of shape
            (d_model, d_model), applied as ``x @ W.T`` (the nn.Linear layout).
            Any weight left as None is taken from a freshly initialised
            bias-free nn.Linear, so the result is then random on every call
            unless the RNG is seeded.
    Returns:
        Tensor: Multi-head attention output of shape (batch_size, q_len, d_model)
    Raises:
        ValueError: If d_model is not divisible by n_heads.
    """
    if d_model % n_heads != 0:
        raise ValueError("d_model must be divisible by n_heads")
    d_head = d_model // n_heads
    device = q.device
    batch_size, q_len, _ = q.shape
    # Keys and values may have a different length than the queries (cross-attention).
    kv_len = k.shape[1]

    def _resolve(weight):
        # Fall back to a random projection when the caller supplies none.
        if weight is None:
            return nn.Linear(d_model, d_model, bias=False).to(device).weight
        return weight

    # Resolved in Q, K, V, out order so seeded runs draw the random weights in a fixed order.
    w_q, w_k, w_v, w_out = (_resolve(w) for w in (w_q, w_k, w_v, w_out))

    # Project q, k, v (equivalent to a bias-free nn.Linear).
    q = torch.matmul(q, w_q.transpose(-2, -1))
    k = torch.matmul(k, w_k.transpose(-2, -1))
    v = torch.matmul(v, w_v.transpose(-2, -1))

    # Split into heads: (batch, len, d_model) -> (batch, n_heads, len, d_head).
    q = q.view(batch_size, q_len, n_heads, d_head).transpose(1, 2)
    k = k.view(batch_size, kv_len, n_heads, d_head).transpose(1, 2)
    v = v.view(batch_size, kv_len, n_heads, d_head).transpose(1, 2)

    # matmul works on the last two dimensions, so all heads are computed together.
    attention_score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_head) # batch, head, q_len, kv_len
    if mask is not None:
        if mask.dim() == 3:
            # (batch, q_len, kv_len) -> (batch, 1, q_len, kv_len) so it broadcasts over heads.
            mask = mask.unsqueeze(1)
        attention_score = attention_score.masked_fill(mask == 0, float('-inf'))
    attention_score = torch.softmax(attention_score, dim=-1)
    output = torch.matmul(attention_score, v) # (batch_size, n_heads, q_len, d_head)
    # Merge heads back into the feature dimension.
    output = output.transpose(1,2).contiguous().view(batch_size, q_len, d_model)
    return torch.matmul(output, w_out.transpose(-2, -1))


In [59]:
# Seed first so the random inputs (and the random projections created inside
# multihead_attention) are reproducible.
torch.manual_seed(42)
batch_size = 3
seq_len = 4
d_model = 8
n_heads = 2

q = torch.rand(batch_size, seq_len, d_model)
k = torch.rand(batch_size, seq_len, d_model)
v = torch.rand(batch_size, seq_len, d_model)
print(q.shape)


# Pinned to CPU. The earlier CUDA auto-detection was overwritten on the very
# next line, so it had no effect and is removed.
device = "cpu"
output_custom = multihead_attention(q, k, v, d_model, n_heads)


torch.Size([3, 4, 8])


In [60]:
# Shape of the custom attention output: (batch_size, seq_len, d_model).
output_custom.size()

torch.Size([3, 4, 8])

In [61]:
# PyTorch's built-in multi-head attention, configured like the custom function:
# no biases, inputs laid out as (batch, seq, feature).
multihead_attn = torch.nn.MultiheadAttention(
    embed_dim=d_model,
    num_heads=n_heads,
    bias=False,
    batch_first=True,
)
# The second return value (the attention weights) is not needed here.
output, _ = multihead_attn(query=q, key=k, value=v)

In [65]:
# NOTE(review): this assertion fails as written (see the AssertionError recorded
# below). multihead_attention creates freshly initialised nn.Linear layers on
# each call, and torch.nn.MultiheadAttention initialises its own weights, so the
# two outputs come from unrelated projections. The comparison can only hold once
# both implementations use the same weights.
assert torch.allclose(output_custom, output, atol=1e-08, rtol=1e-05) # Check if they are close enough.


AssertionError: 

In [66]:
###########Sinusoidal PE ##############
class SinusoidalPositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding.

    For position ``pos`` and pair index ``i``:
        pe[pos, 2i]     = sin(pos / 10000^(2i / d_model))
        pe[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

    Args:
        max_seq_len: Number of positions to precompute.
        d_model: Embedding dimension.
    """

    def __init__(self,max_seq_len, d_model):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        pe = torch.zeros(max_seq_len, d_model)
        pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)  # (max_seq_len, 1)
        # One inverse frequency per (sin, cos) pair: 10000^(-2i / d_model).
        even_idx = torch.arange(0, d_model, 2, dtype=torch.float)
        div_term = torch.pow(10000.0, -even_idx / d_model)
        pe[:, 0::2] = torch.sin(pos * div_term)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(pos * div_term[: d_model // 2])
        # Non-persistent buffer: follows the module across devices but is not
        # written to state_dict, since it is fully determined by the arguments.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Add the positional encoding to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If seq_len is larger than max_seq_len.
        """
        seq_len = x.size(1)
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )
        return x + self.pe[:, :seq_len]

In [73]:
# lets do grouped query attention, 
# instead of the grouping by heads with equal number of q,k,v matrices, 
# split the number of queries in groups

# Seeded random inputs for the grouped-query attention experiment.
torch.manual_seed(42)
batch_size = 3
seq_len = 4
d_model = 32
num_heads = 2

q = torch.rand(batch_size, seq_len, d_model)
k = torch.rand(batch_size, seq_len, d_model)
v = torch.rand(batch_size, seq_len, d_model)
print(q.shape)

# Pinned to CPU. The earlier CUDA auto-detection was overwritten on the very
# next line, so it had no effect and is removed.
device = "cpu"

torch.Size([3, 4, 32])


In [97]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import math  # needed for the sqrt(d_head) scaling below; imported here so the cell does not rely on an earlier cell

num_heads = 8
batch_size = 3
seq_len = 4
d_model = 32

q = torch.rand(batch_size, seq_len, d_model)
k = torch.rand(batch_size, seq_len, d_model)
v = torch.rand(batch_size, seq_len, d_model)
num_query_groups = 4
# Grouped-query attention: queries keep one projection per head, while keys and
# values only get `num_query_groups` heads, each shared by
# num_heads // num_query_groups query heads.
# TODO: wrap this cell into grouped_query_attention(q, k, v, num_query_groups, d_model, mask=None).
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
assert num_heads % num_query_groups == 0, "num_heads must be divisible by num_query_groups"
d_head = d_model // num_heads
batch_size, seq_len, d_model = q.shape
# Layers are placed on the inputs' own device rather than on a global `device`
# variable, so projections and inputs can never end up on different devices.
Q = nn.Linear(d_model, d_model, bias=False).to(q.device)
K = nn.Linear(d_model,  num_query_groups * d_head, bias=False).to(q.device)
V = nn.Linear(d_model,  num_query_groups * d_head, bias=False).to(q.device)
W_out = nn.Linear(d_model, d_model, bias=False).to(q.device)
# Q covers all heads, so its output dimension is d_model.
# K and V only cover num_query_groups heads, so their output dimension is
# num_query_groups * d_head.

q = Q(q)
k = K(k)
v = V(v)
print(q.shape)
print(k.shape)
# Split the feature dimension into heads.
q = q.view(batch_size, seq_len, num_heads, d_head) # [batch_size, seq_len, num_heads, d_head]
v = v.view(batch_size, seq_len, num_query_groups, d_head) # [batch_size, seq_len, num_query_groups, d_head]
k = k.view(batch_size, seq_len, num_query_groups, d_head) # [batch_size, seq_len, num_query_groups, d_head]

# Move the head dimension in front of the sequence dimension for the scaled dot product.
q = q.transpose(1,2)
k = k.transpose(1,2)
v = v.transpose(1,2)
# Repetitions = num_heads / num_query_groups.
# repeat_interleave copies each K/V head consecutively, so query heads
# 0..r-1 share group 0, the next r heads share group 1, and so on.
k = k.repeat_interleave(num_heads//num_query_groups, dim=1)
v = v.repeat_interleave(num_heads//num_query_groups, dim= 1)
print(k.shape)
# q and k now both have num_heads heads.
print(q.shape)

# matmul works on the last two dimensions, so all heads are computed together.
attention_score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_head) # batch, head, seq_len, seq_len
print(attention_score.shape)
attention_score = torch.softmax(attention_score, dim=-1)
attention_score = torch.matmul(attention_score, v) # (batch_size, n_heads, seq_len, d_head)
print(attention_score.shape)
# Bring it back to (batch, seq, d_model) and apply W_out.

attention_score = attention_score.transpose(1,2).contiguous()
print(attention_score.shape)
# Merge the last two dimensions (heads * d_head = d_model).
attention_score = attention_score.view(batch_size, seq_len, d_model)
print(attention_score.shape)
output = W_out(attention_score)
print(output.shape  )

torch.Size([3, 4, 32])
torch.Size([3, 4, 16])
torch.Size([3, 8, 4, 4])
torch.Size([3, 8, 4, 4])
torch.Size([3, 8, 4, 4])
torch.Size([3, 8, 4, 4])
torch.Size([3, 4, 8, 4])
torch.Size([3, 4, 32])
torch.Size([3, 4, 32])


In [23]:
# Create Embeddings out of an LLM
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
#dataset = load_dataset("gmongaras/Amazon-Reviews-2023", trust_remote_code=True)
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
text = "hello how are you "
texts = [text] * 10
# padding=True lets the batch hold texts of different lengths. With
# padding=False, return_tensors="pt" only worked because every text here is
# identical. The attention mask marks which positions are real tokens.
encodings = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
print(encodings["input_ids"].shape)
input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# Forward pass with output_hidden_states=True to get the hidden states of every layer.
# no_grad: inference only, so no autograd graph is needed.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)


torch.Size([10, 5])


In [None]:
# The logits have shape (batch, seq_len, vocab_size); the last dimension is the vocabulary size.
print("vocab size",outputs.logits.shape)
last_hidden_states = outputs.hidden_states[-1]
print(last_hidden_states.shape)
# The last hidden state holds one embedding per token in the sequence.
# Sentence embeddings are the average of the token embeddings, excluding padding tokens.
expanded_mask = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
# Divide by the number of real tokens per sequence. The previous torch.mean over
# dim=1 divided by the full sequence length, so padding positions diluted the average.
token_counts = torch.clamp(expanded_mask.sum(dim=1), min=1e-9)  # avoid division by zero
sentence_embeddings = torch.sum(last_hidden_states * expanded_mask, dim=1) / token_counts


vocab size torch.Size([10, 5, 49152])
torch.Size([10, 5, 576])


In [58]:
# Masked mean pooling: zero out masked positions, sum over the sequence axis,
# then divide by the per-sequence count of unmasked positions.
masked_states = last_hidden_states * expanded_mask
sum_embeddings = masked_states.sum(dim=1)
sum_mask = expanded_mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
sentence_embeddings = sum_embeddings / sum_mask  # (batch_size, hidden_dim)
print("Sentence embeddings shape:", sentence_embeddings.shape)  # (10, hidden_dim)


Sentence embeddings shape: torch.Size([10, 576])
