In [None]:
import numpy as np
import pandas as pd
import tiktoken as tk
import torch
import torch.nn as nn
import torch.nn.functional as F

## Input-
- for Embedding -> X in shape (B, T)
- for SingleHeadAttention -> Embedded X in shape (B, T, d_model)

## Output- 
- Embedding -> token + position embedded x matrix with dimensions (B,T,d_model)
- SingleHeadAttention -> "Out" + "attn", Out is the result of attention mechanism, the matrix to be used in the model, "Attn" is softmax scores of each token

Master notebook for more details

In [None]:
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, d_model, max_len=2048):
        super().__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)   # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)      # position embedding

    def forward(self, x):
        B, T = x.shape
        # make position IDs: [0, 1, ..., T-1]
        pos = torch.arange(0, T, device=x.device).unsqueeze(0)  # [1, T]
        tok = self.tok_embed(x)       # [B, T, d_model]
        pos = self.pos_embed(pos)     # [1, T, d_model]
        return tok + pos              # [B, T, d_model]

In [None]:
class SingleHeadAttention(nn.Module):
    def __init__(self, d_model, d_k):
        super().__init__()
        
        
        # Linear projections for Q, K, V
        self.W_q = nn.Linear(d_model, d_k, bias=False)
        self.W_k = nn.Linear(d_model, d_k, bias=False)
        self.W_v = nn.Linear(d_model, d_k, bias=False)
        
    
    
    def forward(self, x):
        # x shape: (B, T, d_model)
        Q = self.W_q(x)  
        K = self.W_k(x)  
        V = self.W_v(x) 
        
        
        # Attention scores: QK^T / sqrt(d_k) jus like the goddamn paper it was hell to code aaaaaaaaaaaaa
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)

        
        # Softmax over last dim
        attn = F.softmax(scores, dim=-1)
        
        ##Weighted sum with V
        out = torch.matmul(attn, V)  # (batch_size, seq_len, d_k)
        return out, attn


In [1]:
embed = EmbeddingLayer(vocab_size, d_model, max_len)
attn = SingleHeadAttention(d_model, head_dim)


emb = embed(x)   # [B, T, d_model]

out = attn(emb)  # [B, T, d_model]

print(emb)
print(out)

NameError: name 'EmbeddingLayer' is not defined