Self-attention, NumPy implementation:

In [11]:
import numpy as np

def compute_qkv(X, W_q, W_k, W_v):
    """Project the input into query, key and value matrices.

    Each projection is the matrix product of ``X`` with the matching weight
    matrix, so the last axis of ``X`` must match the first axis of each
    weight (presumably ``X`` is ``(seq_len, d_model)`` -- confirm with caller).

    Args:
        X: Input embeddings.
        W_q: Query projection weights.
        W_k: Key projection weights.
        W_v: Value projection weights.

    Returns:
        A tuple ``(Q, K, V)`` holding ``X @ W_q``, ``X @ W_k`` and ``X @ W_v``.
    """
    queries, keys, values = (X @ weight for weight in (W_q, W_k, W_v))
    return queries, keys, values

def self_attention(Q, K, V):
    """Compute scaled dot-product attention with NumPy.

    Evaluates ``softmax(Q @ K^T / sqrt(d_k)) @ V`` where ``d_k`` is the size
    of the last axis of ``K``. The softmax is taken over the last axis of the
    score matrix (i.e. over keys).

    Works for plain 2-D inputs ``(seq_len, d)`` and for inputs with leading
    batch axes ``(..., seq_len, d)``; only the last two axes of ``K`` are
    transposed.

    Args:
        Q: Query array, shape ``(..., s_q, d_k)``.
        K: Key array, shape ``(..., s_k, d_k)``.
        V: Value array, shape ``(..., s_k, d_v)``.

    Returns:
        Attention output of shape ``(..., s_q, d_v)``.
    """
    dim_K = K.shape[-1]
    # swapaxes rather than ``K.T``: ``.T`` reverses *every* axis, which is
    # only correct for 2-D input and breaks as soon as a batch axis is present.
    scores = (Q @ np.swapaxes(K, -1, -2)) / np.sqrt(dim_K)
    # Subtract the row-wise max before exponentiating so exp() cannot
    # overflow; softmax is invariant to this shift.
    shifted_scores = scores - np.max(scores, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted_scores)  # computed once, reused below
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    attention_output = attention_weights @ V
    return attention_output

Self-attention, PyTorch implementation:

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
  """Single-head scaled dot-product self-attention.

  Queries, keys and values are produced from the same input by three
  bias-free linear layers of size ``embed_dim -> embed_dim``; the output is
  ``softmax(Q K^T / sqrt(embed_dim)) V``.
  """

  def __init__(self, embed_dim):
    """Create the three projection layers.

    Args:
      embed_dim: Size of the last axis of the input (and of the output).
    """
    super().__init__()
    self.embedding_dim = embed_dim
    self.get_q = nn.Linear(embed_dim, embed_dim, bias=False)
    self.get_k = nn.Linear(embed_dim, embed_dim, bias=False)
    self.get_v = nn.Linear(embed_dim, embed_dim, bias=False)

  def forward(self, x, mask=False):
    """Apply self-attention to ``x``.

    Args:
      x: Tensor of shape ``[B, s, e]`` (batch size, sequence length,
        embedding size) with ``e == embed_dim``.
      mask: Accepted for interface compatibility only.
        NOTE(review): this flag is not applied anywhere in the computation;
        no masking is performed regardless of its value.

    Returns:
      Tensor of shape ``[B, s, e]``, the same shape as ``x``.
    """
    Q = self.get_q(x)
    K = self.get_k(x)  # key dimension equals self.embedding_dim
    V = self.get_v(x)  # shape [B, s, e]

    # Scale by 1/sqrt(d_k). The scores must be *divided* by sqrt(d); dividing
    # by d**(-0.5) would multiply them by sqrt(d) and sharpen the softmax.
    scale = self.embedding_dim ** 0.5
    attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / scale  # [B, s, s]

    # Subtract each row's max (values, not indices) before the softmax so the
    # exponentials cannot overflow; softmax is invariant to this shift.
    max_scores = torch.max(attention_scores, dim=-1, keepdim=True)[0]
    new_scores = attention_scores - max_scores

    attention_weights = torch.softmax(new_scores, dim=-1)  # [B, s, s]
    attention_output = torch.matmul(attention_weights, V)  # [B, s, e]
    return attention_output





In [6]:
# --- 2) Build a random batch of embeddings ---
B = 2     # batch size
T = 5     # sequence length
C = 16    # embedding dimension
x = torch.randn((B, T, C))   # drawn before the layer is built, so RNG order is unchanged

# --- 3) Build the layer and run one forward pass ---
attn_layer = SelfAttention(embed_dim=C)
output = attn_layer(x)

print("output shape:", output.shape)   # expected: torch.Size([2, 5, 16])

# --- 4) (Optional) Inspect the result for the first sample ---
# NOTE: despite the label below, output[0] is the attention *output* for
# sample 0 (shape [T, C]); the forward pass does not return the [T, T]
# attention-weight matrix.
print("attention weights for sample 0:\n", output[0])

output shape: torch.Size([2, 5, 16])
attention weights for sample 0:
 tensor([[ 0.4882, -0.1974,  0.4953,  0.0751,  0.3127, -0.0228, -0.0331,  0.3205,
         -0.1828,  0.3713, -0.3189, -0.0096, -0.2000,  0.1807,  0.4232,  0.1530],
        [ 0.5012,  0.0043, -0.5890, -0.1467, -0.4039, -0.6152, -0.8488,  0.2181,
         -1.1083, -0.2581,  1.3502,  0.3465, -0.9484, -0.3295,  0.2146, -0.1683],
        [ 0.6680, -0.3255,  0.2919,  0.0046,  0.2490, -0.2500, -0.0585,  0.1657,
          0.0210,  0.3828, -0.3388,  0.0551, -0.1739, -0.0188,  0.2955,  0.2897],
        [-0.0268,  0.1069,  1.0452,  0.2255,  0.4878,  0.6085, -0.0106,  0.7298,
         -0.7010,  0.3331, -0.2881, -0.1790, -0.3185,  0.7287,  0.7232, -0.2406],
        [-0.2968, -0.5228,  0.5184, -0.4663,  0.2590,  0.4869, -0.9638,  0.4480,
         -0.3621,  0.0240, -0.0476,  0.0554, -1.2374,  0.4955, -0.1072, -0.5704]],
       grad_fn=<SelectBackward0>)
