### IMPLEMENTING SELF ATTENTION WITH TRAINABLE WEIGHTS

In [1]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # your      (x^1)
     [0.55, 0.87, 0.66],  # journey   (x^2)
     [0.57, 0.85, 0.64],  # starts    (x^3)
     [0.22, 0.58, 0.34],  # with      (x^4)
     [0.77, 0.25, 0.10],  # one       (x^5)
     [0.05, 0.81, 0.55]]  # step      (x^6)
 )

### Initalize Wk,Wq,WV matrices

In [2]:
d_in = 3
d_out = 2

In [3]:
torch.manual_seed(123)
w_q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad= False)
w_k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad= False)
w_v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad= False)

In [4]:
print(w_q)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


In [5]:
print(w_k)

Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])


In [6]:
print(w_v)

Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [7]:
input = inputs[1]

In [8]:
query_2  =  input @ w_q
key_2 = input @ w_k
value_2 = input @ w_v

print(query_2)

tensor([0.4306, 1.4551])


In [9]:
keys = inputs @ w_k
queries = inputs @ w_q
values = inputs @ w_v


print("Keys Shape:", keys.shape)
print("queries Shape:", queries.shape)
print("values Shape:", values.shape)

Keys Shape: torch.Size([6, 2])
queries Shape: torch.Size([6, 2])
values Shape: torch.Size([6, 2])


In [14]:
queries_2 = queries[1]
attn_scores_2 = queries_2 @ keys.T
attn_scores_2

tensor([1.2705, 1.8524, 1.8111, 1.0909, 0.5577, 1.5553])

In [17]:
attn_scores = queries @ keys.T
print(attn_scores)

tensor([[0.9231, 1.3545, 1.3241, 0.7993, 0.4032, 1.1413],
        [1.2705, 1.8524, 1.8111, 1.0909, 0.5577, 1.5553],
        [1.2544, 1.8284, 1.7877, 1.0767, 0.5508, 1.5350],
        [0.7042, 1.0269, 1.0040, 0.6048, 0.3091, 0.8624],
        [0.6114, 0.8819, 0.8626, 0.5176, 0.2707, 0.7360],
        [0.9057, 1.3255, 1.2958, 0.7815, 0.3964, 1.1153]])


In [18]:
d_k = keys.shape[-1]

attn_weights = torch.softmax(attn_scores/ d_k**0.5, dim = -1)
attn_weights

tensor([[0.1548, 0.2100, 0.2055, 0.1418, 0.1072, 0.1806],
        [0.1496, 0.2258, 0.2193, 0.1318, 0.0904, 0.1830],
        [0.1500, 0.2250, 0.2187, 0.1323, 0.0912, 0.1829],
        [0.1587, 0.1994, 0.1962, 0.1480, 0.1201, 0.1775],
        [0.1608, 0.1947, 0.1920, 0.1505, 0.1264, 0.1756],
        [0.1554, 0.2091, 0.2047, 0.1423, 0.1084, 0.1802]])

### WHY DIVIDE BY SQRT(DIMENSION)

In [19]:
import torch 

# define tensor

tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# apply softmax without scaling
softmax_result = torch.softmax(tensor , dim = -1)
print("Softmax without scaling:", softmax_result)


# multiply the tensor by 8 and then apply softmax
scaled_tensor = tensor * 8 
softmax_scaled_result = torch.softmax(scaled_tensor, dim = -1)
print("Softmax after scalling (tensor * 8):", softmax_scaled_result)

Softmax without scaling: tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
Softmax after scalling (tensor * 8): tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


In [20]:
import numpy as np

# function to compute variance before and after scaling
def compute_variance(dim, num_traials = 1000):
    dot_products = []
    scaled_dot_products = []

    # Generate multiple random vectors and compute dot products
    for _ in range(num_traials):
        q = np.random.randn(dim)
        k = np.random.randn(dim)

        # compute the dot product
        dot_product = np.dot(q,k)
        dot_products.append(dot_product)

        # scaled the dot product by sqrt(dim)
        scaled_dot_product = dot_product / np.sqrt(dim)
        scaled_dot_products.append(scaled_dot_product)

    # calculate the variance of the dot products
    variance_before_scaling = np.var(dot_products)
    variance_after_scaling = np.var(scaled_dot_products)

    return variance_before_scaling, variance_after_scaling

# for dimension 5
variance_bfore_5 , variance_after_5 = compute_variance(5)
print(f"Variance before scaling (dim=5): {variance_bfore_5}")
print(f"Variance after scaling (dim=5): {variance_after_5}")



# for dimension 20
variance_bfore_20 , variance_after_20 = compute_variance(20)
print(f"Variance before scaling (dim=20): {variance_bfore_20}")
print(f"Variance after scaling (dim=20): {variance_after_20}")

# for dimension 100
variance_bfore_100 , variance_after_100 = compute_variance(100)
print(f"Variance before scaling (dim=100): {variance_bfore_100}")
print(f"Variance after scaling (dim=100): {variance_after_100}")

Variance before scaling (dim=5): 5.083496975415383
Variance after scaling (dim=5): 1.0166993950830767
Variance before scaling (dim=20): 19.525007039526592
Variance after scaling (dim=20): 0.9762503519763293
Variance before scaling (dim=100): 105.0492227626223
Variance after scaling (dim=100): 1.0504922276262232


In [21]:
context_vector = attn_weights @ values
context_vector

tensor([[0.3003, 0.8070],
        [0.3068, 0.8225],
        [0.3065, 0.8218],
        [0.2956, 0.7960],
        [0.2934, 0.7909],
        [0.2998, 0.8060]])

### IMPLEMENTING A COMPACT SELF ATTENTION 

In [None]:
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    
    def __init__(self,d_in:int , d_out:int):
        super().__init__()
        self.w_q = nn.Parameter(torch.rand(d_in ,d_out))
        self.w_k = nn.Parameter(torch.rand(d_in ,d_out))
        self.w_v = nn.Parameter(torch.rand(d_in ,d_out))
    

    def forward(self , x):
        keys = x @ self.w_k
        queries = x @ self.w_q
        values = x @ self.w_v

        attn_scores = queries @ keys.T 
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5 , dim = -1
        )

        context_vec = attn_scores @ values

        return context_vec

In [27]:
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3400],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8100, 0.5500]])

In [30]:
sa_v1 = SelfAttention_v1(d_in= 3, d_out=2)

print(sa_v1.forward(inputs))

tensor([[2.0137, 1.8613],
        [3.7498, 3.4386],
        [3.7287, 3.4196],
        [2.1043, 1.9264],
        [2.2993, 2.1153],
        [2.4568, 2.2461]], grad_fn=<MmBackward0>)


In [35]:
import torch.nn as nn
class SelfAttention_v2(nn.Module):
    
    def __init__(self,d_in:int , d_out:int, qkv_bias = False):
        super().__init__()
        self.w_q = nn.Linear(d_in ,d_out, bias=qkv_bias)
        self.w_k = nn.Linear(d_in ,d_out, bias=qkv_bias)
        self.w_v = nn.Linear(d_in ,d_out, bias=qkv_bias)
    
    def forward(self , x):
        keys = self.w_k(x)
        queries = self.w_q(x)
        values = self.w_v(x)

        attn_scores = queries @ keys.T 
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5 , dim = -1
        )

        context_vec = attn_scores @ values

        return context_vec

In [37]:
sa_v2 = SelfAttention_v2(d_in= 3, d_out=2)

print(sa_v2(inputs))

tensor([[-0.0543, -0.0684],
        [-0.0750, -0.0909],
        [-0.0732, -0.0885],
        [-0.0441, -0.0540],
        [-0.0203, -0.0191],
        [-0.0641, -0.0809]], grad_fn=<MmBackward0>)
