### IMPLEMENTING SELF ATTENTION WITH TRAINABLE WEIGHTS

In [3]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # your      (x^1)
     [0.55, 0.87, 0.66],  # journey   (x^2)
     [0.57, 0.85, 0.64],  # starts    (x^3)
     [0.22, 0.58, 0.34],  # with      (x^4)
     [0.77, 0.25, 0.10],  # one       (x^5)
     [0.05, 0.81, 0.55]]  # step      (x^6)
 )

In [4]:
x_2 =  inputs[1]

d_in = inputs.shape[1]
d_out = 2

<div class="alert alert-block alert-success">
Next we initialize the three weight matrices Wq, Wk, Wv
</div>

In [5]:
import torch

torch.manual_seed(123)

W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad= False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad= False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad= False)

In [6]:
print(W_query)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


In [7]:
print(W_key)

Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])


In [8]:
print(W_value)

Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [9]:
queries = inputs @ W_query
key = inputs @ W_key
value = inputs @ W_value

In [10]:
queries

tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2363, 0.8076],
        [0.2983, 0.6565],
        [0.2593, 1.0601]])

In [11]:
key

tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2440, 0.6775],
        [0.1827, 0.3292],
        [0.3293, 0.9715]])

In [12]:
value

tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2405, 0.5576],
        [0.1492, 0.3346],
        [0.3253, 0.7903]])

<div class="alert alert-block alert-success">
First Let's Compute Attention score w22
</div>

In [13]:
Keys_2 = key[1]
attn_score_22 = queries[1].dot(Keys_2)
print(attn_score_22)

tensor(1.8524)


<div class="alert alert-block alert-success">
Again, we can generalise this compution to all attention scores via matrix multiplication
</div>

In [14]:
attn_score_2 = queries[2] @ key.T
print(attn_score_2)

tensor([1.2544, 1.8284, 1.7877, 1.0767, 0.5508, 1.5350])


In [15]:
attn_score = queries @ key.T
print(attn_score)

tensor([[0.9231, 1.3545, 1.3241, 0.7993, 0.4032, 1.1413],
        [1.2705, 1.8524, 1.8111, 1.0909, 0.5577, 1.5553],
        [1.2544, 1.8284, 1.7877, 1.0767, 0.5508, 1.5350],
        [0.7042, 1.0269, 1.0040, 0.6048, 0.3091, 0.8624],
        [0.6114, 0.8819, 0.8626, 0.5176, 0.2707, 0.7360],
        [0.9057, 1.3255, 1.2958, 0.7815, 0.3964, 1.1153]])


In [16]:
d_k = key.shape[-1]
attn_weight_2 = torch.softmax(attn_score_2/ d_k ** 0.5, dim = -1)
print(attn_weight_2)
print(d_k)

tensor([0.1500, 0.2250, 0.2187, 0.1323, 0.0912, 0.1829])
2


### WHY to Devide by SQRT(Dimension)

In [17]:
import torch 

# define tensor

tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# apply softmax without scaling
softmax_result = torch.softmax(tensor , dim = -1)
print("Softmax without scaling:", softmax_result)


# multiply the tensor by 8 and then apply softmax
scaled_tensor = tensor * 8 
softmax_scaled_result = torch.softmax(scaled_tensor, dim = -1)
print("Softmax after scalling (tensor * 8):", softmax_scaled_result)

Softmax without scaling: tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
Softmax after scalling (tensor * 8): tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


In [18]:
import numpy as np

# function to compute variance before and after scaling
def compute_variance(dim, num_traials = 1000):
    dot_products = []
    scaled_dot_products = []

    # Generate multiple random vectors and compute dot products
    for _ in range(num_traials):
        q = np.random.randn(dim)
        k = np.random.randn(dim)

        # compute the dot product
        dot_product = np.dot(q,k)
        dot_products.append(dot_product)

        # scaled the dot product by sqrt(dim)
        scaled_dot_product = dot_product / np.sqrt(dim)
        scaled_dot_products.append(scaled_dot_product)

    # calculate the variance of the dot products
    variance_before_scaling = np.var(dot_products)
    variance_after_scaling = np.var(scaled_dot_products)

    return variance_before_scaling, variance_after_scaling



# for dimension 5
variance_bfore_5 , variance_after_5 = compute_variance(5)
print(f"Variance before scaling (dim=5): {variance_bfore_5}")
print(f"Variance after scaling (dim=5): {variance_after_5}")



# for dimension 20
variance_bfore_20 , variance_after_20 = compute_variance(20)
print(f"Variance before scaling (dim=20): {variance_bfore_20}")
print(f"Variance after scaling (dim=20): {variance_after_20}")

# for dimension 100
variance_bfore_100 , variance_after_100 = compute_variance(100)
print(f"Variance before scaling (dim=100): {variance_bfore_100}")
print(f"Variance after scaling (dim=100): {variance_after_100}")

Variance before scaling (dim=5): 4.943243981009453
Variance after scaling (dim=5): 0.9886487962018904
Variance before scaling (dim=20): 20.78442941345748
Variance after scaling (dim=20): 1.039221470672874
Variance before scaling (dim=100): 95.71363018840235
Variance after scaling (dim=100): 0.9571363018840234


<div class="alert alert-block alert-success">
We now compute the context vector as a weighted sum over the value vectors
</div>

In [26]:
context_vector_2 = attn_weight_2 @ value
print(context_vector_2)

tensor([0.3065, 0.8218])


## Implementing a compact self attention class

In [30]:
import torch.nn as nn

class SelfAttention_V1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))
    

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5 , dim = -1
        )

        context_vector = attn_weights @ values
        return context_vector

In [31]:
torch.manual_seed(123)
sa_v1 = SelfAttention_V1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.3003, 0.8070],
        [0.3068, 0.8225],
        [0.3065, 0.8218],
        [0.2956, 0.7960],
        [0.2934, 0.7909],
        [0.2998, 0.8060]], grad_fn=<MmBackward0>)


In [37]:
import torch.nn as nn

class SelfAttention_V2(nn.Module):

    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out,bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out,bias=qkv_bias)
    

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5 , dim = -1
        )

        context_vector = attn_weights @ values
        return context_vector

In [41]:
torch.manual_seed(789)
sa_v2 = SelfAttention_V2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.0744,  0.0709],
        [-0.0753,  0.0699],
        [-0.0753,  0.0698],
        [-0.0765,  0.0681],
        [-0.0768,  0.0675],
        [-0.0759,  0.0689]], grad_fn=<MmBackward0>)
