In [1]:
import torch

# Toy 3-dimensional embeddings, one row per token of the sentence
# "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: Your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

In [3]:
## This is self-attention with trainable weights.
## Instead of taking dot products directly between the word embedding vectors,
## we compute query, key and value vectors, which are obtained by multiplying
## each embedding by trainable weight matrices.

In [4]:
# d_in : input dimension of the embeddings
# d_out: output dimension of the key / query / value vectors
#        (original note: "generally they are same")
d_in, d_out = 3, 2

In [8]:
# Fixed seed so the random projection matrices are the same on every run.
torch.manual_seed(123)

# Query, key and value projection matrices, each of shape (d_in, d_out).
# They are drawn in this order (query, key, value) from the seeded generator.
# Nothing is trained in this walkthrough, hence requires_grad=False.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)


In [9]:
# Inspect the query projection matrix; it was created as torch.rand(d_in, d_out).
print(W_query)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


In [10]:
# Project every input embedding into query, key and value space.
queries = torch.matmul(inputs, W_query)
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)

# Each projection has one row per input row and d_out columns.
print(f"Keys: {keys.shape}")
print(f"Queries: {queries.shape}")
print(f"Values: {values.shape}")

## Next step: compute the attention weights from the queries and keys.



Keys: torch.Size([6, 2])
Queries: torch.Size([6, 2])
Values: torch.Size([6, 2])


In [11]:
# Raw (unnormalised) attention scores: entry [i, j] is the dot product of
# query i with key j.
attention_scores = torch.matmul(queries, keys.transpose(0, 1))

print(f"Attention Scores: {attention_scores.shape}")
print(attention_scores)

Attention Scores: torch.Size([6, 6])
tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


In [12]:
## Scale the raw scores by 1/sqrt(d_k), where d_k is the key width (equal to
## d_out here), then normalise each row with softmax. The scaling keeps the dot
## products from growing with the vector dimension, which would otherwise push
## the softmax towards a near one-hot ("peaky") distribution.
##
## The raw scores are recomputed from `queries` and `keys` instead of being read
## back from `attention_scores`, so re-running this cell cannot apply the
## scaling and the softmax a second time.
scaled_scores = (queries @ keys.T) / keys.shape[-1] ** 0.5

# From here on `attention_scores` holds the normalised weights (rows sum to 1).
attention_scores = torch.softmax(scaled_scores, dim=-1)

print(f"Attention Scores: {attention_scores.shape}")

print(attention_scores)

Attention Scores: torch.Size([6, 6])
tensor([[0.1551, 0.2104, 0.2059, 0.1413, 0.1074, 0.1799],
        [0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820],
        [0.1503, 0.2256, 0.2192, 0.1315, 0.0914, 0.1819],
        [0.1591, 0.1994, 0.1962, 0.1477, 0.1206, 0.1769],
        [0.1610, 0.1949, 0.1923, 0.1501, 0.1265, 0.1752],
        [0.1557, 0.2092, 0.2048, 0.1419, 0.1089, 0.1794]])


The softmax function is sensitive to the magnitudes of its inputs. When the inputs are large, the differences between the exponential values of each input become much more pronounced. This causes the softmax output to become "peaky," where the highest value receives almost all the probability mass, and the rest receive very little.

In attention mechanisms, particularly in transformers, if the dot products between query and key vectors become too large, the attention scores can become very large. This results in a very sharp softmax distribution, making the model overly confident in one particular "key." Such sharp distributions can make learning unstable.


 But why divide by sqrt(d_out) specifically?

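 It turns out that the variance of a dot product grows with the dimension of the vectors: if the components of the query and key vectors are independent with zero mean and unit variance, their dot product has variance equal to the dimension (here d_out).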

 Dividing the scores by sqrt(d_out) therefore brings their variance back to roughly 1, which keeps the softmax from saturating as the dimension grows.

In [17]:
## With the attention weights in hand, build the context vectors.
##
## Row i of `attention_scores` holds the weight that query i assigns to every
## token. Multiplying by `values` scales each value vector by its weight and
## sums them, giving one enriched context vector per query.
context_vectors = torch.matmul(attention_scores, values)

print(f"Attention shape: {context_vectors.shape}")
print(context_vectors)


Attention shape: torch.Size([6, 2])
tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]])


In [19]:
class SelfAttention_v1(torch.nn.Module):
    """Single-head scaled dot-product self-attention with trainable projections.

    Args:
        d_in: Size of the last dimension of the input (embedding width).
        d_out: Width of the query/key/value vectors and of each output row.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # The creation order (query, key, value) fixes how the random stream is
        # consumed, so results under a given seed stay reproducible.
        self.W_query = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = torch.nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape (num_tokens, d_in), optionally with leading
                batch dimensions, i.e. (..., num_tokens, d_in).

        Returns:
            Tensor of shape (..., num_tokens, d_out).
        """
        queries = x @ self.W_query
        keys = x @ self.W_key
        values = x @ self.W_value

        # Swap only the last two axes: identical to `.T` for 2-D input, but
        # also correct when leading batch dimensions are present.
        attention_scores = queries @ keys.transpose(-2, -1)

        # Scale by sqrt(d_k) so the scores do not grow with the key width,
        # then normalise over the key axis.
        attention_weights = torch.softmax(
            attention_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        return attention_weights @ values

In [20]:
# Re-seed before constructing the module so that its randomly initialised
# weights are reproducible.
torch.manual_seed(123)

# Use the notebook's d_in / d_out instead of repeating the literals 3 and 2,
# so the module always matches the dimensions used in the manual walkthrough.
self_attention = SelfAttention_v1(d_in=d_in, d_out=d_out)

print(self_attention(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)
