# Coding Attention Mechanisms

### Attending to different parts of the input with self-attention

In [None]:
import torch

In [None]:
# Toy input: 6 tokens, each represented by a 3-dimensional embedding (one row per token).
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89], # Your     (x^1)
        [0.55, 0.87, 0.66], # journey  (x^2)
        [0.57, 0.85, 0.64], # starts   (x^3)
        [0.22, 0.58, 0.33], # with     (x^4)
        [0.77, 0.25, 0.10], # one      (x^5)
        [0.05, 0.80, 0.55]  # (x^6) -- token label was missing; presumably "step", TODO confirm
   ]
)

In [None]:
# (num_tokens, embedding_dim): 6 rows of 3 values each for the tensor defined above.
inputs.shape

In [None]:
# Attention scores for one query: the dot product of the query embedding with
# every input embedding (including the query itself).
query = inputs[1]   # "journey"
attn_scores_2 = torch.empty(inputs.shape[0])  # Uninitialized buffer (cheaper than torch.zeros); every slot is overwritten below
for idx, query_i in enumerate(inputs):  # NOTE: despite its name, query_i is the idx-th input embedding
    attn_scores_2[idx] = torch.dot(query, query_i)

print("Attention Scores:", attn_scores_2)
print("Sum:", attn_scores_2.sum())

> **Important note:** The dot product itself is position-agnostic: it produces the same score for a given pair of embeddings no matter where they sit in the input sequence. **Therefore, if we want attention to carry any positional information, the input embeddings must already include positional embeddings (relative or absolute).**

In [None]:
# Normalize the scores so they sum to 1, which helps training stability
# (remember vanishing or exploding activations in deep nets).
# Here: naive normalization, dividing each score by the total.
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()
print("Attention Weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

In [None]:
# Softmax handles extreme values better than plain sum-normalization.
# Prefer PyTorch's implementation over a hand-written one.
# Equivalent functional form: torch.softmax(attn_scores_2, dim=0)
attn_weights_2 = attn_scores_2.softmax(dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

In [None]:
inputs.shape[1] # Number of columns, i.e. the size of each input embedding

In [None]:
# The context vector is the weighted sum of all input embeddings, where each
# embedding is weighted by its attention weight with respect to the query.
context_vector = torch.zeros(inputs.shape[1])
# The loop variable is named x_i (not `input`) so the builtin input() is not
# shadowed for the rest of the session.
for idx, x_i in enumerate(inputs):
    context_vector += x_i * attn_weights_2[idx]

context_vector

> We have to compute such a context vector for every input embedding. Because each step (attention scores, normalization, weighted sum) is built from dot products and row-wise operations, we can compute all of them at once with matrix multiplications.

In [None]:
# All pairwise attention scores at once: entry [i, j] is the dot product of
# input embedding i with input embedding j.
attn_scores = inputs @ inputs.T
attn_scores

In [None]:
# Normalize along dim=1 so that each row (the weights of one query) sums to 1.
attn_weights = attn_scores.softmax(dim=1)
attn_weights

In [None]:
# Finally, the context vectors: row i is the sum of all input embeddings,
# each weighted by row i of the attention weights.
context_vectors = attn_weights @ inputs
context_vectors

### Self attention with trainable weights

#### Computing attention weights step by step

In [None]:
x_2 = inputs[1]  # Second input embedding ("journey")
d_in = inputs.shape[1]  # Size of each input embedding
d_out = 2  # Size of the projected query/key/value vectors

In [None]:
# Initialize the W_query, W_key and W_value projection matrices, each of shape (d_in, d_out).
# The seed makes the random values reproducible; requires_grad=False turns off
# gradient tracking for these parameters.
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [None]:
# Project the single input x_2 into its query, key and value vectors (each of length d_out).
query_2 = x_2 @ W_query
key_2   = x_2 @ W_key
value_2 = x_2 @ W_value
query_2, key_2 ,value_2

In [None]:
# Project every input embedding into key and value space with one matmul each.
keys = inputs @ W_key
values = inputs @ W_value
print("Keys shape: ", keys.shape)
# Report the shape of `values` here (the original printed keys.shape twice).
print("Values shape: ", values.shape)

In [None]:
# Next, the attention scores for query_2: the dot product of the query vector
# with every key vector.
attn_scores_2 = query_2 @ keys.T
attn_scores_2

In [None]:
# Attention weights: instead of applying softmax to the raw scores, first divide
# them by the square root of the key embedding dimension (scaled dot-product attention).
d_k = keys.shape[1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=0)
attn_weights_2

#### Implementing a self-attention Python class

In [None]:
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    Args:
        d_in: Size of each input embedding (last dimension of the input).
        d_out: Size of the projected query, key and value vectors.
    """

    def __init__(self, d_in: int, d_out: int) -> None:
        super().__init__()
        # Trainable projection matrices filled by torch.rand.
        # The creation order (query, key, value) is kept so that seeded runs
        # produce the same weights as before.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``; leading batch
                dimensions ``(..., num_tokens, d_in)`` are also accepted.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        # Swap only the last two dimensions so leading batch dimensions are
        # preserved; for 2-D input this is the same as ``keys.T``.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k), then normalize over the key axis (the last one).
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1
        )

        return attn_weights @ values  # context vectors

In [None]:
# Example usage: seed first so the randomly initialized weights are reproducible,
# then compute the context vectors for all inputs.
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

In [None]:
# Variant built on nn.Linear layers: without a bias term, a linear layer is
# just a matrix multiplication with a trainable weight matrix.
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention using ``nn.Linear`` projections.

    Args:
        d_in: Size of each input embedding (last dimension of the input).
        d_out: Size of the projected query, key and value vectors.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in: int, d_out: int, qkv_bias: bool = False) -> None:
        super().__init__()
        # The creation order (query, key, value) is kept so that seeded runs
        # produce the same weights as before.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``; leading batch
                dimensions ``(..., num_tokens, d_in)`` are also accepted.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Swap only the last two dimensions so leading batch dimensions are
        # preserved; for 2-D input this is the same as ``keys.T``.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k), then normalize over the key axis (the last one).
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1
        )

        return attn_weights @ values  # context vectors

In [None]:
# Example usage of the nn.Linear-based variant: seed first so the randomly
# initialized weights are reproducible, then compute the context vectors.
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)  # was SelfAttention_v1, so v2 was never exercised
print(sa_v2(inputs))