# Attention in Transformers

## There are three parameters:

- Q (Query vector): What am I looking for? (d_q, the dimension of the queries)
- K (Key vector): What can I offer? (d_k, the dimension of the keys)
- V (Value vector): What do I actually offer? (d_v, the dimension of the values)


In [1]:
import math
from typing import List, Optional

import numpy as np


# Problem sizes: 4 tokens; queries and keys share one width, values have their own.
sequence_length, dimension_of_keys, dimension_of_values = 4, 8, 8

# Seed the global NumPy RNG so that Restart Kernel -> Run All always draws the
# same Q, K and V (without it, every run produces different numbers).
np.random.seed(seed=43)
query = np.random.randn(sequence_length, dimension_of_keys)
key = np.random.randn(sequence_length, dimension_of_keys)
value = np.random.randn(sequence_length, dimension_of_values)

print(f"Q (Query vector):\n {query} \n")
print(f"K (Key vector):\n {key} \n")
print(f"V (Value vector):\n {value} \n")

Q (Query vector):
 [[-0.84255448  1.69118479  0.16840026 -0.33852743 -0.86381845  0.87873148
  -0.43893728  1.43357632]
 [-0.36916007  0.99008703  1.26395118  1.25426793 -1.14313892  0.44420488
  -1.79295722  0.08786969]
 [-0.93264763  2.10190512 -2.28282036 -0.28115302  0.16496563 -3.86691182
  -0.36078562 -0.33304909]
 [ 0.70902575  0.90507036 -0.612227    0.29584081 -1.27537195 -0.53748187
   1.310467   -1.05024303]] 

K (Key vector):
 [[-0.13831726  1.61754182 -0.38570355 -0.06724541 -0.32901251  0.43812086
   0.83568471 -0.88561807]
 [ 0.44164728 -1.12268517 -0.8300944  -0.34109228  0.3553728  -0.61233613
  -1.15187035  0.1823838 ]
 [-0.55582771  0.5160848  -0.31469653  2.05612718 -1.4826562  -2.49664229
   1.7230993   1.99282134]
 [-0.05107385 -1.41396277  0.06189906 -0.12581689  1.04094493 -0.03749632
   0.53257072  0.88856998]] 

V (Value vector):
 [[ 0.30484211  1.62034263  0.36740884 -0.98962018  0.09368226 -1.75405572
   0.03991657 -0.06351192]
 [-0.4504077  -0.10019523 -0.9

## Self-Attention

### ![""](https://jalammar.github.io/images/t/self-attention-matrix-calculation-2.png)


In [2]:
# Raw (unscaled) attention scores Q @ K.T -- the product that sits inside the softmax.
query @ key.T

array([[ 1.84269718, -2.37309569,  1.77945603, -2.18729245],
       [ 0.07527066, -1.34857736,  0.56885814, -3.54406134],
       [ 2.67331963,  2.00048616,  9.86779345, -3.20167848],
       [ 3.7915456 , -2.12082091,  4.27190508, -2.93380296]])

In [3]:
# But why are we dividing by the square root of the dimension of the keys?
# Compare the variance of Q and of K with the variance of the raw scores Q @ K.T.
np.var(query), np.var(key), np.var(query @ key.T)

(1.5125745535077024, 1.0357473030624733, 11.93390494969697)

In [4]:
# Dividing the scores by sqrt(d_k) brings their variance back down to the
# same order of magnitude as the variance of the query and key vectors.
scaled = (query @ key.T) / math.sqrt(dimension_of_keys)
np.var(query), np.var(key), np.var(scaled)

(1.5125745535077024, 1.0357473030624733, 1.4917381187121208)

In [5]:
# Scaled scores: one row per query position, one column per key position.
scaled

array([[ 0.65149184, -0.83901603,  0.62913271, -0.77332466],
       [ 0.0266122 , -0.4767941 ,  0.20112173, -1.2530149 ],
       [ 0.94516122,  0.70727867,  3.48879183, -1.13196428],
       [ 1.3405138 , -0.74982342,  1.51034652, -1.03725599]])

## Masking

- Used to prevent a position from getting context from words that will only be generated in the future.
- Not required in the encoder, because there the whole input is provided simultaneously.


In [6]:
# Ones on and below the diagonal, zeros above it:
# row i marks positions 0..i as visible and later positions as hidden.
masking = np.tri(sequence_length)
masking

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [7]:
# One way to turn the 0/1 matrix into an additive mask is in-place assignment:
# masking[masking==0]= "-inf"
# masking[masking==1]= 0
# masking

### OR


In [8]:
# Additive mask: 0 where attention is allowed, -inf where it is blocked.
# It is built from a fresh lower-triangular matrix rather than from the
# current value of `masking`, so re-running this cell cannot invert the mask
# (the previous form mapped its own 0 entries to -inf on a second run).
masking = np.where(
    np.tril(np.ones((sequence_length, sequence_length))) == 0, -np.inf, 0
)
masking

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [9]:
# Adding the mask leaves allowed scores unchanged and sends blocked ones to -inf.
scaled + masking

array([[ 0.65149184,        -inf,        -inf,        -inf],
       [ 0.0266122 , -0.4767941 ,        -inf,        -inf],
       [ 0.94516122,  0.70727867,  3.48879183,        -inf],
       [ 1.3405138 , -0.74982342,  1.51034652, -1.03725599]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$


In [10]:
def softmax(x: np.ndarray) -> np.ndarray:
    """Softmax along the last axis of ``x``.

    Parameters
    ----------
    x : array_like
        Scores. Each slice along the last axis is normalised independently,
        so 1-D, 2-D and batched (3-D and higher) inputs are all accepted.
        Entries equal to ``-inf`` receive a weight of exactly 0.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.

    Notes
    -----
    A slice made up entirely of ``-inf`` has nothing to normalise and yields
    ``nan``.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum does not change the result (the factor
    # cancels in the ratio) but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    # keepdims=True lets the row sums broadcast for any number of dimensions.
    return weights / np.sum(weights, axis=-1, keepdims=True)
# lambda x:(np.exp(x).T/np.sum(np.exp(x),axis=1)).T

In [11]:
# Attention weights: row-wise softmax of the masked, scaled scores.
attention = softmax(scaled + masking)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.62325949, 0.37674051, 0.        , 0.        ],
       [0.06889859, 0.05431243, 0.87678898, 0.        ],
       [0.41640481, 0.05148655, 0.49348409, 0.03862455]])

In [12]:
# But what if the masking is removed?
# attention=softmax(scaled)
# attention

# Every row still sums to 1 because softmax returns a probability distribution,
# but without the mask that probability is spread over all positions,
# including the future ones -- which is what we don't want.

In [13]:
# Each output row is a weighted sum of the value rows, using that position's
# attention weights -- so it carries context from the positions it attends to.
new_values = attention @ value
new_values

array([[ 0.30484211,  1.62034263,  0.36740884, -0.98962018,  0.09368226,
        -1.75405572,  0.03991657, -0.06351192],
       [ 0.02030891,  0.97214632, -0.1386252 , -0.32711608, -0.27146733,
        -1.27070736,  0.2434847 , -0.01481258],
       [-0.57839079,  0.36992445, -0.56608594, -0.97314918,  0.97780242,
         0.03238621, -0.26283137,  1.59387267],
       [-0.18902735,  0.86657486, -0.19833068, -0.89469337,  0.58802447,
        -0.60829466, -0.11603942,  0.85752181]])

In [14]:
# The original value vectors, for comparison with new_values above.
value

array([[ 0.30484211,  1.62034263,  0.36740884, -0.98962018,  0.09368226,
        -1.75405572,  0.03991657, -0.06351192],
       [-0.4504077 , -0.10019523, -0.97578105,  0.76889551, -0.87555141,
        -0.47108151,  0.580257  ,  0.06575303],
       [-0.65572362,  0.30078728, -0.61406206, -1.07976532,  1.16208255,
         0.20395302, -0.33884634,  1.81876984],
       [ 0.79778368,  1.25778452,  0.05043346,  0.27569132,  0.53397663,
         1.18342807,  0.12113994, -0.43886321]])

### Binding Everything into a Class

![""](https://production-media.paperswithcode.com/methods/35184258-10f5-4cd0-8de3-bd9bc8f88dc3.png)


In [15]:
class Self_Attention:
    """Scaled dot-product attention on NumPy arrays.

    Computes ``softmax(Q @ K.T / sqrt(d_k) + mask) @ V``.

    Parameters
    ----------
    length : int
        Sequence length. It is only stored on the instance; the computation
        takes every size from the arrays themselves.
    q : np.ndarray
        Query matrix, one row per query position, ``d_k`` columns.
    k : np.ndarray
        Key matrix, one row per key position, ``d_k`` columns.
    v : np.ndarray
        Value matrix, one row per key position.
    mask : np.ndarray, optional
        Additive mask that is summed with the scaled scores before the
        softmax (0 to keep a position, ``-inf`` to block it). ``None``
        (the default) applies no mask.
    """

    def __init__(self, length: int, q: np.ndarray, k: np.ndarray, v: np.ndarray,
                 mask: Optional[np.ndarray] = None) -> None:
        self.query = q
        self.key = k
        self.value = v
        self.mask = mask
        self.length = length

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Return the softmax of ``x`` along its last axis.

        The per-row maximum is subtracted before exponentiating; this leaves
        the result unchanged but prevents overflow for large scores.
        """
        x = np.asarray(x)
        weights = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return weights / np.sum(weights, axis=-1, keepdims=True)

    def scaled_dot_product_attention(self) -> tuple:
        """Compute the attention weights and the attended values.

        Returns
        -------
        tuple
            ``(attention, output)``: ``attention`` holds the softmax weights
            (one row per query position) and ``output`` is
            ``attention @ value``.
        """
        # Queries and keys are multiplied together, so they share this width.
        dimension_of_keys = self.query.shape[-1]
        scaled = np.matmul(self.query, self.key.T)/math.sqrt(dimension_of_keys)
        if self.mask is not None:
            scaled = scaled+self.mask
        # Use the method on this class, not a module-level `softmax`, so the
        # class works even when no such global function has been defined.
        attention = self.softmax(scaled)
        output = np.matmul(attention, self.value)

        return attention, output

In [21]:
# Run the class on the same Q, K, V and mask used in the step-by-step cells.
test = Self_Attention(
    length=sequence_length, q=query, k=key, v=value, mask=masking
)
attention, output = test.scaled_dot_product_attention()

print("\nAttention Scores:\n")
print(attention)

print("\nOutput:\n")
print(output)


Attention Scores:

[[1.         0.         0.         0.        ]
 [0.62325949 0.37674051 0.         0.        ]
 [0.06889859 0.05431243 0.87678898 0.        ]
 [0.41640481 0.05148655 0.49348409 0.03862455]]

Output:

[[ 0.30484211  1.62034263  0.36740884 -0.98962018  0.09368226 -1.75405572
   0.03991657 -0.06351192]
 [ 0.02030891  0.97214632 -0.1386252  -0.32711608 -0.27146733 -1.27070736
   0.2434847  -0.01481258]
 [-0.57839079  0.36992445 -0.56608594 -0.97314918  0.97780242  0.03238621
  -0.26283137  1.59387267]
 [-0.18902735  0.86657486 -0.19833068 -0.89469337  0.58802447 -0.60829466
  -0.11603942  0.85752181]]
