In [None]:
'''
Self Attention mechanism works with 3(Three) vectors, Query (Q), Key(K) and Value(V)
Query(Q): what I am looking for?
Key(K): What I can offer?
Value(V): What I actually offer? '''

'Self Attention mechanism works with 3(Three) vectors, Query (Q), Key(K) and Value(V)\nQuery(Q): what I am looking for?\nKey(K): What I can offer?\nValue(V): What I actually offer? '

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{Q K^T}{\sqrt{d_k}} + M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$

In [None]:
import numpy as np
import math

In [None]:
# Build small random Q, K and V matrices to walk through self-attention by hand.

# L is the length of the input sequence (four words here); d_k is the width of
# each query/key vector and d_v is the width of each value vector.
L, d_k, d_v = 4, 8, 8

# Queries and keys share the same width d_k on purpose: the attention score is
# a dot product between a query and a key, and a dot product is only defined
# when both vectors have the same dimensionality.

# Use a seeded local generator so that "Restart Kernel -> Run All" always
# produces the same matrices.
rng = np.random.default_rng(42)

Q = rng.standard_normal((L, d_k))  # rows x columns: one row per word
K = rng.standard_normal((L, d_k))
V = rng.standard_normal((L, d_v))

print("Q\n", Q)
print("K\n", K)
print("V\n", V)

Q
 [[ 1.71085486 -0.03367367  1.09812443  1.09528995 -0.39000119 -0.45841432
  -0.48539386  1.5718894 ]
 [ 0.00965999  0.54801593  0.75797904  0.67755443  1.90984137  1.62987475
  -0.33656417  1.08604924]
 [ 0.12680977 -0.47276565 -1.48679567  0.19848653  0.19646147 -0.01184547
   1.69037273  0.47946585]
 [ 0.57315114 -0.63954299 -1.77893003 -0.22080153  0.34005573  0.51011648
   0.15552844  0.10861496]]
K
 [[-0.59194089 -0.81207564  1.46555827 -1.79502953  0.48312732 -0.86651849
  -1.86632717 -0.77304799]
 [ 0.83096345 -1.55802823  0.44207751  0.17283549 -0.4557931   0.51542019
   0.20530228  1.55508918]
 [ 2.32427521 -0.04541923  2.12979551 -0.72525274 -0.36016954 -0.771219
   1.25401108 -0.17451855]
 [-2.03744382  1.8421055   0.91399836 -0.97696761 -1.67673923  0.66197614
   0.34689897 -1.02507032]]
V
 [[-0.876697   -1.33901344 -0.77828713  0.00691392 -1.23706158 -1.20140183
  -0.12382458 -1.19784545]
 [ 0.09513252  0.94729865 -0.34992144  0.72716694  0.80692249  0.76361469
   1.716

In [None]:
'''In the output above, each row of Q, K and V represents one word of the
input sequence as a vector of 8 values.
For example, for the sentence "I live in Montreal" (L=4), the word "I" is
represented by the first row of each matrix:
Q [ 1.71085486 -0.03367367  1.09812443  1.09528995 -0.39000119 -0.45841432
  -0.48539386  1.5718894 ],
K [-0.59194089 -0.81207564  1.46555827 -1.79502953  0.48312732 -0.86651849
  -1.86632717 -0.77304799] and
V [-0.876697   -1.33901344 -0.77828713  0.00691392 -1.23706158 -1.20140183
  -0.12382458 -1.19784545].
So the first row of Q, of K and of V all belong to the word "I".'''

In [None]:
""" we calculated the focus of each word on which other word,
mean, calculating the attention score. For example, for the first vector below,
the first word is focusing on the third word as in the first vector the third value
is highest 5.133"""

np.matmul(Q, K.T) # key - Transposed to match the matmul shape.

array([[-1.44252979,  4.43514305,  5.13343669, -5.04338883],
       [-1.25716568,  1.1957776 , -1.43594383, -2.33273404],
       [-5.64667572,  1.21598145, -1.01986374, -2.92444943],
       [-2.68266133,  0.95687194, -2.60722193, -4.04597725]])

In [None]:
"""Why does the formula divide by sqrt(d_k)?

The division keeps the variance of the scores Q @ K.T under control.
Check it first without the scaling: the variance of the raw scores is much
higher than the variance of Q or K and is not in the same range."""

Q.var(), K.var(), (Q @ K.T).var()

(0.7526859311070544, 1.3816413704695116, 8.696591881898748)

In [None]:
""" so lets devide by root of dk and
 check the variance of scaled which pretty low than before and sam range"""
scaled = np.matmul(Q, K.T) / math.sqrt(d_k)
#check the variance now
Q.var(), K.var(), scaled.var()

(0.7526859311070544, 1.3816413704695116, 1.0870739852373434)

In [None]:
# Display the scaled attention scores (raw scores divided by sqrt(d_k)).
scaled

array([[-0.5100113 ,  1.56805986,  1.81494395, -1.78310722],
       [-0.44447519,  0.42277123, -0.50768281, -0.82474603],
       [-1.99640135,  0.42991437, -0.36057628, -1.03394901],
       [-0.94846401,  0.33830532, -0.92179215, -1.43046898]])

**Masking**
>Masking is needed in the decoder part of the transformer, not in the encoder.
>It is needed to ensure that the current word does not get context from words that will be generated in the future.

In [None]:
"""here we can see the first word only can see itself, can see next words
simillary for the other words as well"""
mask = np.tril(np.ones((L,L))) # created a triange martix
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [None]:
""" here we are placing the zero in place of 1 and -infinity in case of 0,
becuase, we want same value as scaled in case of 1 value and no information
in case of 0, as we will apply the softmax then the -inf will be become zero"""
mask[mask == 0] = -np.infty
mask[mask == 1] = 0

In [None]:
# Display the additive mask: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [None]:
# Add the mask to the scaled scores: allowed positions keep their score,
# blocked positions become -inf.
scaled+mask

array([[-0.5100113 ,        -inf,        -inf,        -inf],
       [-0.44447519,  0.42277123,        -inf,        -inf],
       [-1.99640135,  0.42991437, -0.36057628,        -inf],
       [-0.94846401,  0.33830532, -0.92179215, -1.43046898]])

## Softmax

$$
\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [None]:
def softmax(x):
  """Return the softmax of `x` computed along its last axis.

  Every slice along the last axis is exponentiated and divided by its own sum,
  so each of those slices sums to 1. Entries equal to -inf receive weight 0.

  Args:
    x: array-like of scores with any number of dimensions.

  Returns:
    A NumPy array with the same shape as `x`.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; the max reduction below is undefined on empty input.
    return np.exp(x)
  # Subtracting the per-row maximum leaves the result unchanged mathematically
  # but prevents exp() from overflowing on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the sums broadcast against `exps` for any number of leading
  # dimensions, so no transposes are needed.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [None]:
""" As the softmax explain as probability, so sum of each row will be 1"""
attention = softmax(scaled)
attention

array([[0.0512946 , 0.40979482, 0.52454996, 0.01436062],
       [0.19988918, 0.47580567, 0.18764568, 0.13665948],
       [0.04982831, 0.56391251, 0.25580322, 0.13045596],
       [0.15960052, 0.57792451, 0.16391464, 0.09856034]])

In [None]:
""" As the softmax explain as probability, so sum of each row will be 1"""
attention_mask = softmax(scaled+mask)
attention_mask

array([[1.        , 0.        , 0.        , 0.        ],
       [0.29582759, 0.70417241, 0.        , 0.        ],
       [0.05730396, 0.64851518, 0.29418086, 0.        ],
       [0.15960052, 0.57792451, 0.16391464, 0.09856034]])

In [None]:
# Mix the value vectors using the attention weights: each row of the result is
# the weighted sum of the rows of V for one word.
new_v = attention @ V
new_v

array([[-0.62826739,  0.25738151,  0.099283  , -0.36991247,  0.56932474,
         0.48257096, -0.15919262, -0.13640537],
       [-0.37438932,  0.07473878, -0.25217896, -0.05640966,  0.37412951,
         0.39665387,  0.6284082 , -0.36938407],
       [-0.31396822,  0.35634584, -0.12758447, -0.07010205,  0.6621748 ,
         0.66256466,  0.67821785, -0.20921284],
       [-0.29499245,  0.25275278, -0.26045559,  0.09470619,  0.45585986,
         0.45826315,  0.80681023, -0.35131703]])

In [None]:
def softmax(x):
  """Return the softmax of `x` computed along its last axis.

  Every slice along the last axis is exponentiated and divided by its own sum,
  so each of those slices sums to 1. Entries equal to -inf receive weight 0.

  Args:
    x: array-like of scores with any number of dimensions.

  Returns:
    A NumPy array with the same shape as `x`.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; the max reduction below is undefined on empty input.
    return np.exp(x)
  # Subtracting the per-row maximum leaves the result unchanged mathematically
  # but prevents exp() from overflowing on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the sums broadcast for any number of leading dimensions.
  return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V, mask=None):
  """Compute softmax(Q K^T / sqrt(d_k) + mask) V.

  Args:
    Q: query array of shape (..., L_q, d_k).
    K: key array of shape (..., L_k, d_k).
    V: value array of shape (..., L_k, d_v).
    mask: optional additive mask that broadcasts against the (..., L_q, L_k)
      scores: 0 where attention is allowed, -inf where it is blocked.

  Returns:
    A tuple (out, attention): `out` has shape (..., L_q, d_v) and `attention`
    holds the softmax weights with shape (..., L_q, L_k).
  """
  d_k = Q.shape[-1]
  # Transpose only the last two axes of K. For 2-D input this equals K.T, and
  # for batched input it keeps the leading batch dimensions in place.
  scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / math.sqrt(d_k)
  if mask is not None:
    scores = scores + mask
  attention = softmax(scores)
  out = np.matmul(attention, V)
  return out, attention

In [None]:
# Run the complete masked attention computation once and print the inputs
# followed by the results.
values, attention = scaled_dot_product_attention(Q, K, V, mask=mask)

labelled_arrays = (
    ("Q\n", Q),
    ("K\n", K),
    ("V\n", V),
    ("New_v\n", values),
    ("Attention\n", attention),
)
for heading, array in labelled_arrays:
  print(heading, array)

Q
 [[ 1.71085486 -0.03367367  1.09812443  1.09528995 -0.39000119 -0.45841432
  -0.48539386  1.5718894 ]
 [ 0.00965999  0.54801593  0.75797904  0.67755443  1.90984137  1.62987475
  -0.33656417  1.08604924]
 [ 0.12680977 -0.47276565 -1.48679567  0.19848653  0.19646147 -0.01184547
   1.69037273  0.47946585]
 [ 0.57315114 -0.63954299 -1.77893003 -0.22080153  0.34005573  0.51011648
   0.15552844  0.10861496]]
K
 [[-0.59194089 -0.81207564  1.46555827 -1.79502953  0.48312732 -0.86651849
  -1.86632717 -0.77304799]
 [ 0.83096345 -1.55802823  0.44207751  0.17283549 -0.4557931   0.51542019
   0.20530228  1.55508918]
 [ 2.32427521 -0.04541923  2.12979551 -0.72525274 -0.36016954 -0.771219
   1.25401108 -0.17451855]
 [-2.03744382  1.8421055   0.91399836 -0.97696761 -1.67673923  0.66197614
   0.34689897 -1.02507032]]
V
 [[-0.876697   -1.33901344 -0.77828713  0.00691392 -1.23706158 -1.20140183
  -0.12382458 -1.19784545]
 [ 0.09513252  0.94729865 -0.34992144  0.72716694  0.80692249  0.76361469
   1.716