In [None]:
import numpy as np
import math

### Defining the query, key and value vectors and their dimensions. In the "Attention Is All You Need" paper, the Wq, Wk and Wv matrices each have dimensions 512 x 64, because the input (I/P) embedding vectors have dimension 512. So, if we have 4 words, the I/P has dimension 4 x 512. After these self-attention projections, the query, key and value vectors of each word have dimension 1 x 64. Hence, for all the input words together, each of them forms an output (O/P) matrix of dimension 4 x 64.

### To understand the mechanism, we do not consider the matrices WQ, WK and WV. Instead, we directly initialize the query, key and value vectors at random, each with dimension 1 x 8. Since we have 4 words in the I/P sentence, these will look like 4 x 8 matrices.

In [None]:
SEED = 42  # Fixed seed so the random vectors are identical on every Restart & Run All
rng = np.random.default_rng(SEED)

L = 4  # Number of words in the I/P sentence

# Dimensions of the vectors
d_k = 8  # length of each query / key vector
d_v = 8  # length of each value vector

# Random initialization of the vectors (standard normal), one row per word
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [None]:
# Show the randomly initialised query matrix: one row per word, d_k columns.
q

array([[-0.73295046, -1.01856123, -0.05692238,  0.84549785,  0.73518234,
         0.28877697, -0.68057301,  1.60372692],
       [-0.59850236,  1.31942612,  1.07497577,  0.27911664, -1.43302758,
         0.63542193, -0.11069461, -0.46895461],
       [-2.51227147,  1.28685185,  0.41192541, -0.40138432, -1.1682551 ,
        -0.99897481, -1.67014039, -1.33189188],
       [-0.4541441 ,  0.49158284, -1.07186903, -0.36634175,  0.38987809,
        -0.80854428,  0.33013949,  0.80924413]])

In [None]:
# Show the randomly initialised key matrix: one row per word, d_k columns.
k

array([[-0.79680821, -1.00435721, -0.08761221, -0.17929451, -0.61332812,
        -2.03886942,  0.68464872, -0.21430272],
       [ 1.05226059, -0.18622262,  0.7634342 ,  0.56260192,  0.07880734,
        -0.57725068,  0.3289039 ,  1.33388147],
       [-0.16095804, -0.09937703,  0.06683818, -0.79259057,  0.6560552 ,
         0.45312437,  0.77328347,  0.74865733],
       [ 1.00119153,  1.76478707,  0.15744213, -1.33803559, -1.58144575,
        -1.76573614,  0.5914121 ,  0.01702544]])

In [None]:
# Show the randomly initialised value matrix: one row per word, d_v columns.
v

array([[-0.24141684, -2.17324842,  0.42929517, -1.64532319,  0.65945414,
         0.13581085,  2.5898868 , -1.92892245],
       [-0.21441448, -1.52829474,  0.60023029,  1.00615942,  0.99322276,
         0.85164205,  0.38279799, -1.13031028],
       [-1.05493284, -0.23369413,  1.29379467, -1.2126394 ,  0.32959178,
        -1.23363966, -1.06027237, -0.46822239],
       [ 0.35938068, -1.09129034,  1.23356365, -0.51658256, -1.06698389,
         0.16209556, -0.41461667, -0.15328363]])

## Self Attention Mechanism

### The crux of attention is the matrix multiplication of the query vectors with the key vectors. It boils down to multiplying "What do I want?" (the query) by "What can I offer?" (the key).

In [None]:
# Raw attention scores: entry (i, j) is the dot product of query i with key j.
np.matmul(q, k.T)

array([[-0.38890882,  1.65722606,  0.83279578, -5.71938555],
       [-1.38442352, -1.0394512 , -1.27306734,  2.59588676],
       [ 2.64048738, -4.6358566 , -2.8855697 ,  2.95871622],
       [ 1.48973224,  0.09165008,  0.99351174,  1.75440582]])

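### Scaling these values by the square root of d_k brings their variance back to ~1. Each score is a sum of d_k products of roughly unit-variance terms, so the unscaled variance grows with d_k.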

In [None]:
# Divide the raw query-key scores by sqrt(d_k).
scaled = np.matmul(q, k.T) / math.sqrt(d_k)

In [None]:
# Show the scaled scores (q @ k.T divided by sqrt(d_k)).
scaled

array([[-0.13750003,  0.58591789,  0.29443777, -2.02210815],
       [-0.48946763, -0.3675015 , -0.45009727,  0.91778457],
       [ 0.93355327, -1.63902282, -1.02020295,  1.04606415],
       [ 0.52669988,  0.0324032 ,  0.35125944,  0.62027613]])

### Masking for the decoder!

In [None]:
# Lower-triangular matrix of ones: row i has ones in columns 0..i and zeros after.
mask = np.tril(np.ones(shape=(L, L)), k=0)

In [None]:
# Show the lower-triangular matrix of ones (1 on and below the diagonal, 0 above).
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [None]:
# Build the additive causal mask: 0.0 on and below the diagonal (positions a
# query may attend to) and -inf above it (future positions).
# It is rebuilt from L rather than mutated in place, so re-running this cell
# gives the same mask. The old in-place version turned every entry into -inf
# on a second run. np.inf replaces the alias np.infty, removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L), dtype=bool)), 0.0, -np.inf)

In [None]:
# Show the converted mask: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [None]:
# Adding the mask leaves allowed scores unchanged (+0) and sends blocked ones to -inf.
np.add(scaled, mask)

array([[-0.13750003,        -inf,        -inf,        -inf],
       [-0.48946763, -0.3675015 ,        -inf,        -inf],
       [ 0.93355327, -1.63902282, -1.02020295,        -inf],
       [ 0.52669988,  0.0324032 ,  0.35125944,  0.62027613]])

In [None]:
def softmax(x):
  """Numerically stable softmax over the last axis of ``x``.

  For a 2-D input this is the row-wise softmax. A 1-D or N-D input is
  normalised along its last axis.

  Args:
    x: array-like of scores. Entries may be ``-inf`` (e.g. masked positions);
      they receive probability 0.

  Returns:
    ``np.ndarray`` of the same shape as ``x`` whose entries along the last
    axis are non-negative and sum to 1.

  Note:
    A slice made up entirely of ``-inf`` has no valid position to normalise
    over and yields ``nan`` for that slice.
  """
  x = np.asarray(x, dtype=float)
  # Subtracting the per-row maximum leaves the result unchanged mathematically
  # but keeps np.exp from overflowing to inf (inf / inf = nan) on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the row sums broadcast against exps without transposing.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [None]:
def scaledDotProductAttention(q, k, v, mask=None):
  """Compute scaled dot-product attention: softmax(q @ k.T / sqrt(d_k)) @ v.

  Args:
    q: query matrix; the size of its last axis is used as d_k.
    k: key matrix; it is transposed with ``.T`` before the product with ``q``.
    v: value matrix, multiplied by the attention weights.
    mask: optional array added to the scaled scores before the softmax
      (e.g. 0 for allowed positions, -inf for blocked ones). ``None`` skips it.

  Returns:
    A tuple ``(out, attention)``: the attention-weighted values and the
    attention weights produced by ``softmax``.
  """
  scale = math.sqrt(q.shape[-1])
  scores = (q @ k.T) / scale

  # The mask is additive, so it is simply summed onto the scores when given.
  if mask is not None:
    scores = scores + mask

  weights = softmax(scores)
  return weights @ v, weights

In [None]:
# Pass the causal mask so each word attends only to itself and earlier words.
# The saved outputs below (lower-triangular attention, first output row equal
# to v[0]) correspond to this masked call, not to mask=None.
values, attention = scaledDotProductAttention(q, k, v, mask=mask)

In [None]:
# Show the output matrix returned by scaledDotProductAttention (attention weights @ v).
values

array([[-0.24141684, -2.17324842,  0.42929517, -1.64532319,  0.65945414,
         0.13581085,  2.5898868 , -1.92892245],
       [-0.22709334, -1.83113029,  0.51996835, -0.23883418,  0.83650297,
         0.51552623,  1.41912818, -1.50529559],
       [-0.3343886 , -1.90713389,  0.54060457, -1.42880243,  0.64198774,
         0.02131811,  2.02681827, -1.70889937],
       [-0.24322313, -1.2685952 ,  0.91216408, -0.7375808 ,  0.1043722 ,
        -0.0568967 ,  0.41679405, -0.89637671]])

In [None]:
# Show the attention weights returned by scaledDotProductAttention (softmax of the scores).
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.46954621, 0.53045379, 0.        , 0.        ],
       [0.82096461, 0.06267132, 0.11636407, 0.        ],
       [0.28191397, 0.17196746, 0.23655045, 0.30956813]])