In [2]:
import numpy as np
import pandas as pd
import math

In [3]:
# Toy self-attention inputs for a four-word sentence ("my name is vibhav").
# L   - length of the input sequence (one row per word)
# d_k - size of each query/key vector
# d_v - size of each value vector
# q, k and v therefore each have shape (4, 8).
L, d_k, d_v = 4, 8, 8
# Seeded generator so that Restart & Run All reproduces the same matrices.
# (Outputs saved before the seed was added came from an unseeded run.)
rng = np.random.default_rng(42)
# query vector - what am I looking for?
q = rng.standard_normal((L, d_k))
# key vector - what can I offer?
k = rng.standard_normal((L, d_k))
# value vector - what do I actually offer?
v = rng.standard_normal((L, d_v))

In [4]:
# Print the three input matrices, each under its own label.
for label, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
    print(label, matrix)

Q
 [[ 0.2751041   0.4958448   1.51066295 -0.14032124 -1.70178345  1.27926355
   0.17624542 -0.59921872]
 [-0.28972129 -0.36650107  0.06336092 -0.77934791  0.03204285 -2.38194955
   0.29029974  0.09106863]
 [-0.33995996  0.65953047  0.83787033 -0.0877142   2.24189994 -1.25214464
  -1.11616416  0.3537038 ]
 [-0.25124162  1.18038437  1.01302541  1.19042449  0.25393742 -0.21432398
   0.81623457  0.37074504]]
K
 [[-0.83647365  0.22651162 -0.53878076 -0.08790401 -0.89791675 -0.61861813
  -1.21527165  0.22252373]
 [-1.45747415 -1.1259484  -0.48800815 -0.1256447   0.67235244  0.46567906
  -0.558623    0.01859098]
 [ 1.10837112 -0.8465118  -2.74098825 -1.44178702 -1.36898869 -2.25308163
  -1.23695635 -0.39828107]
 [-1.16720464  0.27535859  0.891918   -0.53092949  0.48584261  1.05658101
  -0.50072836 -0.84319769]]
V
 [[ 0.21586481  1.45565479 -0.19042048 -0.59953014  1.3611286  -0.6794735
  -0.24616001 -0.26306417]
 [ 1.45317943  1.66992393  0.1658293   0.46269172 -0.29970686 -1.4233359
  -0.186

In [5]:
# To build the self-attention matrix, every word has to be compared with every
# other word to see how strong its affinity towards that word is (the vectors
# themselves can come from any embedding technique).
# The query (Q) plays the role of "what am I looking for?": the dot product of
# each query row with each key row gives one raw affinity score.
q @ k.T

array([[-0.53022624, -2.33690479, -4.58513039,  2.17917594],
       [ 1.30591489, -0.34623213,  5.86661238, -2.01576871],
       [ 0.1867509 ,  0.90935888, -2.11340843,  1.39916299],
       [-1.17779106, -1.98495452, -6.79276289,  0.0653833 ]])

In [6]:
# Why is sqrt(d_k) in the denominator? To keep the variance of the scores small:
# compare the variance of q and k with the variance of the raw dot products.
np.var(q), np.var(k), np.var(q @ k.T)

(0.8679531007962409, 0.7981594453041116, 7.772860600917533)

In [7]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.8679531007962409, 0.7981594453041116, 0.9716075751146913)

In [8]:
# Display the scaled scores (rows correspond to rows of q, columns to rows of k).
scaled

array([[-0.18746328, -0.82622061, -1.6210884 ,  0.77045504],
       [ 0.46171064, -0.12241154,  2.0741607 , -0.71268186],
       [ 0.06602641,  0.32150692, -0.74720272,  0.49467882],
       [-0.41641202, -0.7017874 , -2.40160435,  0.02311649]])

# Masking
* Masking ensures that a word does not get context from words that are generated in the future
* Encoders do not use this concept, but it is essential in decoders
* It prevents the model from looking at a future word while generating the context for the current one
### Why it is not required for encoders
* The encoder receives all of its inputs simultaneously, so every word is allowed to look at every other word

In [10]:
# Lower-triangular mask: row i holds ones for the positions word i may look at.
#   "my"     can look only at itself
#   "name"   can look at "my name"
#   "is"     can look at "my name is"
#   "vibhav" can look at "my name is vibhav"
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [None]:
# Why -inf?
# 1. The mask is ADDED to the scaled scores, so allowed positions must
#    contribute 0 and blocked positions must become -inf.
# 2. Softmax is applied next: exp(-inf) == 0, so blocked positions receive
#    exactly zero weight while the other values stay between 0 and 1.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
# The mask is rebuilt from the lower-triangular pattern instead of being
# edited in place, so running this cell more than once gives the same result.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [12]:
# Display the additive mask (0 where attention is allowed, -inf where it is blocked).
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [13]:
# No context is taken from the blocked positions, so their scores are
# replaced by -inf; this prepares the scores for the softmax operation
# that is performed next.
np.add(scaled, mask)

array([[-0.18746328,        -inf,        -inf,        -inf],
       [ 0.46171064, -0.12241154,        -inf,        -inf],
       [ 0.06602641,  0.32150692, -0.74720272,        -inf],
       [-0.41641202, -0.7017874 , -2.40160435,  0.02311649]])

# Softmax
* This operation converts a vector into a probability distribution, so that its values add up to 1 and are interpretable and stable

In [14]:
def softmax(x):
  """Return the softmax of ``x`` computed along its last axis.

  Args:
    x: array-like of scores with at least one dimension; entries may be
      ``-inf`` (masked positions), which receive a weight of exactly 0.

  Returns:
    ndarray with the same shape as ``x`` whose values along the last axis
    add up to 1. A row made up only of ``-inf`` yields NaN.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty reduction axis.
    return np.exp(x)
  # Subtracting the row maximum does not change the result mathematically
  # but keeps np.exp from overflowing on large scores.
  exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
  # keepdims=True lets the division broadcast for inputs of any rank.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [15]:
# Softmax WITHOUT the mask, shown only for comparison.
# This is not what we want for a decoder: every row adds up to 1 across all
# four columns, so the future positions also receive attention weight.
attention_diff = softmax(scaled)
attention_diff

array([[0.22869393, 0.12073852, 0.05453046, 0.59603709],
       [0.14531324, 0.08102595, 0.72875804, 0.04490276],
       [0.23420887, 0.30238282, 0.10385381, 0.35955449],
       [0.290608  , 0.21845934, 0.03991623, 0.45101644]])

In [16]:
# This is the right method: softmax is applied after the mask has been added.
# Row 1: the first column alone is 1.
# Row 2: the first two columns add up to 1.
# Row 3: the first three columns add up to 1.
# Row 4: all four columns add up to 1.
attention = softmax(scaled + mask)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.64201537, 0.35798463, 0.        , 0.        ],
       [0.3656968 , 0.4721445 , 0.1621587 , 0.        ],
       [0.290608  , 0.21845934, 0.03991623, 0.45101644]])

In [17]:
# Multiply the attention weights by the value matrix to get the new values.
new_v = attention @ v
new_v

array([[ 0.21586481,  1.45565479, -0.19042048, -0.59953014,  1.3611286 ,
        -0.6794735 , -0.24616001, -0.26306417],
       [ 0.65880443,  1.53235985, -0.06288854, -0.21927104,  0.76657503,
        -0.94576481, -0.22488046, -0.31817652],
       [ 0.69123078,  1.16698786,  0.0163807 ,  0.07353868,  0.3873938 ,
        -1.00325857, -0.34850334, -0.2663579 ],
       [ 0.32752571,  0.28260301,  0.47698963,  0.08972168,  0.2645779 ,
        -0.20508967, -0.55484926,  0.32459222]])

In [18]:
# Display the original value matrix for comparison with new_v.
v

array([[ 0.21586481,  1.45565479, -0.19042048, -0.59953014,  1.3611286 ,
        -0.6794735 , -0.24616001, -0.26306417],
       [ 1.45317943,  1.66992393,  0.1658293 ,  0.46269172, -0.29970686,
        -1.4233359 , -0.18671738, -0.41701589],
       [-0.45523907, -0.94836636,  0.04761672,  0.45836321,  0.19202407,
        -0.51034611, -1.05036504,  0.16487293],
       [-0.07648394, -1.03627527,  1.09574655,  0.32055258, -0.16222928,
         0.71767422, -0.88820804,  1.07659199]])

# Combined Code

In [19]:
def softmax(x):
  """Return the softmax of ``x`` computed along its last axis.

  Args:
    x: array-like of scores with at least one dimension; entries may be
      ``-inf`` (masked positions), which receive a weight of exactly 0.

  Returns:
    ndarray with the same shape as ``x`` whose values along the last axis
    add up to 1. A row made up only of ``-inf`` yields NaN.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty reduction axis.
    return np.exp(x)
  # Subtracting the row maximum does not change the result mathematically
  # but keeps np.exp from overflowing on large scores.
  exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
  # keepdims=True lets the division broadcast for inputs of any rank.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q,k,v,mask=None):
  """Compute scaled dot-product attention: softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: query array; its last axis has size d_k.
    k: key array whose last axis matches the last axis of ``q``.
    v: value array with one row per key.
    mask: optional additive mask that is added to the scaled scores before
      the softmax (0 keeps a position, ``-inf`` blocks it). ``None`` means
      no masking.

  Returns:
    Tuple ``(out, attention)``: the attention-weighted values and the
    attention weights themselves.
  """
  d_k = q.shape[-1]
  # For arrays with more than two dimensions only the last two axes are
  # transposed; ``k.T`` would reverse every axis, including the leading ones.
  k_t = np.swapaxes(k, -1, -2) if np.ndim(k) > 2 else k.T
  scores = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scores = scores + mask
  attention = softmax(scores)
  out = np.matmul(attention, v)
  return out, attention

In [20]:
# Encoder case: no mask, so every word attends to every other word.
values, attention = scaled_dot_product_attention(q, k, v, mask=None)
for label, matrix in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
    print(label, matrix)

Q
 [[ 0.2751041   0.4958448   1.51066295 -0.14032124 -1.70178345  1.27926355
   0.17624542 -0.59921872]
 [-0.28972129 -0.36650107  0.06336092 -0.77934791  0.03204285 -2.38194955
   0.29029974  0.09106863]
 [-0.33995996  0.65953047  0.83787033 -0.0877142   2.24189994 -1.25214464
  -1.11616416  0.3537038 ]
 [-0.25124162  1.18038437  1.01302541  1.19042449  0.25393742 -0.21432398
   0.81623457  0.37074504]]
K
 [[-0.83647365  0.22651162 -0.53878076 -0.08790401 -0.89791675 -0.61861813
  -1.21527165  0.22252373]
 [-1.45747415 -1.1259484  -0.48800815 -0.1256447   0.67235244  0.46567906
  -0.558623    0.01859098]
 [ 1.10837112 -0.8465118  -2.74098825 -1.44178702 -1.36898869 -2.25308163
  -1.23695635 -0.39828107]
 [-1.16720464  0.27535859  0.891918   -0.53092949  0.48584261  1.05658101
  -0.50072836 -0.84319769]]
V
 [[ 0.21586481  1.45565479 -0.19042048 -0.59953014  1.3611286  -0.6794735
  -0.24616001 -0.26306417]
 [ 1.45317943  1.66992393  0.1658293   0.46269172 -0.29970686 -1.4233359
  -0.186

In [21]:
# Decoder case: the mask is passed in and added to the scaled scores.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for label, matrix in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
    print(label, matrix)

Q
 [[ 0.2751041   0.4958448   1.51066295 -0.14032124 -1.70178345  1.27926355
   0.17624542 -0.59921872]
 [-0.28972129 -0.36650107  0.06336092 -0.77934791  0.03204285 -2.38194955
   0.29029974  0.09106863]
 [-0.33995996  0.65953047  0.83787033 -0.0877142   2.24189994 -1.25214464
  -1.11616416  0.3537038 ]
 [-0.25124162  1.18038437  1.01302541  1.19042449  0.25393742 -0.21432398
   0.81623457  0.37074504]]
K
 [[-0.83647365  0.22651162 -0.53878076 -0.08790401 -0.89791675 -0.61861813
  -1.21527165  0.22252373]
 [-1.45747415 -1.1259484  -0.48800815 -0.1256447   0.67235244  0.46567906
  -0.558623    0.01859098]
 [ 1.10837112 -0.8465118  -2.74098825 -1.44178702 -1.36898869 -2.25308163
  -1.23695635 -0.39828107]
 [-1.16720464  0.27535859  0.891918   -0.53092949  0.48584261  1.05658101
  -0.50072836 -0.84319769]]
V
 [[ 0.21586481  1.45565479 -0.19042048 -0.59953014  1.3611286  -0.6794735
  -0.24616001 -0.26306417]
 [ 1.45317943  1.66992393  0.1658293   0.46269172 -0.29970686 -1.4233359
  -0.186