In [1]:
import numpy as np 
import math

In [3]:
# L        - length of the input sequence (number of words)
# d_k, d_v - size of each query/key vector and of each value vector
#            (8 here, purely for illustration)
L, d_k, d_v = 4, 8, 8

# Draw Q, K and V from the standard normal distribution using a seeded generator,
# so that Restart Kernel -> Run All reproduces the same matrices every time.
# NOTE: outputs saved in this notebook before the seed was introduced came from
# an unseeded run and will not match the numbers produced here.
SEED = 0
rng = np.random.default_rng(SEED)
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [6]:
# Every entry was drawn from the standard normal distribution.
# Each word gets one row of length 8, so every matrix is 4 x 8
# (L x d_k for Q and K, L x d_v for V).
for header, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
  print(header, matrix)

Q
 [[ 0.44008364 -0.65893923 -0.82743603  2.07134499 -0.756638    0.18606377
  -2.52939241 -0.99671064]
 [ 0.45847306  0.13226699  0.39716341 -0.35557593  0.92675916  0.86731148
  -1.0848844  -0.75466212]
 [-1.73679108  0.50542825  0.67550296 -0.48343931 -0.78572117 -0.57689201
  -0.52532605 -1.41217445]
 [-2.09387521 -0.89363413 -0.59569919 -1.71644659 -0.36390696 -0.30201351
   0.22337068 -0.50049467]]
K
 [[-0.97577445  0.8762965  -2.38211946 -1.7001989  -2.1416398   0.14183991
   0.82795221  0.06947477]
 [ 0.75104729  0.51849052  0.78832873  0.55552962  1.34575561  0.47145897
   0.51505236  1.26781612]
 [-0.43040316  1.29402735 -0.45535669 -0.2645018   0.40588437 -0.22652967
   0.51160386 -1.89667442]
 [ 0.13683429 -1.27318423 -0.18401422  1.90297292 -0.61696554  1.93138142
  -0.68380964  0.23601658]]
V
 [[ 1.5062644  -1.35215475  0.48031349 -0.28786724 -0.40148487  2.23991922
  -2.90143267 -1.0343477 ]
 [ 1.38140753 -0.31255013  1.31883032 -0.84232214 -2.56852263  0.81663448
   1.7

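# For the formula of self-attention, refer to the notes

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^{T}}{\sqrt{d_k}} + M\right)V$$

Here $M$ is the optional additive mask (needed only in the decoder).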

In [8]:
# Self-attention: on the encoder side every word has to look at every other word,
# so that we can measure the similarity/affinity between each pair of words.
# First step - raw scores: entry [i, j] is the dot product of query i with key j.
# In each row the largest value marks the word that row focuses on the most.
# For the output shown below (0-based column index of the row maximum):
#   my     -> index 3
#   name   -> index 2
#   is     -> index 2
#   Vibhav -> index 0
q @ k.T

array([[-3.07412046, -3.00967196, -0.96606258,  7.31370534],
       [-3.48542943,  0.66902454,  0.94303088,  0.81167197],
       [ 2.41828564, -4.16871647,  3.44328164, -2.5289391 ],
       [ 6.48407724, -4.61068825,  1.45434592, -2.93514059]])

In [11]:
# Why do we need the denominator sqrt(d_k)?
# Answer: to reduce the variance of the q @ k.T scores and thereby stabilize them.
# In the output below the first two values (variance of q and of k) are close to each
# other, whereas the third value (variance of the raw scores) is far larger than both.
np.var(q), np.var(k), np.var(q @ k.T)

(0.8851675882015266, 1.1668235844412789, 12.65558917253314)

In [13]:
# Dividing the scores by sqrt(d_k) divides their variance by d_k,
# which brings it back into the same range as the variance of q and k.
scaled = (q @ k.T) / math.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.8851675882015266, 1.1668235844412789, 1.581948646566643)

In [14]:
# The scaled scores are smaller in magnitude than the raw q @ k.T scores
# and all lie in a similar range.
scaled

array([[-1.08686571, -1.06407973, -0.3415547 ,  2.58578532],
       [-1.23228539,  0.2365359 ,  0.33341177,  0.28696938],
       [ 0.85499309, -1.47386384,  1.2173839 , -0.89411499],
       [ 2.29246749, -1.63012447,  0.51418893, -1.03772891]])

# Masking
* Masking is required specifically in the decoder part of the transformer, so that we don't look at a future word while generating the current word.
* For the **Encoder**, masking is not required, **because all the inputs are passed simultaneously**.

In [18]:
# Build an L x L lower-triangular matrix of ones:
# entry [i, j] is 1 when j <= i (word i may look at word j) and 0 when j > i (a future word).
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [19]:
# Turn the 0/1 lower-triangular pattern into an additive mask:
#   1 (position may be attended to) -> 0     (adding 0 leaves the score unchanged)
#   0 (future position)             -> -inf  (exp(-inf) = 0, so softmax gives it no weight)
# The mask is rebuilt from the 0/1 pattern in one step instead of being overwritten
# in place, so re-running this cell always gives the same result (the in-place version
# turned every entry into -inf on a second run).
# np.inf is used because the alias np.infty was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [21]:
# After the conversion:
#   every original 0 (future position) is now -inf
#   every original 1 (allowed position) is now 0
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [23]:
# Why 0 for the allowed positions? The mask is added to the scores, and adding 0
# leaves a score exactly as it was in `scaled`; only the future positions change (to -inf).
scaled + mask 

array([[-1.08686571,        -inf,        -inf,        -inf],
       [-1.23228539,  0.2365359 ,        -inf,        -inf],
       [ 0.85499309, -1.47386384,  1.2173839 ,        -inf],
       [ 2.29246749, -1.63012447,  0.51418893, -1.03772891]])

# Softmax - refer to the notes for the formula
* Softmax is used to convert a vector of scores into a probability distribution.
* The resulting values add up to 1, which makes them easy to interpret and also very stable.

In [36]:
def softmax(x):
  """Softmax along the last axis: turn scores into a probability distribution.

  Args:
    x: array of scores; entries may be -inf for masked positions.

  Returns:
    Array with the same shape as x whose entries along the last axis are
    non-negative and add up to 1.
  """
  x = np.asarray(x)
  # Subtracting the row maximum does not change the result mathematically, but it
  # keeps np.exp from overflowing to inf (which would turn the row into nan).
  exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
  # keepdims lets the row sums broadcast against exp_x for any number of leading axes.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [37]:
# Attention weights with the mask applied: softmax over each row of (scaled + mask).
attention = softmax(scaled+mask)

In [38]:
# Each row adds up to 1, and the masked (future) positions get a weight of exactly 0.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.18712184, 0.81287816, 0.        , 0.        ],
       [0.39460693, 0.03843749, 0.56695558, 0.        ],
       [0.81665685, 0.01616142, 0.13795678, 0.02922494]])

In [39]:
# Without the mask every row still adds up to 1, because each row is a probability
# distribution - but the weight is now spread over all positions, including future words.
# The last row is the same as in the masked case, since its mask row is all zeros.
attention_new = softmax(scaled)
attention_new

array([[0.02299578, 0.02352577, 0.04845432, 0.90502413],
       [0.06803211, 0.29553909, 0.32560239, 0.31082641],
       [0.36926308, 0.03596882, 0.53054254, 0.06422556],
       [0.81665685, 0.01616142, 0.13795678, 0.02922494]])

In [40]:
# Last step: weight the value vectors by the attention matrix.
# The result is V "after attention" - one mixed value vector per word.
new_v = attention @ v
new_v

array([[ 1.5062644 , -1.35215475,  0.48031349, -0.28786724, -0.40148487,
         2.23991922, -2.90143267, -1.0343477 ],
       [ 1.40477097, -0.50708286,  1.16192551, -0.73857152, -2.16302254,
         1.08296214,  0.87088045,  0.89835042],
       [ 0.49716274, -0.54172004, -0.32188572, -1.06466073, -0.67773373,
         1.41100624, -1.44640703, -0.5255429 ],
       [ 1.23940967, -1.07011926,  0.2904781 , -0.47121531, -0.41940771,
         1.9247971 , -2.420658  , -0.86517448]])

In [43]:
# Before attention: the original value vectors.
# Row 0 of new_v is identical to row 0 of v, because the first row of the masked
# attention matrix is [1, 0, 0, 0].
# The later rows of new_v differ from v: each is a weighted mix of its own value
# vector and the value vectors of the earlier words.
v

array([[ 1.5062644 , -1.35215475,  0.48031349, -0.28786724, -0.40148487,
         2.23991922, -2.90143267, -1.0343477 ],
       [ 1.38140753, -0.31255013,  1.31883032, -0.84232214, -2.56852263,
         0.81663448,  1.73925434,  1.34325108],
       [-0.26513094,  0.00681401, -0.9914591 , -1.62039073, -0.74181731,
         0.87437046, -0.64967023, -0.29810682],
       [ 0.80614759,  1.30841639,  0.46847401,  0.03525922,  1.79016125,
        -1.30955226,  0.35361817, -0.03593953]])

# Combining all the code together

In [44]:
def softmax(x):
  """Softmax along the last axis: turn scores into a probability distribution.

  Args:
    x: array of scores; entries may be -inf for masked positions.

  Returns:
    Array with the same shape as x whose entries along the last axis are
    non-negative and add up to 1.
  """
  x = np.asarray(x)
  # Subtracting the row maximum does not change the result mathematically, but it
  # keeps np.exp from overflowing to inf (which would turn the row into nan).
  exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
  # keepdims lets the row sums broadcast against exp_x for any number of leading axes.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q,k,v,mask=None):
  """Scaled dot-product attention: softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: queries, shape (..., L_q, d_k).
    k: keys, shape (..., L_k, d_k).
    v: values, shape (..., L_k, d_v).
    mask: optional additive mask that broadcasts against the (..., L_q, L_k) scores;
      0 keeps a score, -inf removes that position. None means no masking.

  Returns:
    Tuple (out, attention): out has shape (..., L_q, d_v) and attention has shape
    (..., L_q, L_k), with every row of attention adding up to 1.
  """
  d_k = q.shape[-1] # size of each query/key vector, e.g. 8 for a (4, 8) matrix
  # Swap only the last two axes: for a 2-D k this is exactly k.T, while for batched
  # keys it transposes each matrix separately (k.T would reverse every axis).
  # A 1-D k is passed through unchanged, as k.T did.
  k_t = np.swapaxes(k, -1, -2) if k.ndim > 1 else k
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [45]:
# Run the combined function without a mask and print the inputs next to the results.
values, attention = scaled_dot_product_attention(q, k, v, mask=None)
for header, matrix in (
  ("Q\n", q),
  ("K\n", k),
  ("V\n", v),
  ("New V\n", values),
  ("Attention\n", attention),
):
  print(header, matrix)

Q
 [[ 0.44008364 -0.65893923 -0.82743603  2.07134499 -0.756638    0.18606377
  -2.52939241 -0.99671064]
 [ 0.45847306  0.13226699  0.39716341 -0.35557593  0.92675916  0.86731148
  -1.0848844  -0.75466212]
 [-1.73679108  0.50542825  0.67550296 -0.48343931 -0.78572117 -0.57689201
  -0.52532605 -1.41217445]
 [-2.09387521 -0.89363413 -0.59569919 -1.71644659 -0.36390696 -0.30201351
   0.22337068 -0.50049467]]
K
 [[-0.97577445  0.8762965  -2.38211946 -1.7001989  -2.1416398   0.14183991
   0.82795221  0.06947477]
 [ 0.75104729  0.51849052  0.78832873  0.55552962  1.34575561  0.47145897
   0.51505236  1.26781612]
 [-0.43040316  1.29402735 -0.45535669 -0.2645018   0.40588437 -0.22652967
   0.51160386 -1.89667442]
 [ 0.13683429 -1.27318423 -0.18401422  1.90297292 -0.61696554  1.93138142
  -0.68380964  0.23601658]]
V
 [[ 1.5062644  -1.35215475  0.48031349 -0.28786724 -0.40148487  2.23991922
  -2.90143267 -1.0343477 ]
 [ 1.38140753 -0.31255013  1.31883032 -0.84232214 -2.56852263  0.81663448
   1.7