In [7]:
import numpy as np
import pandas as pd
import math

In [8]:
# Toy dimensions used throughout the walkthrough:
#   L   - number of tokens in the input sequence ("my name is vibhav" -> 4)
#   d_k - width of every query/key vector
#   d_v - width of every value vector
# q, k and v therefore each have shape (4, 8).
L = 4
d_k = d_v = 8
# Draw the query, key and value matrices (in that order) from a standard normal.
q, k, v = (np.random.randn(L, width) for width in (d_k, d_k, d_v))

In [9]:
# Show the three randomly initialised matrices, each under its own heading.
for heading, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
  print(heading, matrix)

Q
 [[ 1.09761055  0.25431486 -0.64855585 -1.1783437  -1.78496968 -0.91920149
  -0.24095985 -1.04872552]
 [-0.08069951  0.12154865  0.54870725 -0.1662838  -1.13616968 -0.38030652
   0.92407845  1.88154461]
 [ 0.12960108  0.9027782   1.62437549  1.25708041  2.22449749 -1.30507629
   1.46377707 -1.0209322 ]
 [ 2.22076171  0.45139842 -1.52001994 -1.65998052  0.71400261 -0.54140543
  -0.62444227 -3.20534793]]
K
 [[-0.21165531 -0.73893031 -0.85166574 -0.2933164   0.45192866 -0.83067114
  -1.56054348 -0.64977261]
 [-0.54152624  2.02183667 -2.1343795   0.00595999  1.08572903 -2.48480572
   1.38222672 -0.33252892]
 [ 1.21186875 -1.07273332 -0.75972709  1.00311054 -0.94099936  0.74606439
   0.48767779 -0.62625095]
 [ 0.1311618  -1.03994281 -0.05969253 -0.33845304  1.84588101  1.84488924
  -0.54124462 -0.08792945]]
V
 [[ 0.80998809  0.46072378  0.84466935  0.15256506 -0.01857584  0.51588124
   0.56995535 -0.11847627]
 [ 0.79882088 -0.52920189 -0.72286384 -0.43575666 -0.5270247  -0.81931937
  -0.4

In [None]:
# To build the self-attention matrix, every word has to be compared with every
# other word (itself included) to measure how strong its affinity to that word is.
# Each row of Q plays the role of "what am I looking for?"; taking its dot product
# with every row of K gives an (L x L) matrix of raw affinity scores.
q @ k.T


array([[ 1.49208091,  1.65875372,  1.90119079, -4.45101487],
       [-3.35347564, -0.5196548 , -0.75412008, -3.57791591],
       [-1.9781788 ,  6.31636994, -2.49818255, -0.44831974],
       [ 4.80748514,  5.26770984,  2.32372421,  1.41336116]])

In [11]:
# Why divide by sqrt(d_k)? Every score is a sum of d_k products, so the raw
# scores vary far more than q or k themselves. Compare the three variances:
q.var(), k.var(), (q @ k.T).var()

(1.591199848655795, 1.1749383373971596, 9.886851083364917)

In [12]:
# Dividing the raw scores by sqrt(d_k) brings their variance back in line with
# the variance of q and k.
scaled = np.divide(q @ k.T, math.sqrt(d_k))
q.var(), k.var(), scaled.var()

(1.591199848655795, 1.1749383373971596, 1.2358563854206144)

In [13]:
# Inspect the scaled score matrix.
scaled

array([[ 0.52753027,  0.586458  ,  0.67217245, -1.5736714 ],
       [-1.18563268, -0.18372572, -0.26662171, -1.2649843 ],
       [-0.69939182,  2.23317401, -0.88324091, -0.15850496],
       [ 1.69970267,  1.86241668,  0.82156057,  0.49969863]])

# Masking
* Masking ensures that a word does not get context from words that are generated in the future
* This concept won't be used in the encoders, but this will be helpful in the decoders
* So that we don't look at a future word while generating the current context
### Why it is not required for encoders
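* The encoder receives the entire input sequence at once, so every word is allowed to attend to every other word, including the ones that come after it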

In [16]:
# Lower-triangular visibility pattern: row i marks (with 1) the words that
# word i is allowed to look at.
#   "my"     -> my
#   "name"   -> my name
#   "is"     -> my name is
#   "vibhav" -> my name is vibhav
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [17]:
# Turn the 0/1 visibility pattern into an additive mask: 0 where attention is
# allowed (on/below the diagonal) and -inf where it is blocked (above it).
# The mask is rebuilt from L instead of being edited in place, so re-running this
# cell always gives the same result (an in-place edit would turn the allowed 0s
# into -inf on a second run). `np.inf` is used because the `np.infty` alias was
# removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [18]:
# Inspect the additive mask: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [21]:
# Future positions must contribute no context, so their scores are pushed to
# -inf by adding the mask; the softmax applied next turns -inf into a weight
# of exactly 0.
np.add(scaled, mask)

array([[ 0.52753027,        -inf,        -inf,        -inf],
       [-1.18563268, -0.18372572,        -inf,        -inf],
       [-0.69939182,  2.23317401, -0.88324091,        -inf],
       [ 1.69970267,  1.86241668,  0.82156057,  0.49969863]])

# Softmax
* This operation converts a vector of scores into a probability distribution: the values become non-negative and add up to 1, which makes them easy to interpret and keeps them in a stable range

In [22]:
def softmax(x):
  """Numerically stable softmax over the last axis of ``x``.

  Args:
    x: Array-like of scores with one or more dimensions. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    ``np.ndarray`` with the same shape as ``x`` whose slices along the last
    axis are non-negative and sum to 1. A slice that is entirely ``-inf``
    yields ``nan`` because there is nothing to normalise.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty last axis.
    return np.exp(x)
  # exp() overflows to inf for large scores and inf/inf is nan. Subtracting the
  # per-slice maximum leaves the result mathematically unchanged while keeping
  # every exponent <= 0.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the normaliser broadcast along the last axis for any rank;
  # a double-transpose would only be correct for 1-D and 2-D input.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [33]:
# Without the mask every row is a distribution over all four positions, so each
# word also puts weight on the words that come after it - not what a decoder wants.
attention_diff = softmax(scaled)
attention_diff

array([[0.2995237 , 0.31770437, 0.34613738, 0.03663455],
       [0.13978203, 0.38069223, 0.35040697, 0.12911877],
       [0.04479216, 0.84100616, 0.03726983, 0.07693185],
       [0.3456085 , 0.40667756, 0.14361908, 0.10409486]])

In [36]:
# With the mask applied, the weights only cover the current and earlier words:
#   row 1 puts all of its weight (1.0) on the first column,
#   row 2 has its first two columns sum to 1,
#   row 3 has its first three columns sum to 1,
#   row 4 spreads its weight over all four columns.
attention = softmax(scaled + mask)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.26856665, 0.73143335, 0.        , 0.        ],
       [0.0485253 , 0.91109867, 0.04037603, 0.        ],
       [0.3456085 , 0.40667756, 0.14361908, 0.10409486]])

In [37]:
# Each output row is the attention-weighted mix of the rows of v.
new_v = attention @ v
new_v

array([[ 0.80998809,  0.46072378,  0.84466935,  0.15256506, -0.01857584,
         0.51588124,  0.56995535, -0.11847627],
       [ 0.80182002, -0.26334086, -0.3018767 , -0.27775306, -0.39047229,
        -0.46072901, -0.19423768,  0.19252355],
       [ 0.72180631, -0.46350539, -0.68722248, -0.39010969, -0.52045157,
        -0.73320403, -0.40640995,  0.30473801],
       [ 0.33936543, -0.01678049, -0.1800458 , -0.16997562, -0.43286706,
        -0.14592593, -0.08747941,  0.23483121]])

In [38]:
# The original value matrix, shown for comparison with new_v.
v

array([[ 0.80998809,  0.46072378,  0.84466935,  0.15256506, -0.01857584,
         0.51588124,  0.56995535, -0.11847627],
       [ 0.79882088, -0.52920189, -0.72286384, -0.43575666, -0.5270247 ,
        -0.81931937, -0.47483298,  0.30671602],
       [-1.12203332, -0.09181229, -1.72404342, -0.0122756 , -0.97529801,
        -0.29117605, -0.03585067,  0.76873726],
       [-1.00187918,  0.50329083,  0.66869131, -0.42007772, -0.69212416,
         0.48800077, -0.82816828,  0.39039161]])

# Combined Code

In [39]:
def softmax(x):
  """Numerically stable softmax over the last axis of ``x``.

  Args:
    x: Array-like of scores with one or more dimensions. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    ``np.ndarray`` with the same shape as ``x`` whose slices along the last
    axis are non-negative and sum to 1. A slice that is entirely ``-inf``
    yields ``nan`` because there is nothing to normalise.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty last axis.
    return np.exp(x)
  # Subtracting the per-slice maximum leaves the result mathematically unchanged
  # but keeps every exponent <= 0, so exp() cannot overflow into inf/inf = nan.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the normaliser broadcast along the last axis for any rank.
  return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(q,k,v,mask=None):
  """Compute ``softmax(q @ k^T / sqrt(d_k) + mask) @ v``.

  Works on plain 2-D matrices as well as on inputs with leading batch/head
  axes, as long as those leading axes broadcast against each other.

  Args:
    q: Query array of shape ``(..., L_q, d_k)``.
    k: Key array of shape ``(..., L_k, d_k)``.
    v: Value array of shape ``(..., L_k, d_v)``.
    mask: Optional additive mask broadcastable to ``(..., L_q, L_k)``; it is
      added to the scaled scores, so 0 keeps a position and ``-inf`` removes
      it. ``None`` (the default) applies no mask.

  Returns:
    Tuple ``(out, attention)`` where ``attention`` has shape
    ``(..., L_q, L_k)`` with rows summing to 1, and ``out`` has shape
    ``(..., L_q, d_v)``.
  """
  d_k = q.shape[-1]
  # Transpose only the last two axes so leading batch/head axes are preserved
  # (k.T would reverse every axis); a 1-D k is passed through, matching k.T.
  k_t = np.swapaxes(k, -1, -2) if k.ndim > 1 else k
  # Dividing by sqrt(d_k) keeps the variance of the scores close to that of q and k.
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [41]:
# Encoder-style attention: no mask is passed, so every position may attend to
# every other position.
values, attention = scaled_dot_product_attention(q, k, v)
for heading, matrix in (
  ("Q\n", q),
  ("K\n", k),
  ("V\n", v),
  ("New V\n", values),
  ("Attention\n", attention),
):
  print(heading, matrix)

Q
 [[ 1.09761055  0.25431486 -0.64855585 -1.1783437  -1.78496968 -0.91920149
  -0.24095985 -1.04872552]
 [-0.08069951  0.12154865  0.54870725 -0.1662838  -1.13616968 -0.38030652
   0.92407845  1.88154461]
 [ 0.12960108  0.9027782   1.62437549  1.25708041  2.22449749 -1.30507629
   1.46377707 -1.0209322 ]
 [ 2.22076171  0.45139842 -1.52001994 -1.65998052  0.71400261 -0.54140543
  -0.62444227 -3.20534793]]
K
 [[-0.21165531 -0.73893031 -0.85166574 -0.2933164   0.45192866 -0.83067114
  -1.56054348 -0.64977261]
 [-0.54152624  2.02183667 -2.1343795   0.00595999  1.08572903 -2.48480572
   1.38222672 -0.33252892]
 [ 1.21186875 -1.07273332 -0.75972709  1.00311054 -0.94099936  0.74606439
   0.48767779 -0.62625095]
 [ 0.1311618  -1.03994281 -0.05969253 -0.33845304  1.84588101  1.84488924
  -0.54124462 -0.08792945]]
V
 [[ 0.80998809  0.46072378  0.84466935  0.15256506 -0.01857584  0.51588124
   0.56995535 -0.11847627]
 [ 0.79882088 -0.52920189 -0.72286384 -0.43575666 -0.5270247  -0.81931937
  -0.4

In [42]:
# Decoder-style attention: the mask built earlier is added to the scores so that
# later positions receive zero weight.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, matrix in (
  ("Q\n", q),
  ("K\n", k),
  ("V\n", v),
  ("New V\n", values),
  ("Attention\n", attention),
):
  print(heading, matrix)

Q
 [[ 1.09761055  0.25431486 -0.64855585 -1.1783437  -1.78496968 -0.91920149
  -0.24095985 -1.04872552]
 [-0.08069951  0.12154865  0.54870725 -0.1662838  -1.13616968 -0.38030652
   0.92407845  1.88154461]
 [ 0.12960108  0.9027782   1.62437549  1.25708041  2.22449749 -1.30507629
   1.46377707 -1.0209322 ]
 [ 2.22076171  0.45139842 -1.52001994 -1.65998052  0.71400261 -0.54140543
  -0.62444227 -3.20534793]]
K
 [[-0.21165531 -0.73893031 -0.85166574 -0.2933164   0.45192866 -0.83067114
  -1.56054348 -0.64977261]
 [-0.54152624  2.02183667 -2.1343795   0.00595999  1.08572903 -2.48480572
   1.38222672 -0.33252892]
 [ 1.21186875 -1.07273332 -0.75972709  1.00311054 -0.94099936  0.74606439
   0.48767779 -0.62625095]
 [ 0.1311618  -1.03994281 -0.05969253 -0.33845304  1.84588101  1.84488924
  -0.54124462 -0.08792945]]
V
 [[ 0.80998809  0.46072378  0.84466935  0.15256506 -0.01857584  0.51588124
   0.56995535 -0.11847627]
 [ 0.79882088 -0.52920189 -0.72286384 -0.43575666 -0.5270247  -0.81931937
  -0.4