In [1]:
import numpy as np
import math

## Q, K, V initialization

In [4]:
# Toy problem size: L positions in the sequence, d_k-dimensional queries and
# keys, d_v-dimensional values.
L, d_k, d_v = 4, 8, 8

# Seed NumPy's global RNG so that Restart Kernel -> Run All always draws the
# same Q, K and V, which makes every downstream cell reproducible.
# NOTE: outputs saved from an earlier, unseeded run will differ until the
# notebook is re-executed.
SEED = 42
np.random.seed(SEED)

# Draw queries, keys and values from a standard normal distribution.
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

print(f"Q : {q} \n \n K : {k} \n \n V : {v} ")

Q : [[-0.1462297   0.18463536 -0.88324794 -0.55202271 -1.04985477 -1.95115135
   0.48051216  0.59826787]
 [ 1.73108945 -0.69956396  0.03309819  1.31288266  1.32810858  0.866713
   0.03702245 -1.07291343]
 [-0.25582001  0.01669755  0.57919926 -1.39397291  2.57557182  0.97933429
   0.52932116  1.76887839]
 [ 0.94190329  1.34571507 -0.09997985 -0.78801489  0.69189524  0.32106299
   0.58877883 -0.39450409]] 
 
 K : [[-0.44510571 -1.15527739 -0.03693768  0.92799771  0.25634736 -0.7138885
   0.02913203 -1.19107762]
 [-1.15198742  0.00279137 -0.14984311 -0.16562673  0.66912483 -0.48748744
  -1.70468102  0.573068  ]
 [-0.20930491  0.37378326  0.61672216  1.24796994 -0.16630931  0.03058528
   0.25065041  1.92178146]
 [-0.72183568 -0.11487186 -0.22456696 -0.93751008 -0.25207383  1.82351633
   1.68363162 -0.92528114]] 
 
 V : [[-0.51893399 -1.51751563 -0.31213065 -2.35075835 -0.97956957  1.27952883
  -0.44395342 -0.98368081]
 [ 0.59994242  0.16621628 -1.24751619 -1.84804286  2.90028913 -1.8080947

## Self-attention

In [9]:
# Scaled dot-product scores: entry (i, j) is the dot product of query i with
# key j, divided by sqrt(d_k) (the standard scaling used in scaled
# dot-product attention).
scaled = (q @ k.T) / math.sqrt(d_k)
scaled

array([[-0.07165688,  0.05839097,  0.08877678, -0.791131  ],
       [ 0.79744836, -0.8592617 , -0.42849043, -0.03772866],
       [-1.18467504,  0.63504127,  0.64029873,  0.61893284],
       [-0.80125949, -0.65729619, -0.51443423,  0.59895786]])

## Masking

In [21]:
# Causal (look-ahead) mask, used in the decoder part of the transformer:
# position i may only attend to positions j <= i. Entries strictly above the
# diagonal are -inf (they become exactly 0 after softmax); all other entries
# are 0 and leave the scores untouched.
# The (L, L) shape is spelled out explicitly instead of handing a 1-D array to
# np.triu, and the mask is built in one expression (no in-place edit) so the
# cell can be re-run safely.
mask = np.where(np.triu(np.ones((L, L), dtype=bool), k=1), -np.inf, 0.0)
mask + scaled

array([[-0.07165688,        -inf,        -inf,        -inf],
       [ 0.79744836, -0.8592617 ,        -inf,        -inf],
       [-1.18467504,  0.63504127,  0.64029873,        -inf],
       [-0.80125949, -0.65729619, -0.51443423,  0.59895786]])

## Softmax

In [23]:
def softmax(x):
    """Softmax over the last axis of ``x``.

    Parameters
    ----------
    x : array_like
        Scores with at least one dimension. Entries equal to ``-inf``
        (e.g. masked positions) receive a weight of exactly 0.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose slices along the last axis
        are non-negative and sum to 1. A slice consisting only of ``-inf``
        yields NaN, because there is nothing to normalise.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; keep the (empty) shape and a float dtype.
        return np.exp(x)
    # Subtracting the per-row maximum does not change the result
    # mathematically, but keeps exp() from overflowing to inf (which would
    # otherwise produce inf / inf = NaN for large scores).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    # keepdims=True lets the row sums broadcast for any number of leading
    # (batch / head) dimensions, not just for 2-D input.
    return weights / np.sum(weights, axis=-1, keepdims=True)

In [24]:
# Attention weights: softmax over each row of the causally masked scores.
attention = softmax(mask + scaled)

In [25]:
# Display the attention weights computed from softmax(scaled + mask).
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.83979587, 0.16020413, 0.        , 0.        ],
       [0.07477899, 0.46139443, 0.46382658, 0.        ],
       [0.13257123, 0.15309879, 0.17661026, 0.53771972]])

In [26]:
# Each output row is the attention-weighted combination of the rows of v.
new_v = attention @ v

In [27]:
# Display the attended values (the matrix product of attention and v).
new_v

array([[-0.51893399, -1.51751563, -0.31213065, -2.35075835, -0.97956957,
         1.27952883, -0.44395342, -0.98368081],
       [-0.33968537, -1.24777483, -0.46198328, -2.27022125, -0.3580002 ,
         0.7848788 , -0.30276508, -1.01429967],
       [ 0.59360036, -0.37003642, -1.64643195, -0.82044728,  1.27964024,
        -0.0903621 ,  0.32363391, -0.56046633],
       [ 0.58844888, -0.40773481, -0.72570967, -1.05271969,  0.44953883,
         1.37158202,  1.32487756, -1.3007136 ]])