# Self Attention in Transformers

## Generate Data

In [18]:
import numpy as np
import math

# Sequence length, query/key feature size and value feature size.
L, d_k, d_v = 4, 8, 8

# Draw from a seeded generator so a fresh "Restart & Run All" reproduces
# exactly the same Q, K and V matrices.
SEED = 42
rng = np.random.default_rng(SEED)
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [19]:
# Print each input matrix under its own heading.
for heading, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
  print(heading, matrix)

Q
 [[ 0.40610019 -0.13526347 -1.37776428 -1.08112196 -0.57635259 -0.03953952
   0.11635102  0.39402089]
 [ 0.1612059  -0.7991379  -0.07111323 -1.59247379  0.44644414  0.19005893
  -0.77327419  1.38224314]
 [-0.10534454 -1.46244734 -1.30305999 -0.16139127  0.73879523 -1.22283228
   1.04221191  0.64872383]
 [ 1.54265598 -0.25274361 -1.58167994 -1.60770047  0.10612887  0.405868
   0.96878068  0.1977659 ]]
K
 [[ 0.71096647 -0.78364191 -0.95720498 -0.94627781 -0.6067886  -1.80238561
  -0.02087386 -0.65559097]
 [-1.34398827 -0.39491779 -2.14325492 -0.56903127  0.94809213  0.70829332
   0.44953989  2.14601746]
 [ 1.48705615 -1.10012906  1.12829033  1.55368213  0.15887659  0.43322693
  -1.25391585 -0.66799573]
 [-0.58324277  0.34982954  1.24943425 -1.4019293  -0.0124756   1.45699666
  -0.05658199  0.42648847]]
V
 [[-0.75779633 -0.59572099 -1.01135183 -1.11617748 -0.21316834 -2.47143742
   0.33301725  0.30441697]
 [ 0.11647353  0.6693205  -0.07213081  0.53119398 -0.49528941  0.51991687
   0.811

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$ 

In [20]:
# Raw (unscaled) dot-product score of every query row against every key row.
q @ k.T

array([[ 2.8968107 ,  3.39915553, -2.99933411, -0.37890009],
       [ 0.81234089,  4.33410403, -1.23600096,  2.67470985],
       [ 3.77982538,  5.2987652 , -2.42133054, -3.42517113],
       [ 3.38435619,  3.57928472, -2.8645649 , -0.09092919]])

In [21]:
# Motivation for the sqrt(d_k) denominator: compare the variance of the
# inputs with the variance of their unscaled dot products.
q.var(), k.var(), (q @ k.T).var()

(0.7780729696708955, 1.1114415393813015, 8.258030476887914)

In [22]:
# Divide the dot products by sqrt(d_k), then show the same variance comparison.
scaled = (q @ k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.7780729696708955, 1.1114415393813015, 1.0322538096109892)

Notice the reduction in variance of the product

In [23]:
# Scaled scores: row i holds query i's score against every key.
scaled

array([[ 1.02417724,  1.20178296, -1.06042474, -0.13396141],
       [ 0.28720587,  1.53233718, -0.43699233,  0.94565274],
       [ 1.33637008,  1.8733964 , -0.85606962, -1.21098087],
       [ 1.1965506 ,  1.26546825, -1.01277663, -0.03214832]])

## Masking

- This is to ensure words don't get context from words generated in the future. 
- Not required in the encoders, but required in the decoders

In [24]:
# Lower-triangular indicator: 1 on and below the main diagonal, 0 above it.
mask = np.tril(np.ones((L, L)), k=0)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [25]:
# Build the additive mask in one step: 0 on and below the diagonal,
# -inf above it. Rebuilding it from L (instead of rewriting `mask` in place)
# keeps this cell idempotent, so re-running it cannot turn every entry into -inf.
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [26]:
# Additive mask: 0 on and below the diagonal, -inf above it.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [27]:
# Adding the mask sends every score above the diagonal to -inf.
np.add(scaled, mask)

array([[ 1.02417724,        -inf,        -inf,        -inf],
       [ 0.28720587,  1.53233718,        -inf,        -inf],
       [ 1.33637008,  1.8733964 , -0.85606962,        -inf],
       [ 1.1965506 ,  1.26546825, -1.01277663, -0.03214832]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [28]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: Array-like of scores with any number of leading dimensions. Entries
      may be ``-inf``; those positions receive weight 0.

  Returns:
    Array with the same shape as ``x`` whose last axis sums to 1. A slice
    made up entirely of ``-inf`` has no valid distribution and yields NaN.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum does not change the result mathematically,
  # but it stops np.exp from overflowing to inf (and then NaN) on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  weights = np.exp(shifted)
  # keepdims lets the division broadcast for any rank, not only 1-D/2-D input.
  return weights / np.sum(weights, axis=-1, keepdims=True)

In [29]:
# Normalise the masked scores row by row into attention weights.
attention = softmax(np.add(scaled, mask))

In [30]:
# Attention weights: the softmax of the masked, scaled scores.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.22354407, 0.77645593, 0.        , 0.        ],
       [0.35428885, 0.60615691, 0.03955424, 0.        ],
       [0.40423712, 0.43307862, 0.04437492, 0.11830934]])

In [31]:
# Each output row is the attention-weighted combination of the rows of v.
new_v = attention @ v
new_v

array([[-0.75779633, -0.59572099, -1.01135183, -1.11617748, -0.21316834,
        -2.47143742,  0.33301725,  0.30441697],
       [-0.07896431,  0.38652797, -0.2820881 ,  0.16293386, -0.43222292,
        -0.14878265,  0.70472926, -0.49665357],
       [-0.1285842 ,  0.21538388, -0.39728488, -0.08835537, -0.33089969,
        -0.48489904,  0.61760579, -0.26779679],
       [-0.35600552, -0.13923493, -0.49170278, -0.25791597, -0.13755022,
        -0.85399827,  0.55867032, -0.06368881]])

In [32]:
# Original value matrix, shown for comparison with new_v.
v

array([[-0.75779633, -0.59572099, -1.01135183, -1.11617748, -0.21316834,
        -2.47143742,  0.33301725,  0.30441697],
       [ 0.11647353,  0.6693205 , -0.07213081,  0.53119398, -0.49528941,
         0.51991687,  0.8117463 , -0.72728427],
       [ 1.75185666,  0.52403856,  0.12004752, -0.37652174,  1.13379961,
         1.91009846,  0.19153106,  1.64837091],
       [-1.5033201 , -1.78806643, -0.48149724, -0.16952366,  0.95349471,
        -1.39360819,  0.54098215,  0.46555489]])

# Function

In [33]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: Array-like of scores with any number of leading dimensions. Entries
      may be ``-inf``; those positions receive weight 0.

  Returns:
    Array with the same shape as ``x`` whose last axis sums to 1. A slice
    made up entirely of ``-inf`` has no valid distribution and yields NaN.
  """
  x = np.asarray(x)
  # Shift by the per-row maximum so np.exp cannot overflow on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  weights = np.exp(shifted)
  return weights / np.sum(weights, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute scaled dot-product attention.

  Args:
    q: Queries, shape ``(..., L_q, d_k)``.
    k: Keys, shape ``(..., L_k, d_k)``.
    v: Values, shape ``(..., L_k, d_v)``.
    mask: Optional additive mask broadcastable to ``(..., L_q, L_k)``;
      use 0 to keep a position and ``-inf`` to block it.

  Returns:
    Tuple ``(out, attention)``: ``out`` has shape ``(..., L_q, d_v)`` and
    ``attention`` has shape ``(..., L_q, L_k)`` with rows summing to 1.
  """
  d_k = q.shape[-1]
  # Swap only the last two axes: `k.T` would reverse every axis and break
  # inputs that carry leading batch or head dimensions.
  scaled = np.matmul(q, np.swapaxes(k, -1, -2)) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [34]:
# Run masked attention on the demo matrices, then print inputs and results.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, matrix in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
  print(heading, matrix)

Q
 [[ 0.40610019 -0.13526347 -1.37776428 -1.08112196 -0.57635259 -0.03953952
   0.11635102  0.39402089]
 [ 0.1612059  -0.7991379  -0.07111323 -1.59247379  0.44644414  0.19005893
  -0.77327419  1.38224314]
 [-0.10534454 -1.46244734 -1.30305999 -0.16139127  0.73879523 -1.22283228
   1.04221191  0.64872383]
 [ 1.54265598 -0.25274361 -1.58167994 -1.60770047  0.10612887  0.405868
   0.96878068  0.1977659 ]]
K
 [[ 0.71096647 -0.78364191 -0.95720498 -0.94627781 -0.6067886  -1.80238561
  -0.02087386 -0.65559097]
 [-1.34398827 -0.39491779 -2.14325492 -0.56903127  0.94809213  0.70829332
   0.44953989  2.14601746]
 [ 1.48705615 -1.10012906  1.12829033  1.55368213  0.15887659  0.43322693
  -1.25391585 -0.66799573]
 [-0.58324277  0.34982954  1.24943425 -1.4019293  -0.0124756   1.45699666
  -0.05658199  0.42648847]]
V
 [[-0.75779633 -0.59572099 -1.01135183 -1.11617748 -0.21316834 -2.47143742
   0.33301725  0.30441697]
 [ 0.11647353  0.6693205  -0.07213081  0.53119398 -0.49528941  0.51991687
   0.811