# Self Attention in Transformers

## Generate Data

In [1]:
import numpy as np
import math

# Toy problem size: sequence length L, query/key width d_k, value width d_v.
L = 4
d_k = d_v = 8

# Draw Q, K and V (in that order) from a standard normal distribution.
# No seed is set in this cell, so re-running it produces different values.
q, k, v = (np.random.randn(L, width) for width in (d_k, d_k, d_v))

In [2]:
# Print each input matrix under its one-letter label.
for label, matrix in (("Q", q), ("K", k), ("V", v)):
  print(label + "\n", matrix)

Q
 [[ 0.89917746  0.21502107 -0.59958722 -0.55723961 -0.49824304 -1.41820529
  -0.42197575 -0.2317329 ]
 [ 0.622668   -0.74936698  0.01161096 -1.11868925  0.23696572  0.52514834
   0.55972373 -2.01310969]
 [-1.16504065  0.86816784 -0.85159753  0.58881758  0.18209637 -0.14922834
  -0.08030625  0.89014178]
 [-1.41261991  0.70759187  0.22305526  0.97933654 -0.10309459  0.66098278
   1.93275013  1.73698403]]
K
 [[-0.14740497  0.0108732   0.61592921  1.11403729 -0.36854029 -0.44968157
  -0.33815192 -0.44581356]
 [ 0.30327292 -1.13620901  0.62375687  0.16713418  0.48839604 -1.95342897
  -0.48498931 -0.42216305]
 [ 0.52960127  1.13138675 -1.44877434 -0.21365629 -0.56213552  2.24652558
   0.66100044  0.2716143 ]
 [-0.31181308  1.0572872  -1.61441829 -0.56380099 -0.9620224  -0.93774973
   0.69809179 -2.5545543 ]]
V
 [[ 1.30626608 -0.39773238 -0.41362384 -1.05023084  0.85346049  1.52822707
  -0.10596725 -1.45341744]
 [ 0.48240463 -1.12770688  1.48405035 -1.22671258 -0.34672049  1.1323596
   0.15

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$ 

In [3]:
# Raw (unscaled) attention scores: dot product of every query with every key.
q @ k.T

array([[-0.05292925,  2.39076302, -1.54062064,  3.3357596 ],
       [-0.95432338,  0.52884069,  0.57387581,  4.43843352],
       [-0.05707269, -1.72891884,  1.2242793 , -0.04118223],
       [-0.24284681, -3.94175629,  2.81224287, -3.33229434]])

In [4]:
# Motivation for the sqrt(d_k) denominator: compare the variance of Q and K
# with the variance of their unscaled dot products.
np.var(q), np.var(k), np.var(q @ k.T)

(0.8015709706411361, 0.9474070707155748, 4.925834680355804)

In [5]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.8015709706411361, 0.9474070707155748, 0.6157293350444755)

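Notice the reduction in the variance of the product: dividing by $\sqrt{d_k}$ brings it from about 4.93 down to about 0.62, which is in line with the variances of Q and K.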

In [6]:
# Display the scaled attention scores, Q K^T / sqrt(d_k).
scaled

array([[-0.01871332,  0.84526237, -0.54469165,  1.17936912],
       [-0.33740427,  0.18697342,  0.20289574,  1.56922322],
       [-0.02017824, -0.61126512,  0.4328481 , -0.01456012],
       [-0.08585931, -1.3936213 ,  0.994278  , -1.17814396]])

## Masking

- Masking ensures that a word does not get context from words generated in the future.
- It is not required in the encoder, but it is required in the decoder.

In [7]:
# Lower-triangular matrix of ones (L x L): row i has ones in columns 0..i
# and zeros in the later columns.
all_ones = np.ones(shape=(L, L))
mask = np.tril(all_ones, k=0)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [8]:
# Build the additive causal mask: 0 where attention is allowed (lower triangle,
# including the diagonal) and -inf where it must be blocked (future positions).
# The mask is derived from a fresh np.tril(...) instead of mutating `mask` in
# place, so re-running this cell always gives the same result (the in-place
# version turned every entry into -inf on a second run). np.inf is used because
# the np.infty alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [9]:
# Display the additive mask: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [10]:
# Adding the mask leaves allowed scores unchanged (+0) and pushes blocked
# (future) positions to -inf, so they get zero weight after softmax.
scaled + mask

array([[-0.01871332,        -inf,        -inf,        -inf],
       [-0.33740427,  0.18697342,        -inf,        -inf],
       [-0.02017824, -0.61126512,  0.4328481 ,        -inf],
       [-0.08585931, -1.3936213 ,  0.994278  , -1.17814396]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [11]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores with at least one dimension. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. A row made up entirely of ``-inf`` yields NaN,
    because there is nothing to normalise.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum does not change the result mathematically
  # but keeps np.exp from overflowing to inf (and the ratio from becoming NaN)
  # when scores are large.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  # keepdims makes the normaliser broadcast against any number of leading
  # (batch/head) dimensions, not just 2-D inputs.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [12]:
# Apply the causal mask to the scaled scores, then normalise each row into
# attention weights with softmax.
attention = softmax(scaled + mask)

In [13]:
# Display the attention weights; each row sums to 1 and masked positions are 0.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.37182916, 0.62817084, 0.        , 0.        ],
       [0.31981673, 0.17709056, 0.50309271, 0.        ],
       [0.21973397, 0.05942147, 0.64713497, 0.07370959]])

In [14]:
# Each output row is the attention-weighted average of the rows of V.
new_v = attention @ v
new_v

array([[ 1.30626608, -0.39773238, -0.41362384, -1.05023084,  0.85346049,
         1.52822707, -0.10596725, -1.45341744],
       [ 0.78874034, -0.85628108,  0.77843976, -1.16109152,  0.09954179,
         1.27955466,  0.05816868, -0.58337488],
       [ 0.95337121, -0.39452002, -0.03744969, -0.37716263, -0.45422154,
         1.13582864, -0.38722087, -0.72252694],
       [ 0.93986455, -0.326971  , -0.32171632, -0.01301897, -0.72423298,
         0.89341637, -0.52191727, -0.66620089]])

In [15]:
# Display the original V for comparison with new_v.
v

array([[ 1.30626608, -0.39773238, -0.41362384, -1.05023084,  0.85346049,
         1.52822707, -0.10596725, -1.45341744],
       [ 0.48240463, -1.12770688,  1.48405035, -1.22671258, -0.34672049,
         1.1323596 ,  0.15532462, -0.06837614],
       [ 0.89481748, -0.13439333, -0.33388911,  0.3497526 , -1.3233576 ,
         0.88760154, -0.75699228, -0.48816243],
       [ 0.61186932, -1.16124738, -1.39659229,  0.87245559, -0.47176501,
        -1.14057745, -0.24401315, -0.36448067]])

# Function

In [16]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores with at least one dimension. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. A row made up entirely of ``-inf`` yields NaN.
  """
  x = np.asarray(x)
  # Shift by the per-row maximum so np.exp cannot overflow for large scores;
  # the shift cancels out in the ratio.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: query array; the length of its last axis is used as ``d_k``.
    k: key array whose last axis matches that of ``q``. Leading batch/head
      axes are allowed, because only the last two axes are transposed.
    v: value array with one row per key.
    mask: optional additive mask that broadcasts against the score matrix
      (0 keeps a position, ``-inf`` blocks it). ``None`` means no masking.

  Returns:
    Tuple ``(out, attention)``: the attention-weighted values and the
    attention weights themselves.
  """
  d_k = q.shape[-1]
  # Swap only the last two axes: ``k.T`` would reverse every axis and break
  # batched inputs. A 1-D ``k`` is left as is, which is what ``k.T`` did.
  k_t = np.swapaxes(k, -1, -2) if k.ndim >= 2 else k
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [17]:
# Run the packaged attention function on the inputs from above, then print
# the inputs followed by the resulting values and attention weights.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for label, matrix in (
  ("Q", q),
  ("K", k),
  ("V", v),
  ("New V", values),
  ("Attention", attention),
):
  print(label + "\n", matrix)

Q
 [[ 0.89917746  0.21502107 -0.59958722 -0.55723961 -0.49824304 -1.41820529
  -0.42197575 -0.2317329 ]
 [ 0.622668   -0.74936698  0.01161096 -1.11868925  0.23696572  0.52514834
   0.55972373 -2.01310969]
 [-1.16504065  0.86816784 -0.85159753  0.58881758  0.18209637 -0.14922834
  -0.08030625  0.89014178]
 [-1.41261991  0.70759187  0.22305526  0.97933654 -0.10309459  0.66098278
   1.93275013  1.73698403]]
K
 [[-0.14740497  0.0108732   0.61592921  1.11403729 -0.36854029 -0.44968157
  -0.33815192 -0.44581356]
 [ 0.30327292 -1.13620901  0.62375687  0.16713418  0.48839604 -1.95342897
  -0.48498931 -0.42216305]
 [ 0.52960127  1.13138675 -1.44877434 -0.21365629 -0.56213552  2.24652558
   0.66100044  0.2716143 ]
 [-0.31181308  1.0572872  -1.61441829 -0.56380099 -0.9620224  -0.93774973
   0.69809179 -2.5545543 ]]
V
 [[ 1.30626608 -0.39773238 -0.41362384 -1.05023084  0.85346049  1.52822707
  -0.10596725 -1.45341744]
 [ 0.48240463 -1.12770688  1.48405035 -1.22671258 -0.34672049  1.1323596
   0.15