# Single-Head-Attention & Self-Attention

### Attention

In [17]:
import math
import numpy as np
# Toy input: a sentence of L = 4 tokens, for example "my name is shaheer".
# Each token gets a query and a key vector of size d_k and a value vector of
# size d_v, so q and k have shape (L, d_k) and v has shape (L, d_v).
L, d_k, d_v = 4, 8, 8

# Draw from a seeded generator so that Restart Kernel -> Run All reproduces
# the same matrices (and therefore the same downstream outputs) every time.
SEED = 42
rng = np.random.default_rng(SEED)
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [4]:
# Show the three input matrices; each one holds one row per token.
print(f"Q\n{q}\n") # queries, shape (L, d_k) = (4, 8)
print(f"K\n{k}\n")  # keys, shape (L, d_k) = (4, 8)
print(f"V\n{v}\n")  # values, shape (L, d_v) = (4, 8)

Q
[[-0.57705205 -0.20126873  2.4841818  -0.59454729  0.86675242 -0.93566926
  -0.22403582  0.24752613]
 [ 2.69861913 -0.4142783   2.40708822  1.52438144 -1.47826004 -0.94376802
   2.12568231 -0.20862321]
 [ 0.03307367 -0.08602546  1.00735884  1.18441916 -0.46719836 -0.49417361
   0.21721513 -0.0372552 ]
 [-1.04437715  1.07795544 -0.70983591 -1.87684872  1.0965299   0.78325768
   1.5372198   1.30800282]]

K
[[-0.70876639  2.87122589  0.39664913  0.5625836  -0.09371676  0.70685929
  -0.21909447  2.98217162]
 [ 1.07705745  0.83293295 -0.79823053  0.18834209 -0.59868578  0.83365767
   0.07874967 -1.6473599 ]
 [-0.79625733  0.37628871  0.10832292  0.74373004 -2.15490751 -0.375684
   1.44591142  1.65685043]
 [ 0.64180114 -1.76962332 -1.48473503 -1.19267678 -1.76738398  0.35231992
  -0.53882037 -0.27056034]]

V
[[-1.48421465  0.91421542  2.9635656   0.66601824  1.27399073 -1.84956107
  -0.95489083 -1.02632214]
 [-0.60901238  0.77696731  1.80741724 -1.76152696  1.76552493 -1.0531615
  -1.99128

### Self-Attention

In [5]:
# Raw attention scores: entry (i, j) is the dot product of query i with key j.
q @ k.T

array([[ 0.52660777, -4.60843718, -1.2194196 , -4.80122678],
       [-2.9062646 ,  1.53650381,  5.35775067, -1.73566009],
       [ 0.33124386, -0.67084807,  2.37607251, -2.1901793 ],
       [ 6.51261514, -2.05107043,  1.49712595, -2.1296765 ]])

In [8]:
# Compare the spread of the inputs with the spread of their raw dot products.
q.var(), k.var(), (q @ k.T).var()

(1.3635379007569473, 1.4577449689111446, 9.493123428241532)

In [11]:
# Divide the raw scores by sqrt(d_k); the variances shown below confirm that
# the scaled scores end up in the same range as q and k.
scaled_values = (q @ k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled_values.var()

(1.3635379007569473, 1.4577449689111446, 1.1866404285301915)

In [16]:
# Display the scaled score matrix (one row per query, one column per key).
scaled_values

array([[ 0.18618396, -1.62932859, -0.43112993, -1.69749001],
       [-1.0275197 ,  0.54323613,  1.89425092, -0.61364851],
       [ 0.11711239, -0.23718061,  0.84006849, -0.77434532],
       [ 2.30255716, -0.7251629 ,  0.52931396, -0.75295435]])

### Masking for Decoder

In [12]:
# Lower-triangular matrix of ones: row i has 1s in columns 0..i (the current
# and earlier positions) and 0s in the columns for later positions.
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [13]:
# Turn the 0/1 triangle into an additive mask: 0 where attention is allowed
# and -inf where it is blocked.
# The triangle is rebuilt here so the cell is idempotent: rewriting an already
# converted mask in place would match its 0 entries and turn everything to -inf.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
mask = np.tril(np.ones((L, L)))
mask = np.where(mask == 1, 0.0, -np.inf)

In [14]:
# Additive form of the mask: 0 where attention is allowed, -inf where blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [20]:
# Adding the mask keeps the scores at allowed positions (mask = 0) and sets
# every later position to -inf, so each token only sees itself and the tokens
# before it, never the ones after it.
scaled_values + mask

array([[ 0.18618396,        -inf,        -inf,        -inf],
       [-1.0275197 ,  0.54323613,        -inf,        -inf],
       [ 0.11711239, -0.23718061,  0.84006849,        -inf],
       [ 2.30255716, -0.7251629 ,  0.52931396, -0.75295435]])

#### Softmax

In [32]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores, e.g. ``scaled_values + mask``. Any number of
      leading dimensions is accepted; normalisation is along the last axis.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. ``-inf`` scores get a weight of exactly 0.
  """
  x = np.asarray(x)
  # Subtract the per-row maximum before exponentiating. The result is
  # mathematically unchanged, but the largest exponent becomes 0, so large
  # scores can no longer overflow to inf and produce nan.
  row_max = np.max(x, axis=-1, keepdims=True)
  exps = np.exp(x - row_max)
  # keepdims lets the row sums broadcast against exps for any number of
  # dimensions, with no transposes needed.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [26]:
# Attention weights before adding the mask: every token puts some weight on
# every position, including later ones.
print(softmax(scaled_values))

[[0.53932303 0.08777723 0.29090618 0.08199356]
 [0.03861438 0.18574606 0.71722906 0.05841051]
 [0.23967928 0.1681756  0.49386282 0.0982823 ]
 [0.79032266 0.0382721  0.13418213 0.03722311]]


In [27]:
# Softmax of the masked scores: the -inf entries get a weight of exactly 0, so
# each row is a distribution over the current and earlier positions only.
attention_before_V= softmax(scaled_values + mask)

In [28]:
# Masked attention weights; every entry above the diagonal is 0.
attention_before_V

array([[1.        , 0.        , 0.        , 0.        ],
       [0.17210867, 0.82789133, 0.        , 0.        ],
       [0.26580301, 0.18650582, 0.54769117, 0.        ],
       [0.79032266, 0.0382721 , 0.13418213, 0.03722311]])

In [29]:
# Weighted sum of the value vectors: row i of new_V mixes the rows of v using
# the attention weights in row i of attention_before_V.
new_V = attention_before_V @ v
new_V

array([[ 0.81792666,  0.82236399, -0.41895922,  0.22666141,  0.94511862,
        -0.52021735, -0.64588996, -0.87000742],
       [-1.72124914, -0.11917752,  0.10030999,  0.17790366, -0.14471279,
         1.16727093,  0.56597595, -0.86661605],
       [ 0.48455696,  0.49623802,  0.30719654, -1.16219838,  0.35669711,
        -0.17322052, -1.23137942,  0.11496116],
       [ 0.78773285,  0.71521167, -0.20040347, -0.09814017,  0.75931991,
        -0.42659804, -0.7674411 , -0.59475329]])

In [30]:
# Original value matrix, shown for comparison with new_V. The first rows match
# because the first token attends only to itself under the mask.
v

array([[ 0.81792666,  0.82236399, -0.41895922,  0.22666141,  0.94511862,
        -0.52021735, -0.64588996, -0.87000742],
       [-2.2491133 , -0.31491269,  0.20825982,  0.16776751, -0.37127566,
         1.51807949,  0.81790832, -0.86591103],
       [ 1.25366696,  0.61418601,  0.6933022 , -2.28912817,  0.31902434,
        -0.58075768, -2.21337254,  0.92699907],
       [ 1.5894821 , -0.13651838,  0.79816503,  0.63034682, -0.43592847,
         0.11736599,  0.2340575 ,  0.04262961]])

In [35]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Defined in this cell so that it is self-contained together with
  ``scaled_dot_product_Attention`` below.

  Args:
    x: array-like of scores; normalisation is along the last axis.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. ``-inf`` scores get a weight of exactly 0.
  """
  x = np.asarray(x)
  # Subtract the per-row maximum so the largest exponent is 0; this avoids
  # overflow to inf (and the resulting nan) for large scores.
  row_max = np.max(x, axis=-1, keepdims=True)
  exps = np.exp(x - row_max)
  return exps / np.sum(exps, axis=-1, keepdims=True)


def scaled_dot_product_Attention(q,k,v,mask=None):
  """Compute softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: query array of shape (..., L_q, d_k).
    k: key array of shape (..., L_k, d_k).
    v: value array of shape (..., L_k, d_v).
    mask: optional additive mask broadcastable to (..., L_q, L_k), with 0
      where attention is allowed and -inf where it is blocked. ``None``
      applies no masking.

  Returns:
    Tuple ``(out, attention)``: ``out`` has shape (..., L_q, d_v) and
    ``attention`` holds the weights of shape (..., L_q, L_k), each row
    summing to 1.
  """
  d_k = q.shape[-1]
  # Swap only the last two axes so inputs with leading batch/head dimensions
  # work as well; for a 2-D k this is the same as k.T. A 1-D k is left as is,
  # matching what k.T does for a vector.
  k_t = np.swapaxes(k, -1, -2) if k.ndim > 1 else k
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

### Encoder

In [37]:
# Encoder-style attention: no mask is passed, so every token can attend to
# every position in the sentence.
values, attention = scaled_dot_product_Attention(q,k,v,mask=None)
print(f"Q\n{q}\n") # query matrix, one row per token
print(f"K\n{k}\n")  # key matrix, one row per token
print(f"V\n{v}\n")  # value matrix, one row per token
print(f"New-Values\n",values)  # attention-weighted mix of the rows of v
print(f"Attention\n",attention)  # attention weights; each row sums to 1

Q
[[ 1.23083442  0.23780154  0.49963663  0.66060411  0.70506491 -1.3407174
  -1.28707246  1.26635347]
 [ 0.78347705  1.09805569 -1.18751775 -0.57766779  1.38640692  0.64106228
   0.12609477  1.3256973 ]
 [-0.87707701  0.30595606  1.11351654  1.07346526 -1.35059628  0.25691575
   0.71501038 -1.12965311]
 [ 0.43742223  0.18957705 -1.63318612 -2.23190212  0.39821764  0.41517094
   0.23741383 -1.53932298]]

K
[[-1.30320263  0.0876317  -0.68980643  0.4427153   0.90717826 -0.16077661
  -0.0099921  -1.58829012]
 [-0.36743927  0.34094691  1.7134338   0.65427438  1.30200938 -0.61145828
  -1.06062647  0.71670681]
 [-1.53207548 -1.4651322  -1.29869614  1.07817591  1.13953673  0.93676069
  -0.31606087  0.34552698]
 [ 0.23910301  0.72556814 -0.74747835 -2.03170547  1.38947414  0.05281547
   0.47667075 -0.94524177]]

V
[[ 0.81792666  0.82236399 -0.41895922  0.22666141  0.94511862 -0.52021735
  -0.64588996 -0.87000742]
 [-2.2491133  -0.31491269  0.20825982  0.16776751 -0.37127566  1.51807949
   0.817

### Decoder

In [39]:
# Decoder-style attention: the mask is added to the scores before the softmax,
# so positions the mask sets to -inf receive zero attention weight.
values, attention = scaled_dot_product_Attention(q,k,v,mask=mask)
print(f"Q\n{q}\n") # query matrix, one row per token
print(f"K\n{k}\n")  # key matrix, one row per token
print(f"V\n{v}\n")  # value matrix, one row per token
print(f"New-Values\n",values)  # attention-weighted mix of the rows of v
print(f"Attention\n",attention)  # masked attention weights; each row sums to 1

Q
[[ 1.23083442  0.23780154  0.49963663  0.66060411  0.70506491 -1.3407174
  -1.28707246  1.26635347]
 [ 0.78347705  1.09805569 -1.18751775 -0.57766779  1.38640692  0.64106228
   0.12609477  1.3256973 ]
 [-0.87707701  0.30595606  1.11351654  1.07346526 -1.35059628  0.25691575
   0.71501038 -1.12965311]
 [ 0.43742223  0.18957705 -1.63318612 -2.23190212  0.39821764  0.41517094
   0.23741383 -1.53932298]]

K
[[-1.30320263  0.0876317  -0.68980643  0.4427153   0.90717826 -0.16077661
  -0.0099921  -1.58829012]
 [-0.36743927  0.34094691  1.7134338   0.65427438  1.30200938 -0.61145828
  -1.06062647  0.71670681]
 [-1.53207548 -1.4651322  -1.29869614  1.07817591  1.13953673  0.93676069
  -0.31606087  0.34552698]
 [ 0.23910301  0.72556814 -0.74747835 -2.03170547  1.38947414  0.05281547
   0.47667075 -0.94524177]]

V
[[ 0.81792666  0.82236399 -0.41895922  0.22666141  0.94511862 -0.52021735
  -0.64588996 -0.87000742]
 [-2.2491133  -0.31491269  0.20825982  0.16776751 -0.37127566  1.51807949
   0.817