In [1]:
import numpy as np
from scipy.special import softmax

## 1. Represent the input

In [36]:
# d_model = 4
# 3 inputs
d_k = 3
x = np.array([[1.0, 0.0, 1.0, 0.0],
             [0.0, 2.0, 0.0, 2.0], 
             [1.0, 1.0, 1.0, 1.0]])
print(x, x.shape)

[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]] (3, 4)


## 2. Initialize weights

In [37]:
# even though d_k = 64, lets scale it down to 3 for better visualization
# weights for query(Q)
w_query = np.array([[1,0,1],
                    [1,0,0],
                    [0,0,1],
                    [0,1,1]])
print(f'Weights (Q) 3 x d_model=4\n', w_query, w_query.shape)

# weights for key(K)
w_key = np.array([[0,0,1],
                  [1,1,0],
                  [0,1,0],
                  [1,1,0]])
print(f'Weights (K) 3 x d_model=4\n', w_key, w_key.shape)

# weights for value(V)
w_value = np.array([[0,2,0],
                    [0,3,0],
                    [1,0,3],
                    [1,1,0]])
print(f'Weights (V) 3 x d_model=4\n', w_value, w_value.shape)

Weights (Q) 3 x d_model=4
 [[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]] (4, 3)
Weights (K) 3 x d_model=4
 [[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]] (4, 3)
Weights (V) 3 x d_model=4
 [[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]] (4, 3)


## 3. Matrix Multiplication to get Q, K, V

In [38]:
Q = np.matmul(x, w_query)
K = np.matmul(x, w_key)
V = np.matmul(x, w_value)
print(f'Query(Q):\n {Q}')
print(f'Key(K):\n {K}')
print(f'Value(V):\n {V}')

Query(Q):
 [[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]
Key(K):
 [[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]
Value(V):
 [[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


## Transformer Equation for Attention Scores

$$ \large
Attention(Q,K,V) = softmax(\frac{QK^T}{\sqrt{d_k}})V
$$

## 4. Scaled Attention Scores (Q, K)

In [39]:
#d_k = sqrt(3) but assuming it to be 1
root_dk = 1
attention_scores = (Q@ K.T) / root_dk
print(f'attention_scores:\n {attention_scores}')

attention_scores:
 [[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


## 5. Scaled Softmax Attention Scores for Each Vector

In [40]:
for i in range(attention_scores.shape[1]):
    attention_scores[i] = softmax(attention_scores[i])
print(f'attention_scores:\n {attention_scores}')

attention_scores:
 [[6.33789383e-02 4.68310531e-01 4.68310531e-01]
 [6.03366485e-06 9.82007865e-01 1.79861014e-02]
 [2.95387223e-04 8.80536902e-01 1.19167711e-01]]


## 6. Final Attention

In [41]:
V.shape

(3, 3)

In [42]:
attention_scores.shape

(3, 3)

In [43]:
attention_scores[0].reshape(1,-1).shape

(1, 3)

In [46]:
attentions = []
print(f'Attention is:')
for i in range(d_k):
    # print(f"Attention Scores shape: {attention_scores[i].shape}")
    attention_scores[i] = attention_scores[i].reshape(1,-1)
    # print(f"Attention Scores shape After Reshape: {attention_scores[i].shape}")
    attentions.append(attention_scores[0][i] * V[i])
    print(attentions[-1])

Attention is:
[0.06337894 0.12675788 0.19013681]
[0.93662106 3.74648425 0.        ]
[0.93662106 2.80986319 1.40493159]


Each row of attention is calculated attention of Q, K and V for each input. \
i.e 1st row -> For x1 input and so on.

## 7. Sum up the result attention

In [50]:
attention_input1 = [a+b+c for a,b,c in attentions]
print(f'Sum of Attentions is: \n{attention_input1}')

Sum of Attentions is: 
[0.3802736299982257, 4.683105308334811, 5.151415839168293]


## 8. For dimension of the model as d_k = 64

In [58]:
np.sum(attention_head1[i][:])

29.314707299976895

In [57]:
attention_head1 = np.random.random((3,64))
print(attention_head1.shape)
attention_inputs = []
for i in range(attention_head1.shape[0]):
    attention_inputs.append(np.sum(attention_head1[i][:]))
print(attention_inputs)

(3, 64)
[29.797725576291956, 28.84816890358251, 29.314707299976895]
