In [37]:
import numpy as np

In [38]:
# Step 1: Define the input embeddings (3 tokens, 2 dimensions each).
# Assume the two embedding dimensions capture two features, e.g. (Gender, DC).
tokens = ["I", "am", "Batman"]
X = np.array(
    [
        [1.0, 0.0],  # embedding row for tokens[0]
        [0.0, 0.0],  # embedding row for tokens[1]
        [0.0, 1.0],  # embedding row for tokens[2]
    ]
)


# For comparison, BERT-base uses 768-dimensional embeddings instead of 2.

In [39]:
# Step 2: Define the projection weights. In a real model these are learned
# during training (backpropagation); here they are fixed by hand.
W_q = np.array([[1.0, 2.0], [3.0, 4.0]])  # query weights
W_k = np.array([[0.5, 1.0], [1.5, 1.0]])  # key weights
W_v = np.array([[1.0, 0.5], [0.5, 1.0]])  # value weights

In [40]:
# Step 3: Project the embeddings into queries, keys and values.
# Each weight matrix is applied transposed (X @ W.T), so row j of W holds the
# weights producing output feature j, and every result has one row per token.
Q, K, V = (X @ W.T for W in (W_q, W_k, W_v))

In [41]:
Q

array([[1., 3.],
       [0., 0.],
       [2., 4.]])

In [42]:
K

array([[0.5, 1.5],
       [0. , 0. ],
       [1. , 1. ]])

In [43]:
# Step 4: Raw attention scores -- the dot product of every query row with
# every key row (Q @ K.T), giving one row of scores per query token.
scores = np.matmul(Q, K.T)

In [44]:
scores

array([[5., 0., 4.],
       [0., 0., 0.],
       [7., 0., 6.]])

In [46]:
# Step 5: Scale the scores by sqrt(d_k), where d_k is the width of the
# query vectors (the last axis of Q).
d_k = Q.shape[-1]
scaled_scores = np.divide(scores, np.sqrt(d_k))

In [47]:
scaled_scores

array([[3.53553391, 0.        , 2.82842712],
       [0.        , 0.        , 0.        ],
       [4.94974747, 0.        , 4.24264069]])

In [49]:
# Step 6: Apply softmax row-wise
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Parameters
    ----------
    x : array_like
        Scores with at least one dimension. Each slice along the last axis
        is normalised independently.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Stability trick: subtract the maximum of *each row* (not the global
    # maximum) so that every row has at least one exp(0) == 1 term. With a
    # global maximum, a row far below it would underflow to all zeros and
    # the division below would produce NaN.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / np.sum(e_x, axis=-1, keepdims=True)

# Normalise each row of scaled scores separately and stack the rows back
# into a single 2-D array of attention weights.
attention_weights = np.stack([softmax(row) for row in scaled_scores])

In [50]:
attention_weights

array([[0.65693877, 0.01914529, 0.32391594],
       [0.33333333, 0.33333333, 0.33333333],
       [0.66659828, 0.00472298, 0.32867874]])

In [51]:
# Step 7: Each output row is the sum of the rows of V, weighted by the
# corresponding row of attention weights.
output = np.matmul(attention_weights, V)

In [34]:
V

array([[1. , 0.5],
       [0. , 0. ],
       [0.5, 1. ]])

In [53]:
output

array([[0.81889674, 0.65238532],
       [0.5       , 0.5       ],
       [0.83093765, 0.66197788]])

In [36]:
# Print every intermediate result of the walkthrough in order.
# Note: the "Attention scores" entry shows the *scaled* scores.
print("Input tokens:      ", tokens)
for labelled_result in (
    ("Q:\n", Q),
    ("K:\n", K),
    ("V:\n", V),
    ("Attention scores:\n", scaled_scores),
    ("Attention weights:\n", attention_weights),
    ("Self-attention output:\n", output),
):
    print(*labelled_result)

Input tokens:       ['I', 'am', 'Batman']
Q:
 [[1. 3.]
 [0. 0.]
 [2. 4.]]
K:
 [[0.5 1.5]
 [0.  0. ]
 [1.  1. ]]
V:
 [[1.  0.5]
 [0.  0. ]
 [0.5 1. ]]
Attention scores:
 [[3.53553391 0.         2.82842712]
 [0.         0.         0.        ]
 [4.94974747 0.         4.24264069]]
Attention weights:
 [[0.65693877 0.01914529 0.32391594]
 [0.33333333 0.33333333 0.33333333]
 [0.66659828 0.00472298 0.32867874]]
Self-attention output:
 [[0.81889674 0.65238532]
 [0.5        0.5       ]
 [0.83093765 0.66197788]]
