# 1. Multi-head attention

In [26]:
import re
import numpy as np
from gensim.models import Word2Vec

text = "The animal did not cross the street because it was tired."
tokens = re.findall(r"\w+", text.lower())

sentences = [tokens]

word2vec_model = Word2Vec(
    sentences,
    vector_size=64,
    window=3,
    min_count=1,
    sg=1
)

X = np.array([word2vec_model.wv[word] for word in tokens])  # (seq_len, d_model)

def positional_encoding(seq_len, d_model):
    PE = np.zeros((seq_len, d_model))
    for pos in range(seq_len):
        for i in range(0, d_model, 2):
            PE[pos, i] = np.sin(pos / (10000 ** (i/d_model)))
            PE[pos, i+1] = np.cos(pos / (10000 ** (i/d_model)))
    return PE

X = X + positional_encoding(len(tokens), 64)

d_model = 64
num_heads = 8
d_k = d_v = d_model // num_heads
seq_len = X.shape[0]

W_Q = np.random.randn(d_model, d_model) / np.sqrt(d_model)
W_K = np.random.randn(d_model, d_model) / np.sqrt(d_model)
W_V = np.random.randn(d_model, d_model) / np.sqrt(d_model)
W_O = np.random.randn(d_model, d_model) / np.sqrt(d_model)

Q = np.matmul(X, W_Q)
K = np.matmul(X, W_K)
V = np.matmul(X, W_V)

Q = Q.reshape(seq_len, num_heads, d_k).transpose(1,0,2)
K = K.reshape(seq_len, num_heads, d_k).transpose(1,0,2)
V = V.reshape(seq_len, num_heads, d_v).transpose(1,0,2)

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)

# Attention per head

heads = []
attention_weights = []

for h in range(num_heads):
    scores = np.matmul(Q[h], np.transpose(K[h]))
    scores = scores / np.sqrt(d_k)
    weights = softmax(scores)
    attention_weights.append(weights)
    head = np.matmul(weights, V[h])
    heads.append(head)

attention_weights = np.array(attention_weights)

concat = np.concatenate(heads, axis=-1)

output = np.matmul(concat, W_O)

# 2. Add & Norm

In [27]:
residual_1 = X + output

In [28]:
def layer_norm(x,   eps=1e-6):
    mean = np.mean(x, axis=-1, keepdims = True)
    std = np.std(x, axis=-1, keepdims=True)
    return (x - mean) / (std + eps)

In [29]:
Z1 = layer_norm(residual_1)
print(Z1)

[[-2.29273112e+00  1.30959137e-01 -1.08921281e+00  1.19285061e+00
  -4.77243024e-01 -5.50448013e-01 -8.07827595e-01  1.17195286e+00
   1.45766140e-01  4.85671665e-01 -4.95144367e-01  1.16525398e+00
  -4.91917295e-01  7.91929382e-01 -8.80785357e-01  4.73727454e-01
   9.44627997e-01 -3.04762188e-01 -3.98028893e-01  2.32567211e-02
  -9.51372681e-01 -3.37252636e-01  4.08138218e-02  1.51006150e+00
  -2.37335883e+00  5.51404488e-01 -7.39795400e-01  1.49929438e+00
  -1.27731472e+00  3.11810120e-01 -6.72268788e-01  3.94887864e-01
  -3.86202813e-01 -1.22388756e-01 -6.62562419e-01  1.28645625e+00
  -2.19720733e-01  3.51242428e-01 -6.58905312e-01 -7.74640068e-02
  -1.79063887e+00  1.34415203e+00 -1.77574501e+00  1.04404298e+00
  -7.25967457e-02 -4.91043009e-01 -3.45125670e-01  9.54061556e-01
   3.60521022e-01  1.30162417e+00  2.30100633e-02  9.35219661e-01
   1.58880997e-02 -4.25080616e-01 -7.57058696e-01  1.97471654e+00
  -1.20048220e+00  2.59132477e+00  7.16690051e-02 -4.72429103e-02
  -1.67702

# 3. Feed Forward Network
FFN(x)=max(0,xW1​+b1​)W2​+b2​

apply FFN to each token independently.

In [30]:
d_model = 64  # input layer
d_ff = 256  # hidden layer

W1 = np.random.randn(d_model, d_ff) / np.sqrt(d_model)
b1 = np.zeros(d_ff)

W2 = np.random.randn(d_ff, d_model) / np.sqrt(d_ff)
b2 = np.zeros(d_model)

In [31]:
def relu(x):
    return np.maximum(0,x)

In [32]:
def feed_forward(x):
    hidden = relu(np.matmul(x, W1) + b1)
    output = np.matmul(hidden, W2) + b2
    return output

In [33]:
ffn_output = feed_forward(Z1)
print(ffn_output.shape)

(11, 64)


# 4. Add & Norm

In [34]:
residual_2 = Z1 + ffn_output

In [35]:
Z2 = layer_norm(residual_2)
print(Z2.shape)

(11, 64)


# Evaluation

In [36]:
# Step 1: Get token indices
idx_it = tokens.index("it")
idx_animal = tokens.index("animal")

In [37]:
# Step 2: Check per-head attention

baseline = 1 / len(tokens)

for h in range(num_heads):
    val = attention_weights[h, idx_it, idx_animal]
    print(f"Head {h}: it -> animal = {val:.4f} (baseline = {baseline:.4f})")


Head 0: it -> animal = 0.0985 (baseline = 0.0909)
Head 1: it -> animal = 0.0983 (baseline = 0.0909)
Head 2: it -> animal = 0.0629 (baseline = 0.0909)
Head 3: it -> animal = 0.0866 (baseline = 0.0909)
Head 4: it -> animal = 0.0673 (baseline = 0.0909)
Head 5: it -> animal = 0.1049 (baseline = 0.0909)
Head 6: it -> animal = 0.0902 (baseline = 0.0909)
Head 7: it -> animal = 0.0745 (baseline = 0.0909)


In [38]:
# Step 3: Top-k words attended by "it"

for h in range(num_heads):
    row = attention_weights[h, idx_it]
    top = np.argsort(row)[::-1][:5]

    print(f"\nHead {h} - top attention for 'it': ")
    for i in top:
        print(f"{tokens[i]:>10s}: {row[i]:.4f}")


Head 0 - top attention for 'it': 
       the: 0.1084
    animal: 0.0985
       the: 0.0957
    street: 0.0954
     cross: 0.0945

Head 1 - top attention for 'it': 
       the: 0.1359
     tired: 0.1144
    animal: 0.0983
       was: 0.0941
    street: 0.0850

Head 2 - top attention for 'it': 
       the: 0.1368
     cross: 0.1223
    street: 0.1161
     tired: 0.1032
       not: 0.0926

Head 3 - top attention for 'it': 
    street: 0.0955
   because: 0.0951
       was: 0.0940
     tired: 0.0939
        it: 0.0935

Head 4 - top attention for 'it': 
     tired: 0.1243
       was: 0.1115
        it: 0.1031
   because: 0.1002
    street: 0.0986

Head 5 - top attention for 'it': 
       did: 0.1098
       not: 0.1084
    animal: 0.1049
     tired: 0.0983
     cross: 0.0971

Head 6 - top attention for 'it': 
       the: 0.1040
    street: 0.1020
       the: 0.1010
     cross: 0.0955
   because: 0.0938

Head 7 - top attention for 'it': 
       not: 0.1384
     cross: 0.1296
       did: 0.108