In [16]:
import numpy as np
import tensorflow as tf

# Toy input: three words, each represented by a 4-dimensional float32 vector.
word_embeddings = tf.constant(
    [
        [1, 0, 1, 0],  # "The"
        [0, 1, 0, 1],  # "Cat"
        [1, 1, 1, 1],  # "Sat"
    ],
    dtype=tf.float32,
)

# Projection weights for queries, keys and values. Identity matrices keep the
# example simple: Q, K and V all come out equal to the embeddings themselves.
W_q = tf.eye(4)
W_k = tf.eye(4)
W_v = tf.eye(4)
print("Weight matrices for W_q: \n", W_q)

# Project the embeddings into query / key / value space.
Q = tf.matmul(word_embeddings, W_q)
K = tf.matmul(word_embeddings, W_k)
V = tf.matmul(word_embeddings, W_v)

print("\n Q: \n", Q)

# Scaled dot-product attention.
# d_k is the size of the last axis of K (4 here), cast to float so it can be
# passed to sqrt and used as a divisor.
d_k = tf.cast(tf.shape(K)[-1], tf.float32)

# Pairwise query-key dot products, divided by sqrt(d_k). The result is a
# 3x3 matrix: entry (i, j) scores how strongly word i attends to word j.
scores = tf.matmul(Q, tf.transpose(K)) / tf.math.sqrt(d_k)

# Softmax over the last axis turns every row of scores into weights that
# sum to 1.
attention_weights = tf.nn.softmax(scores, axis=-1)

# Each output row is the attention-weighted average of the rows of V.
output = tf.matmul(attention_weights, V)

print("\nAttention Weights:\n", attention_weights.numpy())
print("\nOutput:\n", output.numpy())


Weight matrices for W_q: 
 tf.Tensor(
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]], shape=(4, 4), dtype=float32)

 Q: 
 tf.Tensor(
[[1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [1. 1. 1. 1.]], shape=(3, 4), dtype=float32)

Attention Weights:
 [[0.42231882 0.15536243 0.42231882]
 [0.15536243 0.42231882 0.42231882]
 [0.21194157 0.21194157 0.57611686]]

Output:
 [[0.84463763 0.57768124 0.84463763 0.57768124]
 [0.57768124 0.84463763 0.57768124 0.84463763]
 [0.7880584  0.7880584  0.7880584  0.7880584 ]]


self-attention

In [20]:
import numpy as np

# Toy 2-dimensional embeddings for a three-token sentence (e.g. "I love NLP").
# One row per token, so X has shape (3, 2).
X = np.array([[1, 0], [0, 1], [1, 1]])

# Hand-picked 2x2 projection weights; fixed values keep the arithmetic easy
# to follow by hand.
Wq = np.array([[1, 0], [0, 1]])
Wk = np.array([[0, 1], [1, 0]])
Wv = np.array([[1, 2], [3, 4]])

# Project the embeddings into query / key / value space. Each weight matrix
# is applied transposed; all three results have shape (3, 2).
Q = np.matmul(X, Wq.T)
K = np.matmul(X, Wk.T)
V = np.matmul(X, Wv.T)

# Raw attention scores: dot product of every query with every key -> (3, 3).
scores = np.matmul(Q, K.T)

# Divide by sqrt(dk), where dk is the width of the projected vectors
# (read from Q here; K has the same width).
dk = Q.shape[1]
scaled_scores = np.divide(scores, np.sqrt(dk))


def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    Parameters
    ----------
    x : array_like
        Scores to normalise. For a 2-D input each row is normalised
        independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift by the maximum of each row, not the global maximum. The shift
    # cancels in the ratio below, but a global shift can push every entry of a
    # small-valued row to exp(...) == 0, leaving 0 / 0 = NaN for that row.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

# Normalise the scaled scores into attention weights, then use them to mix
# the value vectors: each output row is a weighted combination of V's rows.
attention_weights = softmax(scaled_scores)
output = np.matmul(attention_weights, V)

# Show both results under their headings.
for heading, values in (
    ("\nAttention Weights:\n", attention_weights),
    ("\nOutput:\n", output),
):
    print(heading, values)


Attention Weights:
 [[0.19777581 0.40111209 0.40111209]
 [0.40111209 0.19777581 0.40111209]
 [0.24825508 0.24825508 0.50348984]]

Output:
 [[2.20333628 5.00556046]
 [2.         4.80222419]
 [2.25523477 5.26221445]]
