In [1]:
import numpy as np  # Import NumPy for matrix operations

# Step 1: Define Input (Word Embeddings)
# Each row represents a word, each column represents a feature.
# We have 3 words, each with 4 features.
x = np.array([[1, 0, 1, 0],
              [0, 2, 0, 2],
              [1, 1, 1, 1]])
d_model = x.shape[-1]  # Feature count per word (4 here); weight matrices are sized from it

# Step 2: Initialize Weight Matrices (Random Values)
# These matrices transform input into Queries (Q), Keys (K), and Values (V).
# A seeded generator makes the weights -- and every number printed below --
# identical on each run of this cell. Values are still uniform in [0, 1).
# (Outputs saved from earlier, unseeded runs will not match until the cell
# is re-executed.)
SEED = 42
rng = np.random.default_rng(SEED)
W_q = rng.random((d_model, d_model))  # Query weight matrix (4x4)
W_k = rng.random((d_model, d_model))  # Key weight matrix (4x4)
W_v = rng.random((d_model, d_model))  # Value weight matrix (4x4)

# Step 3: Compute Q, K, V Matrices
# Q, K, V are computed by multiplying input (x) with respective weight matrices.
Q = x @ W_q  # Query matrix (3x4) = (3x4) * (4x4)
K = x @ W_k  # Key matrix (3x4) = (3x4) * (4x4)
V = x @ W_v  # Value matrix (3x4) = (3x4) * (4x4)

# Step 4: Compute Attention Scores
# Attention score formula: (QK^T) / sqrt(d_k)
# (QK^T) measures similarity between queries and keys
d_k = K.shape[-1]  # Number of features (d_k = 4 in this case)
attention_scores = (Q @ K.T) / np.sqrt(d_k)  # (3x4) * (4x3) -> (3x3)

# Step 5: Apply Softmax to Convert Scores to Probabilities
# Softmax ensures attention weights sum to 1 (per row).
# Subtracting each row's maximum before exponentiating leaves the result
# mathematically unchanged (the factor cancels in the ratio) but keeps
# np.exp from overflowing to inf -- and the weights from becoming NaN --
# when scores are large.
shifted_scores = attention_scores - np.max(attention_scores, axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)  # Computed once and reused for numerator and denominator
attention_weights = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Step 6: Compute Weighted Sum of Values (Final Self-Attention Output)
# Output = Attention Weights * Value Matrix (V)
output = attention_weights @ V  # (3x3) * (3x4) -> (3x4)

# Step 7: Print Results
print("Attention Weights:\n", attention_weights)  # Shows how much attention each word gives to others
print("Self-Attention Output:\n", output)  # Final transformed word representations


Attention Weights:
 [[0.17813372 0.2423965  0.57946979]
 [0.01497999 0.13401896 0.85100104]
 [0.03379677 0.13755702 0.8286462 ]]
Self-Attention Output:
 [[1.12174014 1.85978999 1.80745647 0.77517572]
 [1.2639278  2.07916732 1.92766231 0.83544167]
 [1.25675581 2.06554595 1.90458861 0.82646735]]


In [2]:
import numpy as np  # Import NumPy for matrix operations

# Step 1: Define Input (Word Embeddings)
# Each row represents a word, each column represents a feature.
# We have 3 words, each with 4 features.
x = np.array([[5, 0, 5, 0],
              [0, 5, 0, 5],
              [1, 1, 1, 1]])
d_model = x.shape[-1]  # Feature count per word (4 here); weight matrices are sized from it

# Step 2: Initialize Weight Matrices (Random Values)
# These matrices transform input into Queries (Q), Keys (K), and Values (V).
# A seeded generator makes the weights -- and every number printed below --
# identical on each run of this cell. Values are still uniform in [0, 1).
# (Outputs saved from earlier, unseeded runs will not match until the cell
# is re-executed.)
SEED = 42
rng = np.random.default_rng(SEED)
W_q = rng.random((d_model, d_model))  # Query weight matrix (4x4)
W_k = rng.random((d_model, d_model))  # Key weight matrix (4x4)
W_v = rng.random((d_model, d_model))  # Value weight matrix (4x4)

# Step 3: Compute Q, K, V Matrices
# Q, K, V are computed by multiplying input (x) with respective weight matrices.
Q = x @ W_q  # Query matrix (3x4) = (3x4) * (4x4)
K = x @ W_k  # Key matrix (3x4) = (3x4) * (4x4)
V = x @ W_v  # Value matrix (3x4) = (3x4) * (4x4)

# Step 4: Compute Attention Scores
# Attention score formula: (QK^T) / sqrt(d_k)
# (QK^T) measures similarity between queries and keys
d_k = K.shape[-1]  # Number of features (d_k = 4 in this case)
attention_scores = (Q @ K.T) / np.sqrt(d_k)  # (3x4) * (4x3) -> (3x3)

# Step 5: Apply Softmax to Convert Scores to Probabilities
# Softmax ensures attention weights sum to 1 (per row).
# Subtracting each row's maximum before exponentiating leaves the result
# mathematically unchanged (the factor cancels in the ratio) but keeps
# np.exp from overflowing to inf -- and the weights from becoming NaN --
# when scores are large.
shifted_scores = attention_scores - np.max(attention_scores, axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)  # Computed once and reused for numerator and denominator
attention_weights = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Step 6: Compute Weighted Sum of Values (Final Self-Attention Output)
# Output = Attention Weights * Value Matrix (V)
output = attention_weights @ V  # (3x3) * (3x4) -> (3x4)

# Step 7: Print Results
print("Attention Weights:\n", attention_weights)  # Shows how much attention each word gives to others
print("Self-Attention Output:\n", output)  # Final transformed word representations


Attention Weights:
 [[9.74908457e-01 2.50915434e-02 1.24095377e-14]
 [9.99959407e-01 4.05927900e-05 8.63314987e-17]
 [9.40157001e-01 5.98420411e-02 9.58067525e-07]]
Self-Attention Output:
 [[7.04376566 4.31312118 9.09001624 4.78743535]
 [7.04283851 4.41236383 9.15900864 4.80297518]
 [7.04504775 4.1754492  8.99430469 4.76587572]]
