In [29]:
# We'll use numpy to handle the matrix operations
import numpy as np

# Given Q, K, V matrices 
V = np.array([
    [0.2, 0.8],
    [-0.2, 0.5],
    [-0.3, -0.4],
    [0.7, 0.7]
])

K = np.array([
    [0.2, 0.8],
    [-0.2, 0.5],
    [-0.3, -0.4],
    [0.7, 0.7]
])


computers_vector = np.array([[1, 0, 0, 0]])
are_vector = np.array([[0, 1, 0, 0]])
thinking_vector = np.array([[0, 0, 1, 0]])
machines_vector = np.array([[0, 0, 0, 1]])

print("Machines vector shape", machines_vector.shape)
# Calculate the attention scores by dotting the query vector for "machines" with all key vectors
# Correct the dot product by not transposing K since the matrices are already aligned
attention_q_dot_k = np.dot(machines_vector, K)

print("Attention score")
print(attention_q_dot_k)

# Scale the scores by sqrt(d_k), where d_k is the embedded dimension of the key vectors
scaling_factor = np.sqrt(K.shape[1])
print(f"\nScaling factor: {scaling_factor}")

scaled_attention_scores = attention_q_dot_k / scaling_factor

print("\nScaled attention score")
print(scaled_attention_scores)

# Apply softmax to get the attention probabilities
attention_probabilities = np.exp(scaled_attention_scores) / np.sum(np.exp(scaled_attention_scores))

print("\nSoftmax attention probabilities")
print(attention_probabilities)

# Multiply the attention probabilities with the value matrix and sum to get the output vector z for "machines"
z = np.dot(attention_probabilities, V.T)

print(attention_probabilities.T.shape)

print("\nOutput vector z for 'machines'")
print(z)


Machines vector shape (1, 4)
Attention score
[[0.7 0.7]]

Scaling factor: 1.4142135623730951

Scaled attention score
[[0.49497475 0.49497475]]

Softmax attention probabilities
[[0.5 0.5]]
(2, 1)

Output vector z for 'machines'
[[ 0.5   0.15 -0.35  0.7 ]]


In [39]:

import torch
from torch.nn.functional import softmax

x = [
    [1, 0, 0, 0], # Input 1
    [0, 1, 0, 0], # Input 2
    [0, 0, 1, 0], # Input 3
    [0, 0, 0, 1]  # Input 4
]

w_key = np.array([
    [0.2, 0.8],
    [-0.2, 0.5],
    [-0.3, -0.4],
    [0.7, 0.7]
])
w_query = np.array([
    [0.2, 0.8],
    [-0.2, 0.5],
    [-0.3, -0.4],
    [0.7, 0.7]
])
w_value = np.array([
    [0.2, 0.8],
    [-0.2, 0.5],
    [-0.3, -0.4],
    [0.7, 0.7]
])

x = torch.tensor(x, dtype=torch.float32)
w_key = torch.tensor(w_key, dtype=torch.float32)
w_query = torch.tensor(w_query, dtype=torch.float32)
w_value = torch.tensor(w_value, dtype=torch.float32)

keys = x @ w_key
querys = x @ w_query
values = x @ w_value

attn_scores = querys @ keys.T

attn_scores_softmax = softmax(attn_scores, dim=-1)

weighted_values = values[:,None] * attn_scores_softmax.T[:,:,None]

outputs = weighted_values.sum(dim=0)

print(outputs)

tensor([[0.2150, 0.5622],
        [0.1277, 0.4783],
        [0.0019, 0.2600],
        [0.2816, 0.5898]])
