In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Toy input: one row per word (3 rows), one column per feature (4 columns).
x = tf.constant(
    [[0.1, 0.2, 0.3, 0.4],
     [0.5, 0.6, 0.7, 0.8],
     [0.9, 1.0, 1.1, 1.2]],
    dtype=tf.float32,
)


In [3]:
# Hand-set projection weights. Every matrix has 4 rows (one per input
# feature) so it can right-multiply the (3, 4) input `x`.

# Query projection: 4 features -> 3 dimensions.
W_Q = tf.constant(
    [[0.1, 0.2, 0.3],
     [0.4, 0.5, 0.6],
     [0.7, 0.8, 0.9],
     [1.0, 1.1, 1.2]],
    dtype=tf.float32,
)

# Key projection deliberately reuses the query weights (same tensor object).
W_K = W_Q

# Value projection: 4 features -> 2 dimensions.
W_V = tf.constant(
    [[0.1, 0.2],
     [0.3, 0.4],
     [0.5, 0.6],
     [0.7, 0.8]],
    dtype=tf.float32,
)

In [4]:
# Project the input into query, key and value spaces.
# `@` on TensorFlow tensors is matrix multiplication (same op as tf.matmul).
queries = x @ W_Q
keys = x @ W_K
values = x @ W_V

# Show each projected matrix under its label.
for label, matrix in (
    ("Queries Matrix:\n", queries),
    ("Keys Matrix:\n", keys),
    ("Values Matrix:\n", values),
):
    print(label, matrix)

Queries Matrix:
 tf.Tensor(
[[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]], shape=(3, 3), dtype=float32)
Keys Matrix:
 tf.Tensor(
[[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]], shape=(3, 3), dtype=float32)
Values Matrix:
 tf.Tensor(
[[0.5       0.6      ]
 [1.14      1.4000001]
 [1.78      2.2      ]], shape=(3, 2), dtype=float32)


In [5]:
# Raw attention scores: dot product of every query row with every key row,
# i.e. Q @ K^T (the transpose is applied inside the matmul).
scores = tf.linalg.matmul(a=queries, b=keys, transpose_b=True)
print("Attention Scores Matrix:\n", scores)

Attention Scores Matrix:
 tf.Tensor(
[[ 1.9400002  4.4680004  6.996001 ]
 [ 4.4680004 10.292     16.116001 ]
 [ 6.996001  16.116001  25.236002 ]], shape=(3, 3), dtype=float32)


In [6]:
# Scale the scores by sqrt(d_k), where d_k is the key dimension.
# d_k is read from the last axis of `keys` (3 for the current weights) rather
# than hard-coded, so the scaling stays correct if the width of W_Q / W_K is
# changed in the projection cell.
d_k = tf.cast(tf.shape(keys)[-1], scores.dtype)
scaled = scores / tf.sqrt(d_k)
print("Scaled Scores:\n", scaled)

# Softmax over the last axis: each row becomes a set of attention weights
# over the keys for one query.
weights = tf.nn.softmax(scaled, axis=-1)
print("Attention Weights:\n", weights)

# Final output: for each query row, the attention-weighted sum of value rows.
context = tf.matmul(weights, values)
print("Context vectors for each word:\n", context.numpy())


Scaled Scores:
 tf.Tensor(
[[ 1.1200596  2.5796013  4.039143 ]
 [ 2.5796013  5.942089   9.304578 ]
 [ 4.039143   9.304578  14.570013 ]], shape=(3, 3), dtype=float32)
Attention Weights:
 tf.Tensor(
[[4.1966923e-02 1.8062508e-01 7.7740800e-01]
 [1.1589993e-03 3.3449762e-02 9.6539128e-01]
 [2.6561420e-05 5.1404452e-03 9.9483299e-01]], shape=(3, 3), dtype=float32)
Context vectors for each word:
 [[1.6106822 1.9883529]
 [1.7571087 2.171386 ]
 [1.776676  2.1958451]]
