**Using NumPy**

In [None]:
import numpy as np

# Seed for reproducibility
# np.random.seed(42)

In [None]:
# Toy input sequence: one row per word, one column per embedding dimension -> shape (3, 4)
X = np.array(
    [
        [0.2, 0.8, 0.5, 0.1],  # Word 1
        [0.9, 0.4, 0.7, 0.3],  # Word 2
        [0.5, 0.1, 0.2, 0.9],  # Word 3
    ],
    dtype=np.float64,
)

In [None]:
# Define trainable weight matrices (seeded random initialization)
d_k = X.shape[1]  # Dimension of word embeddings (4); also used as the query/key size

# A locally seeded Generator makes this cell produce the same matrices on every
# run, without depending on (or mutating) NumPy's global random state.
rng = np.random.default_rng(42)

W_q = rng.random((d_k, d_k))  # Query weight matrix, uniform in [0, 1)
W_k = rng.random((d_k, d_k))  # Key weight matrix, uniform in [0, 1)
W_v = rng.random((d_k, d_k))  # Value weight matrix, uniform in [0, 1)

# Compute Q, K, V matrices by projecting every word embedding:
# (3, 4) @ (4, 4) -> (3, 4), i.e. one query/key/value row per word.
Q = X @ W_q
K = X @ W_k
V = X @ W_v

print("Query Matrix:\n", Q)
print("Key Matrix:\n", K)
print("Value Matrix:\n", V)


Query Matrix:
 [[0.58352471 0.69020868 0.22134042 1.31596799]
 [1.07000686 1.47739322 0.75098463 1.61922134]
 [1.0722933  0.82367632 0.5395647  0.74499289]]
Key Matrix:
 [[0.83760721 0.6137794  0.4806961  0.62550493]
 [1.01553254 1.09163663 0.82764348 0.81977231]
 [0.83769352 0.47516817 0.8319121  0.43856935]]
Value Matrix:
 [[0.34359845 0.58175548 0.7888782  1.02046847]
 [0.3434527  1.43844635 1.26034733 1.6961633 ]
 [0.3202968  1.17951542 0.83865699 1.09813922]]


In [None]:
# Compute the attention scores (QK^T) and scale them by sqrt(d_k).
# Entry [i, j] measures how strongly word i (query) attends to word j (key);
# dividing by sqrt(d_k) keeps the magnitudes moderate before the softmax.
attention_scores = (Q @ K.T) / np.sqrt(d_k)

# These values are printed after scaling, so the label says "Scaled", not "Raw".
print("Scaled Attention Scores:\n", attention_scores)


Raw Attention Scores:
 [[0.92097116 1.30401524 0.78902953]
 [1.58843265 2.32417184 1.46662105]
 [1.06453977 1.52269786 1.03261968]]


In [None]:
# Apply softmax to normalize scores
def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    Parameters
    ----------
    x : array_like
        Input scores (any shape with at least one dimension).
    axis : int, optional
        Axis along which the values are normalized to sum to 1.
        Defaults to the last axis, i.e. row-wise for a 2-D score matrix.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; return an empty float array of the same shape.
        return np.exp(x)

    # Subtracting the per-slice maximum leaves the result mathematically
    # unchanged but keeps every exponent <= 0, so np.exp cannot overflow to
    # inf (which would otherwise produce inf / inf = nan for large scores).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused for the sum
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)

# Normalize each row of scores into attention weights
attention_weights = softmax(attention_scores)

# Sanity check: for every query, the weights over all keys must form a
# probability distribution, i.e. each row sums to 1.
np.testing.assert_allclose(attention_weights.sum(axis=-1), 1.0)

print("Attention Weights:\n", attention_weights)


Attention Weights:
 [[0.29912045 0.43873273 0.26214682]
 [0.25174083 0.52538919 0.22286997]
 [0.28171054 0.44542914 0.27286032]]


In [None]:
# Weighted sum of the value vectors: row i of the result mixes the rows of V
# using the attention weights of word i
attention_output = attention_weights @ V

print("Final Attention Output:\n", attention_output)


Final Attention Output:
 [[0.33742605 1.11431467 1.00877649 1.33727905]
 [0.33832864 1.16507435 1.04767718 1.39278171]
 [0.33717543 1.12645553 1.01246695 1.3426359 ]]




```
# This is formatted as code
```

**Using PyTorch**

In [None]:
import torch
import torch.nn.functional as F


# Define input as a 2-D tensor (seq_len=3, embedding_dim=4); note there is no
# batch dimension here. Reuses the NumPy array X, converted to float32.
X_torch = torch.tensor(X, dtype=torch.float32)

In [None]:
# Display the tensor to check that its values match the NumPy input X
X_torch

tensor([[0.2000, 0.8000, 0.5000, 0.1000],
        [0.9000, 0.4000, 0.7000, 0.3000],
        [0.5000, 0.1000, 0.2000, 0.9000]])

In [None]:
# Define weight matrices (seeded random initialization)
# NOTE: W_q, W_k, W_v, Q, K and V rebind the NumPy variables of the same names
# defined earlier in the notebook; from here on they are torch tensors.
d_k = X_torch.shape[-1]  # Embedding size (4), read from the tensor itself

# A dedicated, seeded Generator makes the weights identical on every run
# without touching PyTorch's global random state.
gen = torch.Generator().manual_seed(42)

# requires_grad=True marks the weights as trainable so autograd tracks them.
W_q = torch.rand((d_k, d_k), generator=gen, requires_grad=True)
W_k = torch.rand((d_k, d_k), generator=gen, requires_grad=True)
W_v = torch.rand((d_k, d_k), generator=gen, requires_grad=True)

# Compute Q, K, V: (3, 4) @ (4, 4) -> (3, 4), one row per word
Q = X_torch @ W_q
K = X_torch @ W_k
V = X_torch @ W_v

print("Query Matrix (PyTorch):\n", Q)
print("Key Matrix (PyTorch):\n", K)
print("Value Matrix (PyTorch):\n", V)


Query Matrix (PyTorch):
 tensor([[1.0462, 0.9843, 0.9905, 0.4263],
        [1.3142, 1.7460, 1.3134, 1.1299],
        [0.9985, 1.5294, 0.5645, 0.8287]], grad_fn=<MmBackward0>)
Key Matrix (PyTorch):
 tensor([[1.3015, 0.8496, 0.6376, 0.7813],
        [1.7786, 1.1751, 0.9665, 0.6145],
        [1.1877, 0.6539, 0.9527, 0.6404]], grad_fn=<MmBackward0>)
Value Matrix (PyTorch):
 tensor([[0.4930, 0.6533, 1.1594, 1.0376],
        [1.1325, 1.0038, 1.4088, 1.2255],
        [0.9825, 0.6386, 0.5909, 0.7084]], grad_fn=<MmBackward0>)


In [None]:
# Scaling factor sqrt(d_k), built as a float32 tensor
scale = torch.sqrt(torch.tensor(d_k, dtype=torch.float32))

# Scaled dot-product scores between every query (rows) and every key (columns)
attention_scores = (Q @ K.T) / scale

# Row-wise softmax turns the scores into attention weights
attention_weights = F.softmax(attention_scores, dim=-1)

# Each output row is the attention-weighted mix of the value vectors
attention_output = attention_weights @ V

print("Final Attention Output (PyTorch):\n", attention_output)

Final Attention Output (PyTorch):
 tensor([[0.9193, 0.8123, 1.1254, 1.0381],
        [0.9365, 0.8326, 1.1611, 1.0613],
        [0.9142, 0.8164, 1.1419, 1.0482]], grad_fn=<MmBackward0>)
