In [1]:
# ============================================
# 🧠 Q3. Scaled Dot-Product Attention
# ============================================

!pip install torch --quiet
import torch
import torch.nn.functional as F
import math

# --------------------------------------------
# 1️⃣ Define the Scaled Dot-Product Attention
# --------------------------------------------
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention.

        attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V

    Args:
        Q: query tensor of shape (..., len_q, d_k).
        K: key tensor of shape (..., len_k, d_k).
        V: value tensor of shape (..., len_k, d_v).
        mask: optional tensor broadcastable to (..., len_q, len_k).
            Positions where ``mask == 0`` (or ``False``) are set to -inf
            before the softmax, so they receive zero attention weight.
            A query row whose keys are all masked yields NaN weights,
            because the softmax of a row of -inf is undefined.
            Defaults to ``None`` (no masking).

    Returns:
        Tuple ``(output, attention_weights)`` where ``attention_weights`` has
        shape (..., len_q, len_k) with each row summing to 1, and ``output``
        has shape (..., len_q, d_v).
    """
    d_k = Q.size(-1)
    # Divide by sqrt(d_k) so the magnitude of the dot products does not grow
    # with the feature dimension and push the softmax into saturation.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # exp(-inf) == 0, so masked keys contribute nothing to the output.
        scores = scores.masked_fill(mask == 0, float("-inf"))
    # Normalise over the key axis: each query gets a distribution over keys.
    weights = F.softmax(scores, dim=-1)
    output = torch.matmul(weights, V)
    return output, weights

# --------------------------------------------
# 2️⃣ Build random Q, K, V inputs (fixed seed)
# --------------------------------------------
# Seed the global RNG first so the tensors drawn below are reproducible.
torch.manual_seed(0)
batch_size, seq_len, d_k = 1, 4, 8

# The three tensors are drawn in the order Q, K, V; that order determines
# which random values each one receives.
Q, K, V = (torch.randn(batch_size, seq_len, d_k) for _ in range(3))

print("Q shape:", Q.size())
print("K shape:", K.size())
print("V shape:", V.size())

# --------------------------------------------
# 3️⃣ Check Softmax Stability (Before / After Scaling)
# --------------------------------------------
# Unscaled dot products between every query and every key.
raw_scores = torch.matmul(Q, K.transpose(-2, -1))
print("\nRaw attention scores (no scaling):\n", raw_scores[0])

# The same scores divided by sqrt(d_k), for side-by-side comparison.
scaled_scores = raw_scores / math.sqrt(d_k)
print("\nScaled attention scores (divided by √d_k):\n", scaled_scores[0])

# --------------------------------------------
# 4️⃣ Compute Attention
# --------------------------------------------
# Run the full attention computation on the Q, K, V tensors built earlier
# in this script.
output, weights = scaled_dot_product_attention(Q, K, V)

# Only batch element 0 is displayed.
print("\n✅ Attention Weights (softmax over scaled scores):\n", weights[0])
print("\n✅ Output Vectors:\n", output[0])

# --------------------------------------------
# 5️⃣ Verify Stability — Observe numeric range
# --------------------------------------------
# Print the min and max of the scores before and after dividing by sqrt(d_k).
print("\nScore Range Comparison:")
print("Unscaled scores range:  ", raw_scores.min().item(), "→", raw_scores.max().item())
print("Scaled scores range:    ", scaled_scores.min().item(), "→", scaled_scores.max().item())

"""
🧠 Notes for Report
- Formula: Attention(Q,K,V) = softmax((QKᵀ)/√dₖ) V
- Dividing by √dₖ prevents extremely large dot-products,
  which could cause the softmax to saturate and give unstable gradients.
- The attention weight matrix shows how each query attends to keys,
  and the weighted sum of V produces context-aware output vectors.
"""


Q shape: torch.Size([1, 4, 8])
K shape: torch.Size([1, 4, 8])
V shape: torch.Size([1, 4, 8])

Raw attention scores (no scaling):
 tensor([[-0.0146,  5.1091, -0.3921, -3.7779],
        [ 0.4648,  0.5446, -0.7034,  0.9571],
        [ 1.3816, -7.0338, -0.2771,  2.2690],
        [ 0.6756,  3.8463, -1.2603, -2.3062]])

Scaled attention scores (divided by √d_k):
 tensor([[-0.0052,  1.8063, -0.1386, -1.3357],
        [ 0.1643,  0.1926, -0.2487,  0.3384],
        [ 0.4885, -2.4868, -0.0980,  0.8022],
        [ 0.2389,  1.3599, -0.4456, -0.8154]])

✅ Attention Weights (softmax over scaled scores):
 tensor([[0.1211, 0.7410, 0.1060, 0.0320],
        [0.2577, 0.2651, 0.1705, 0.3067],
        [0.3360, 0.0171, 0.1869, 0.4599],
        [0.2032, 0.6235, 0.1025, 0.0708]])

✅ Output Vectors:
 tensor([[-1.2940, -0.1495, -0.3701, -0.2156, -0.9138,  1.0640, -0.0503,  0.6983],
        [-0.6700,  0.9147, -0.5610,  0.1849, -0.8407,  0.5956,  0.0283,  0.5789],
        [-0.3498,  1.4870, -0.6570,  0.4022,

'\nüß† Notes for Report\n- Formula: Attention(Q,K,V) = softmax((QK·µÄ)/‚àöd‚Çñ) V\n- Dividing by ‚àöd‚Çñ prevents extremely large dot-products,\n  which could cause the softmax to saturate and give unstable gradients.\n- The attention weight matrix shows how each query attends to keys,\n  and the weighted sum of V produces context-aware output vectors.\n'