In [1]:


import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Simple example: computing attention scores
# Imagine we have 3 words: ["cat", "sat", "mat"]
# We want to know how much "cat" attends to "sat" and "mat"

# Represent words as vectors (embeddings)
word_embeddings = torch.tensor([
    [1.0, 2.0],  # "cat"
    [0.5, 1.5],  # "sat"
    [1.5, 0.5],  # "mat"
])

# Query: what we're looking for (the word "cat")
query = word_embeddings[0]  # "cat"

# Keys: what we're comparing against (all words)
keys = word_embeddings

# Compute attention scores: dot product between query and each key
attention_scores = torch.matmul(query, keys.T)

print("Word embeddings:")
print(word_embeddings)
print(f"\nQuery (cat): {query}")
print(f"\nAttention scores:")
print(f"  cat -> cat: {attention_scores[0]:.2f}")
print(f"  cat -> sat: {attention_scores[1]:.2f}")
print(f"  cat -> mat: {attention_scores[2]:.2f}")

# Normalize with softmax to get attention weights
attention_weights = F.softmax(attention_scores, dim=-1)
print(f"\nAttention weights (after softmax):")
print(f"  cat -> cat: {attention_weights[0]:.3f}")
print(f"  cat -> sat: {attention_weights[1]:.3f}")
print(f"  cat -> mat: {attention_weights[2]:.3f}")
print(f"\nSum: {attention_weights.sum():.3f} (should be 1.0)")

Word embeddings:
tensor([[1.0000, 2.0000],
        [0.5000, 1.5000],
        [1.5000, 0.5000]])

Query (cat): tensor([1., 2.])

Attention scores:
  cat -> cat: 5.00
  cat -> sat: 3.50
  cat -> mat: 2.50

Attention weights (after softmax):
  cat -> cat: 0.766
  cat -> sat: 0.171
  cat -> mat: 0.063

Sum: 1.000 (should be 1.0)


In [8]:
# Implementing scaled dot-product attention
def scaled_dot_product_attention(Q, K, V):
    """
    Compute scaled dot-product attention
    
    Args:
        Q: Query tensor [batch_size, seq_len, d_k]
        K: Key tensor [batch_size, seq_len, d_k]
        V: Value tensor [batch_size, seq_len, d_v]
    
    Returns:
        Output tensor and attention weights
    """
    d_k = Q.size(-1)
    
    # Compute attention scores: QK^T
    scores = torch.matmul(Q, K.transpose(-2, -1))
    
    # Scale by sqrt(d_k)
    scores = scores / np.sqrt(d_k)
    
    # Apply softmax to get attention weights
    attention_weights = F.softmax(scores, dim=-1)
    
    # Apply weights to values
    output = torch.matmul(attention_weights, V)
    
    return output, attention_weights

# Example usage
batch_size, seq_len, d_k = 1, 4, 8

Q = torch.randn(batch_size, seq_len, d_k)
K = torch.randn(batch_size, seq_len, d_k)
v = torch.randn(batch_size, seq_len, d_k)
output, attn_weights = scaled_dot_product_attention(Q, K, V)

print(f"Input shape - Q: {Q.shape}, K: {K.shape}, V: {V.shape}")
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {attn_weights.shape}")
print(f"\nAttention weights (first sequence):")
print(attn_weights[0])
print(f"\nEach row sums to 1: {attn_weights[0].sum(dim=-1)}")


NameError: name 'V' is not defined