# FUNCTIONS

In [6]:
import numpy as np

In [7]:
def softmax(x, axis=-1):
    """Apply softmax to the scores in ``x`` along ``axis``.

    Args:
        x: Array of scores.
        axis: Axis along which the scores are normalized (default: last).

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    # Shifting by the per-slice maximum leaves the result unchanged but
    # keeps the largest exponent at 0, so np.exp cannot overflow.
    peak = np.max(x, axis=axis, keepdims=True)
    numerators = np.exp(x - peak)

    # Divide by the per-slice total to turn the exponentials into probabilities.
    denominators = np.sum(numerators, axis=axis, keepdims=True)
    return numerators / denominators


def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute scaled dot-product attention.

    Formula: Attention(Q, K, V) = softmax(Q · K^T / sqrt(d_k)) · V

    Any number of leading batch dimensions is supported (none, a single
    batch axis, or e.g. ``(batch, heads)``), as long as they broadcast
    under ``np.matmul`` rules.

    Args:
        Q: Queries, shape (..., seq_len_q, d_k) - what to look for.
        K: Keys, shape (..., seq_len_k, d_k) - what is available.
        V: Values, shape (..., seq_len_k, d_v) - info to retrieve.
        mask: Optional mask broadcastable to (..., seq_len_q, seq_len_k).
            Truthy entries mark positions that may be attended to;
            falsy entries are suppressed.

    Returns:
        A tuple ``(output, attention_weights)``:
            output: Weighted sum of values, shape (..., seq_len_q, d_v).
            attention_weights: Probabilities for each query-key pair,
                shape (..., seq_len_q, seq_len_k).

    Note:
        Masked scores are replaced by -1e9 rather than -inf, so a query row
        whose positions are all masked ends up with uniform weights
        instead of NaN.
    """
    # Step 1: raw similarity between every query and every key (Q · K^T).
    # Swapping only the last two axes (instead of a fixed 3-D permutation)
    # keeps this valid for any number of leading batch dimensions.
    attention_scores = np.matmul(Q, np.swapaxes(K, -1, -2))

    # Step 2: scale by sqrt(d_k) so the dot products do not grow with the
    # key dimension and push softmax into regions with tiny gradients.
    d_k = K.shape[-1]
    scaled_scores = attention_scores / np.sqrt(d_k)

    # Step 3: optionally suppress disallowed positions with a very large
    # negative score, which softmax maps to ~0 probability.
    if mask is not None:
        scaled_scores = np.where(mask, scaled_scores, -1e9)

    # Step 4: normalize over the key axis so each query row sums to 1.
    attention_weights = softmax(scaled_scores, axis=-1)

    # Step 5: each output position is the attention-weighted average of
    # the value vectors.
    output = np.matmul(attention_weights, V)

    return output, attention_weights

# DEMO 

In [None]:

def main():
    """Run a small demo of scaled dot-product attention, unmasked and causal."""
    # Fixed seed so the printed numbers are reproducible.
    np.random.seed(42)

    # Problem sizes: d_k is the query/key width, d_v the value width.
    batch_size, seq_len = 2, 4
    d_k = 8
    d_v = 8

    for label, value in (
        ("Batch size", batch_size),
        ("Sequence length", seq_len),
        ("Key/Query dimension (d_k)", d_k),
        ("Value dimension (d_v)", d_v),
    ):
        print(f"{label}: {value}")

    # Small random inputs, drawn in Q, K, V order.
    scale = 0.1
    Q = np.random.randn(batch_size, seq_len, d_k) * scale
    K = np.random.randn(batch_size, seq_len, d_k) * scale
    V = np.random.randn(batch_size, seq_len, d_v) * scale

    for name, tensor in (("Q", Q), ("K", K), ("V", V)):
        print(f"{name} shape: {tensor.shape}")

    # --- Example 1: every position may attend to every other position ---
    print(" Attention without mask")

    output, attention_weights = scaled_dot_product_attention(Q, K, V)

    print(f"\nOutput shape: {output.shape}")
    print(f"Attention weights shape: {attention_weights.shape}")

    first_batch_weights = attention_weights[0]
    print("\nAttention weights (first batch):")
    print(first_batch_weights)
    print(f"Row sums: {first_batch_weights.sum(axis=1)}")

    print("\nOutput (first batch, first sequence position):")
    print(output[0, 0])

    # --- Example 2: causal mask hides future positions ---
    print("\n" + "-" * 60)
    print("Example 2: Attention with causal mask")
    print("(Each position can only attend to itself and previous positions)")

    # Lower-triangular boolean matrix: True where attention is allowed.
    lower_triangle = np.tril(np.ones((seq_len, seq_len)), k=0).astype(bool)
    # Repeat the same mask (as a read-only view) for every batch element.
    causal_mask = np.broadcast_to(
        lower_triangle, (batch_size, seq_len, seq_len)
    )

    print("\nCausal mask (first batch):")
    print(causal_mask[0].astype(int))

    _, attention_weights_masked = scaled_dot_product_attention(
        Q, K, V, mask=causal_mask
    )

    print("\nAttention weights with causal mask (first batch):")
    print(attention_weights_masked[0])
    print("\nNote: Upper triangle is 0 (masked positions)")
    
    


# Run the demo only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Batch size: 2
Sequence length: 4
Key/Query dimension (d_k): 8
Value dimension (d_v): 8
Q shape: (2, 4, 8)
K shape: (2, 4, 8)
V shape: (2, 4, 8)
 Attention without mask

Output shape: (2, 4, 8)
Attention weights shape: (2, 4, 4)

Attention weights (first batch):
[[0.25239951 0.24751685 0.25106345 0.24902019]
 [0.24893051 0.25202596 0.24813962 0.25090392]
 [0.24710991 0.25421605 0.24841189 0.25026215]
 [0.25241656 0.24979312 0.24903318 0.24875714]]
Row sums: [1. 1. 1. 1.]

Output (first batch, first sequence position):
[-0.02728092  0.00473303 -0.04275996 -0.07967607  0.03838312  0.06356303
 -0.08637104  0.06873783]

------------------------------------------------------------
Example 2: Attention with causal mask
(Each position can only attend to itself and previous positions)

Causal mask (first batch):
[[1 0 0 0]
 [1 1 0 0]
 [1 1 1 0]
 [1 1 1 1]]

Attention weights with causal mask (first batch):
[[1.         0.         0.         0.        ]
 [0.49691046 0.50308954 0.         0.     