<a href="https://colab.research.google.com/github/Y326s/ECE6397HW/blob/main/Transformer_Programming_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
TRANSFORMER ARCHITECTURE ASSIGNMENT
=====================================
Learning Objective: Understand the key components of transformer models

In this assignment, you'll implement simplified versions of the core
transformer components: Self-Attention, Multi-Head Attention, and a
basic Transformer Block.

PART 1: Self-Attention Mechanism
---------------------------------------------
The self-attention mechanism allows each position in a sequence to attend
to all positions. It computes attention scores using Query, Key, and Value matrices.

Formula: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V

Complete the self_attention() function below.
"""

import numpy as np

def softmax(x):
    """Return the softmax of ``x`` taken over its last axis.

    The per-row maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but keeps ``np.exp`` from overflowing on
    large inputs.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=-1, keepdims=True)
    return numerators / denominators

def self_attention(Q, K, V):
    """
    Compute scaled dot-product attention.

    Formula: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V

    Args:
        Q: Query matrix of shape (seq_len, d_k)
        K: Key matrix of shape (seq_len, d_k)
        V: Value matrix of shape (seq_len, d_v)

    Returns:
        output: Attention output of shape (seq_len, d_v)
        attention_weights: Attention weights of shape (seq_len, seq_len);
            row i holds the weights query i assigns to every key and
            sums to 1.
    """
    d_k = Q.shape[-1]

    # Step 1: Similarity of every query with every key (QK^T).
    scores = np.matmul(Q, K.T)

    # Step 2: Scale by sqrt(d_k). Dot products grow in magnitude with d_k;
    # dividing keeps the softmax away from its saturated regime.
    scaled_scores = scores / np.sqrt(d_k)

    # Step 3: Normalise each query's scores into a probability distribution.
    attention_weights = softmax(scaled_scores)

    # Step 4: Each output row is the attention-weighted average of V's rows.
    output = np.matmul(attention_weights, V)

    return output, attention_weights


In [None]:

"""
PART 2: Multi-Head Attention
-----------------------------------------
Multi-head attention runs multiple attention mechanisms in parallel,
allowing the model to focus on different aspects of the input.

Complete the multi_head_attention() function below.
"""

def multi_head_attention(X, num_heads, d_model):
    """
    Compute multi-head attention (simplified version).

    Each head projects X into its own query/key/value subspace of width
    d_model // num_heads, runs self_attention there, and the head outputs
    are concatenated and mixed by a final linear projection. All projection
    matrices are random (not learned) and are drawn after seeding NumPy's
    global RNG with 42, so repeated calls with the same input return the
    same output.

    Args:
        X: Input matrix of shape (seq_len, d_model)
        num_heads: Number of attention heads
        d_model: Model dimension (must be divisible by num_heads)

    Returns:
        output: Multi-head attention output of shape (seq_len, d_model)

    Raises:
        AssertionError: If d_model is not divisible by num_heads.
    """
    # Step 1 & 2: Verify divisibility and compute the per-head dimension.
    assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
    d_k = d_model // num_heads

    # Step 3: Initialize projection matrices (random matrices for simplicity).
    # NOTE: this reseeds the global NumPy RNG as a side effect.
    np.random.seed(42)
    W_q = [np.random.randn(d_model, d_k) for _ in range(num_heads)]
    W_k = [np.random.randn(d_model, d_k) for _ in range(num_heads)]
    W_v = [np.random.randn(d_model, d_k) for _ in range(num_heads)]

    # Step 4: Run attention independently in each head's subspace.
    head_outputs = []
    for head_W_q, head_W_k, head_W_v in zip(W_q, W_k, W_v):
        # Project X to get Q, K, V for this head: (seq_len, d_k) each.
        Q = np.matmul(X, head_W_q)
        K = np.matmul(X, head_W_k)
        V = np.matmul(X, head_W_v)

        # Apply attention; the per-head weights are not needed here.
        head_output, _ = self_attention(Q, K, V)
        head_outputs.append(head_output)

    # Step 5: Concatenate heads along the feature axis -> (seq_len, d_model).
    multi_head_output = np.concatenate(head_outputs, axis=-1)

    # Step 6: Final linear projection that mixes information across heads.
    W_o = np.random.randn(d_model, d_model)
    output = np.matmul(multi_head_output, W_o)

    return output


In [None]:


"""
PART 3: Testing Your Implementation
------------------------------------------------
Write test code to verify your implementations work correctly.

TODO:
1. Create sample input data (e.g., seq_len=4, d_model=8)
2. Test self_attention with Q, K, V matrices
3. Verify attention weights sum to 1
4. Test multi_head_attention with 2 heads
5. Print shapes and sample outputs
"""

def test_transformer_components():
    """
    Exercise self_attention and multi_head_attention on small random inputs.

    Prints the shapes and a sample of each result, and asserts that:
    - self_attention returns an output of shape (seq_len, d_k) and weights
      of shape (seq_len, seq_len),
    - the attention weights sum to 1 for each query position,
    - multi_head_attention returns an output of shape (seq_len, d_model).

    Raises:
        AssertionError: If any of the checks above fails (including when
            the functions under test are still unimplemented stubs).
    """
    print("Testing Transformer Components")
    print("=" * 50)

    # Test 1: Self-Attention
    print("\nTest 1: Self-Attention")
    seq_len = 4
    d_k = 8

    # Create sample Q, K, V matrices (seeded so the printout is reproducible).
    np.random.seed(42)
    Q = np.random.randn(seq_len, d_k)
    K = np.random.randn(seq_len, d_k)
    V = np.random.randn(seq_len, d_k)

    output, attention_weights = self_attention(Q, K, V)
    assert output is not None and attention_weights is not None, (
        "self_attention returned None"
    )

    row_sums = attention_weights.sum(axis=1)
    print(f"Output shape: {output.shape}")
    print(f"Attention weights shape: {attention_weights.shape}")
    print(f"Attention weights sum (should be ~1.0): {row_sums}")
    print(f"Sample output (first row): {output[0]}")

    assert output.shape == (seq_len, d_k), "unexpected attention output shape"
    assert attention_weights.shape == (seq_len, seq_len), (
        "unexpected attention weights shape"
    )
    assert np.allclose(row_sums, 1.0), "attention weights must sum to 1 per row"

    # Test 2: Multi-Head Attention
    print("\nTest 2: Multi-Head Attention")
    d_model = 8
    num_heads = 2
    X = np.random.randn(seq_len, d_model)

    mha_output = multi_head_attention(X, num_heads, d_model)
    assert mha_output is not None, "multi_head_attention returned None"

    print(f"Input shape: {X.shape}")
    print(f"Multi-head output shape: {mha_output.shape}")
    print(f"Sample output (first row): {mha_output[0]}")

    assert mha_output.shape == (seq_len, d_model), (
        "unexpected multi-head output shape"
    )



In [None]:


"""
Positional Encoding
-----------------------------------------
Implement a simple positional encoding function that adds position
information to input embeddings.

Formula:
PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
"""

def positional_encoding(seq_len, d_model):
    """
    Generate the sinusoidal positional encoding matrix.

    Formula:
        PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Args:
        seq_len: Sequence length
        d_model: Model dimension (odd values are supported; the last
            column is then a sine column without a cosine partner)

    Returns:
        PE: Positional encoding matrix of shape (seq_len, d_model)
    """
    # Column vector of positions so it broadcasts against the frequencies.
    positions = np.arange(seq_len, dtype=float)[:, np.newaxis]

    # One frequency per (sin, cos) pair, indexed by the even dimension 2i.
    even_dims = np.arange(0, d_model, 2, dtype=float)
    inv_freq = 1.0 / np.power(10000.0, even_dims / d_model)

    angles = positions * inv_freq  # shape (seq_len, ceil(d_model / 2))

    PE = np.zeros((seq_len, d_model))
    PE[:, 0::2] = np.sin(angles)
    # With odd d_model there is one fewer odd column than even columns.
    PE[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return PE



In [None]:

"""
SUBMISSION INSTRUCTIONS
-----------------------
1. Complete all TODO sections
2. Test your code with the test function
3. Answer these questions in comments:
   a) Why do we scale attention scores by sqrt(d_k)?
   b) What is the advantage of multi-head attention over single-head?
   c) Why do transformers need positional encoding?

4. Include example output from your tests
"""

if __name__ == "__main__":
    # Script entry point: run the component checks.
    test_transformer_components()

    # Answers to the assignment questions
    # -----------------------------------
    # Q1: Why do we scale attention scores by sqrt(d_k)?
    # ANSWER: The dot product of two d_k-dimensional vectors grows in
    #   magnitude with d_k. Large scores push the softmax towards a one-hot
    #   output where gradients are tiny; dividing by sqrt(d_k) keeps the
    #   scores in a range where the softmax stays well-behaved.
    #
    # Q2: What is the advantage of multi-head attention over single-head?
    # ANSWER: Each head attends in its own projected subspace, so different
    #   heads can capture different relationships between positions at the
    #   same time, instead of averaging them into a single attention pattern.
    #
    # Q3: Why do transformers need positional encoding?
    # ANSWER: Self-attention treats its input as an unordered set: permuting
    #   the positions just permutes the outputs. Positional encodings inject
    #   order information so the model can distinguish token positions.