# Assignment Notebook: Attention + Transformer Encoder
This notebook includes solutions for:

- **Q1: Scaled Dot-Product Attention (NumPy)**
- **Q2: Simple Transformer Encoder Block (PyTorch)**

Run each cell with **Shift + Enter**.

## Q1. Scaled Dot-Product Attention (NumPy)

In [1]:
import numpy as np

def softmax(x, axis=-1):
    """Return the numerically stable softmax of ``x`` along ``axis``.

    Parameters
    ----------
    x : array_like
        Input scores. Lists and tuples are converted with ``np.asarray``.
    axis : int, optional
        Axis along which the values are normalised. Defaults to the last
        axis, which is the behaviour callers relied on before this
        parameter existed.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-slice maximum does not change the result
    # mathematically, but it keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Parameters
    ----------
    Q : numpy.ndarray
        Queries of shape ``(..., n_q, d_k)``.
    K : numpy.ndarray
        Keys of shape ``(..., n_k, d_k)``.
    V : numpy.ndarray
        Values of shape ``(..., n_k, d_v)``.

    Leading ``...`` dimensions (batch, heads) are optional and follow
    NumPy's matmul broadcasting rules.

    Returns
    -------
    tuple of numpy.ndarray
        ``(attention_weights, context)`` where ``attention_weights`` has
        shape ``(..., n_q, n_k)`` with each row summing to 1, and
        ``context`` has shape ``(..., n_q, d_v)``.
    """
    d_k = K.shape[-1]
    # Transpose only the last two axes. ``K.T`` reverses *every* axis, which
    # is the same thing for 2-D keys but scrambles stacked (batched or
    # multi-head) keys.
    scores = Q @ np.swapaxes(K, -1, -2)
    # Dividing by sqrt(d_k) keeps the score magnitude independent of the
    # key dimension so the softmax does not saturate for large d_k.
    scaled_scores = scores / np.sqrt(d_k)
    attention_weights = softmax(scaled_scores)
    context = attention_weights @ V
    return attention_weights, context

# Worked example: one query (d_k = 3) attending over two key/value pairs.
Q = np.array([[1.0, 0.0, 1.0]])
K = np.array([
    [1.0, 0.0, 1.0],
    [0.0, 1.0, 0.0],
])
V = np.array([
    [5.0, 5.0],
    [1.0, 1.0],
])

weights, context = scaled_dot_product_attention(Q=Q, K=K, V=V)

# Bare expression so the notebook cell displays both arrays.
(weights, context)

(array([[0.76036844, 0.23963156]]), array([[4.04147377, 4.04147377]]))

## Q2. Simple Transformer Encoder Block (PyTorch)

In [2]:
import torch
import torch.nn as nn

# Hyperparameters used by the encoder block and its smoke test below
d_model: int = 128    # embedding width of every token vector
num_heads: int = 8    # attention heads (128 / 8 = 16 features per head)
dim_ff: int = 512     # hidden width of the feed-forward sub-layer
dropout: float = 0.1  # dropout probability

class SimpleTransformerEncoderBlock(nn.Module):
    """A single post-norm Transformer encoder block.

    The block applies multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads passed to
            ``nn.MultiheadAttention``.
        dim_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside the attention module and
            after each sub-layer.
    """

    def __init__(self, d_model, num_heads, dim_ff, dropout=0.1):
        super().__init__()
        # batch_first=True makes the attention module consume and return
        # tensors laid out as (batch, seq_len, d_model).
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )
        # Position-wise feed-forward network: expand, ReLU, project back.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the block on ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.
            attn_mask: Optional mask forwarded to ``nn.MultiheadAttention``
                (for example a causal mask of shape ``(seq_len, seq_len)``).
                For a boolean mask, ``True`` marks positions that may not be
                attended to.
            key_padding_mask: Optional mask of shape ``(batch, seq_len)``
                forwarded to ``nn.MultiheadAttention``. For a boolean mask,
                ``True`` marks padding keys to ignore.

        Returns:
            A tuple ``(output, attn_weights)``: ``output`` has the same
            shape as ``x``; ``attn_weights`` has shape
            ``(batch, seq_len, seq_len)`` and is averaged over heads.
        """
        # Self-attention: queries, keys and values all come from x.
        attn_output, attn_weights = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
        )
        # Residual connection, then normalisation (post-norm).
        x = x + self.dropout1(attn_output)
        x = self.norm1(x)
        ffn_output = self.ffn(x)
        x = x + self.dropout2(ffn_output)
        x = self.norm2(x)
        return x, attn_weights

# Smoke test: push a random batch (32 sequences of length 10) through the
# block and inspect the output shapes.
batch_size, seq_len = 32, 10
x = torch.randn(batch_size, seq_len, d_model)

# No .eval() call is made, so the module stays in its default training mode
# and dropout is active; only the shapes are inspected here.
encoder = SimpleTransformerEncoderBlock(
    d_model=d_model,
    num_heads=num_heads,
    dim_ff=dim_ff,
    dropout=dropout,
)
output, attn_weights = encoder(x)

# Bare expression so the notebook cell displays both shapes.
(output.shape, attn_weights.shape)

(torch.Size([32, 10, 128]), torch.Size([32, 10, 10]))