In [6]:
# Question 1: Compute Scaled Dot-Product Attention (Python)
import numpy as np

def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    Args:
        x: array-like of scores; normalisation is applied independently to
           every slice along the last axis (i.e. row-wise for a 2-D input).

    Returns:
        np.ndarray with the same shape as ``x`` whose entries along the last
        axis are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift each row by its OWN maximum. Shifting by the global maximum leaves
    # the result mathematically unchanged, but rows far below that maximum
    # underflow to all zeros in exp() and then produce 0/0 = NaN.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: query matrix, shape (n_queries, d_k)
        K: key matrix,   shape (n_keys, d_k)
        V: value matrix, shape (n_keys, d_v)

    Returns:
        A tuple ``(attention_weights, context_vector)`` where
        attention_weights has shape (n_queries, n_keys) and
        context_vector has shape (n_queries, d_v).
    """
    # Similarity of every query with every key.
    raw_scores = np.dot(Q, K.T)

    # Divide by sqrt(d_k), with d_k read from the last axis of K,
    # then turn the scaled scores into weights with softmax.
    key_dim = K.shape[-1]
    attention_weights = softmax(raw_scores / np.sqrt(key_dim))

    # Each context row is the attention-weighted sum of the value rows.
    context_vector = np.dot(attention_weights, V)
    return attention_weights, context_vector


# ---------------- EXAMPLE ----------------
if __name__ == "__main__":
    # One query attending over two key/value pairs.
    Q = np.array([[1, 0, 1]])
    K = np.array([[1, 0, 1], [0, 1, 0]])
    V = np.array([[1, 2], [3, 4]])

    attn, context = scaled_dot_product_attention(Q=Q, K=K, V=V)

    # The query equals the first key, so that key receives the larger weight.
    print("Attention Weights:\n", attn)
    print("Context Vector:\n", context)


Attention Weights:
 [[0.76036844 0.23963156]]
Context Vector:
 [[1.47926312 2.47926312]]


In [7]:
# Question 2: Implement Simple Transformer Encoder Block (PyTorch)
import torch
import torch.nn as nn

class SimpleTransformerEncoderBlock(nn.Module):
    """Transformer encoder block: multi-head self-attention, then a feed-forward network.

    Each sub-layer's output is passed through dropout, added to the sub-layer's
    input (residual connection) and the sum is layer-normalised.
    """

    def __init__(self, d_model=128, n_heads=4, d_ff=512, dropout=0.1):
        """
        Args:
            d_model: embedding dimension of every token
            n_heads: number of attention heads
            d_ff: hidden width of the feed-forward network
            dropout: dropout probability applied to each sub-layer's output
        """
        super().__init__()

        # Self-attention; batch_first=True means tensors are (batch, seq_len, d_model).
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

        # Position-wise feed-forward network: expand to d_ff, ReLU, project back.
        ffn_layers = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        ]
        self.ffn = nn.Sequential(*ffn_layers)

        # One LayerNorm and one Dropout per sub-layer.
        self.norm1, self.norm2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
        self.dropout1, self.dropout2 = nn.Dropout(dropout), nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        Args:
            x: input of shape (batch_size, seq_len, d_model)
            attn_mask: optional mask forwarded unchanged to nn.MultiheadAttention;
                2-D (seq_len, seq_len) or 3-D (batch_size * n_heads, seq_len, seq_len)
            key_padding_mask: optional (batch_size, seq_len) mask, True at PAD positions

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Sub-layer 1: self-attention (query = key = value = x); weights are discarded.
        attended = self.self_attn(
            x, x, x,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
        )[0]
        hidden = self.norm1(x + self.dropout1(attended))

        # Sub-layer 2: feed-forward network with its own residual + LayerNorm.
        return self.norm2(hidden + self.dropout2(self.ffn(hidden)))


if __name__ == "__main__":
    # Sizes given in the assignment.
    batch_size, seq_len = 32, 10
    d_model, n_heads, d_ff = 128, 4, 512

    # Random stand-in for 32 sentences of 10 token embeddings each.
    x = torch.randn((batch_size, seq_len, d_model))

    encoder_block = SimpleTransformerEncoderBlock(d_model, n_heads, d_ff)

    # The block keeps the input shape, so this prints torch.Size([32, 10, 128]).
    out = encoder_block(x)
    print("Output shape:", out.shape)

Output shape: torch.Size([32, 10, 128])
