In [1]:
import numpy as np

def softmax(x, axis=-1):
    """Numerically stable softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so the
    largest exponent evaluated is exp(0) = 1. This prevents overflow for
    large inputs and leaves the mathematical result unchanged.

    Returns an array of the same shape as ``x`` whose entries along ``axis``
    sum to 1.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    weights = np.exp(x - peak)
    normalizer = np.sum(weights, axis=axis, keepdims=True)
    return weights / normalizer

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Q: numpy array of shape (..., seq_q, d_k)
    K: numpy array of shape (..., seq_k, d_k)
    V: numpy array of shape (..., seq_k, d_v)
       The leading "..." dimensions are batch-like (for example a batch
       axis, or batch and head axes) and are combined by np.matmul
       broadcasting. The (batch_size, seq_len, d_k) layout is one such case.
    mask: optional array-like broadcastable to (..., seq_q, seq_k)
          with 0 for valid and 1 (or True) for masked positions
    Returns:
        attention_weights: (..., seq_q, seq_k)
        context: (..., seq_q, d_v)

    Note: masking adds -1e9 to the masked scores. A row in which every
    position is masked still produces weights that sum to 1, because
    softmax normalizes each row.
    """
    d_k = K.shape[-1]

    # Swap only the last two axes of K so that inputs of any rank >= 2 work:
    # (..., seq_q, d_k) @ (..., d_k, seq_k) -> (..., seq_q, seq_k)
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    if mask is not None:
        # np.asarray lets nested lists (not only ndarrays) be used as masks;
        # masked positions get a large negative value added to their score
        scores = scores + (np.asarray(mask) * -1e9)

    attention_weights = softmax(scores, axis=-1)  # along seq_k

    # (..., seq_q, seq_k) @ (..., seq_k, d_v) -> (..., seq_q, d_v)
    context = np.matmul(attention_weights, V)

    return attention_weights, context


In [2]:
# Demo: run attention on random inputs and check the output shapes.
batch_size, seq_len, d_k = 2, 4, 8

# A seeded Generator makes this cell produce the same arrays on every run.
rng = np.random.default_rng(0)
Q = rng.standard_normal((batch_size, seq_len, d_k))
K = rng.standard_normal((batch_size, seq_len, d_k))
V = rng.standard_normal((batch_size, seq_len, d_k))

attn_w, ctx = scaled_dot_product_attention(Q, K, V)

# Each query's attention weights must form a probability distribution.
assert np.allclose(attn_w.sum(axis=-1), 1.0)

print(attn_w.shape)  # (2, 4, 4)
print(ctx.shape)     # (2, 4, 8)


(2, 4, 4)
(2, 4, 8)


In [3]:
import torch
import torch.nn as nn

class TransformerEncoderBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention, then feed-forward.

    The output of each sublayer goes through dropout, is added to the
    sublayer's input (residual connection), and the sum is layer-normalized.

    Args:
        d_model: feature size of the input and of the output.
        num_heads: number of attention heads (nn.MultiheadAttention requires
            d_model to be divisible by it).
        d_ff: hidden width of the feed-forward network.
        dropout: dropout probability applied to each sublayer's output.
    """

    def __init__(self, d_model=512, num_heads=8, d_ff=2048, dropout=0.1):
        super().__init__()
        # Multi-head self-attention
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            batch_first=True  # (batch, seq, feature)
        )

        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

        # LayerNorm and Dropout for the two sublayers
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        x: (batch_size, seq_len, d_model)
        attn_mask: optional attention mask of shape (seq_len, seq_len) or
            (batch_size * num_heads, seq_len, seq_len); in a boolean mask
            True marks positions that may not be attended to, a float mask
            is added to the attention scores
        key_padding_mask: (batch_size, seq_len) with True for PAD tokens
        Returns:
            tensor of shape (batch_size, seq_len, d_model)
        """
        # Multi-head self-attention (Q=K=V=x). The attention weights are not
        # used here, so need_weights=False avoids computing and returning them.
        attn_output, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )
        # Add & Norm
        x = self.norm1(x + self.dropout1(attn_output))

        # Feed-forward
        ffn_output = self.ffn(x)
        # Add & Norm
        x = self.norm2(x + self.dropout2(ffn_output))

        return x


In [5]:
import torch
import torch.nn as nn

class TransformerEncoderBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention, then feed-forward.

    The output of each sublayer goes through dropout, is added to the
    sublayer's input (residual connection), and the sum is layer-normalized.

    Note: an earlier cell of this notebook defines a class with the same
    name. Running this cell rebinds the name to this version, whose defaults
    are d_model=128 and d_ff=512.

    Args:
        d_model: feature size of the input and of the output.
        num_heads: number of attention heads (nn.MultiheadAttention requires
            d_model to be divisible by it).
        d_ff: hidden width of the feed-forward network.
        dropout: dropout probability applied to each sublayer's output.
    """

    def __init__(self, d_model=128, num_heads=8, d_ff=512, dropout=0.1):
        super().__init__()

        # a) Initialize Multi-Head Self-Attention with d_model = 128 and h = 8
        #    (these are the defaults; inputs are laid out (batch, seq, feature))
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            batch_first=True
        )

        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

        # b) Residual connections + Layer Normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        x: (batch_size, seq_len, d_model)
        attn_mask: optional attention mask of shape (seq_len, seq_len) or
            (batch_size * num_heads, seq_len, seq_len); in a boolean mask
            True marks positions that may not be attended to, a float mask
            is added to the attention scores
        key_padding_mask: (batch_size, seq_len) with True for PAD tokens
        Returns:
            tensor of shape (batch_size, seq_len, d_model)
        """

        # ---- Multi-Head Self-Attention ----
        # The attention weights are not used here, so need_weights=False
        # avoids computing and returning them.
        attn_output, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )

        # b) Residual + LayerNorm
        x = self.norm1(x + self.dropout1(attn_output))

        # ---- Feed-Forward Network ----
        ffn_output = self.ffn(x)

        # b) Residual + LayerNorm
        x = self.norm2(x + self.dropout2(ffn_output))

        return x


In [6]:
# Test with batch_size = 32, seq_len = 10, d_model = 128
batch_size = 32
seq_len = 10
d_model = 128

# Seed before building the module and the input: weight initialization, the
# random input and dropout (the module is in its default training mode) all
# draw from torch's global RNG, so this makes the cell repeatable.
torch.manual_seed(0)

encoder_block = TransformerEncoderBlock(d_model=d_model, num_heads=8, d_ff=512)

x = torch.randn(batch_size, seq_len, d_model)

out = encoder_block(x)

# The encoder block must preserve the input shape.
assert out.shape == x.shape

print(out.shape)


torch.Size([32, 10, 128])
