<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Complete_Transformer_Encoder_Layer_with_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn

class TransformerEncoderLayer(nn.Module):
    """Single post-norm Transformer encoder layer.

    The layer applies multi-head self-attention followed by a two-layer
    position-wise feed-forward network (Linear -> ReLU -> Linear). Each
    sub-layer output goes through dropout, is added to its input (residual
    connection) and is then layer-normalised.

    Args:
        embed_size: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads, passed to
            ``nn.MultiheadAttention``.
        ff_hidden_size: Width of the hidden layer of the feed-forward network.
        dropout: Dropout probability used inside the attention module and on
            both sub-layer outputs before the residual addition.
        batch_first: Forwarded to ``nn.MultiheadAttention``. With the default
            ``False`` the input layout is ``[seq_length, batch_size,
            embed_size]``; with ``True`` it is ``[batch_size, seq_length,
            embed_size]``.
    """

    def __init__(
        self,
        embed_size: int,
        num_heads: int,
        ff_hidden_size: int,
        dropout: float = 0.1,
        *,
        batch_first: bool = False,
    ):
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_size,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=batch_first,
        )

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, ff_hidden_size),
            nn.ReLU(),
            nn.Linear(ff_hidden_size, embed_size),
        )

        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the encoder layer on ``x``.

        Args:
            x: Input tensor laid out according to ``batch_first`` (see the
                class docstring).
            attn_mask: Optional mask passed unchanged to
                ``nn.MultiheadAttention``. For a boolean mask, ``True`` marks
                query/key pairs that must NOT attend to each other.
            key_padding_mask: Optional mask passed unchanged to
                ``nn.MultiheadAttention``, e.g. a boolean
                ``[batch_size, seq_length]`` tensor where ``True`` marks
                padded key positions to ignore.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # The attention weights are never used, so ask the module not to
        # compute/return the head-averaged weight matrix.
        attn_output, _ = self.attention(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.norm1(x + self.dropout(attn_output))  # Residual + norm

        # Position-wise feed-forward network
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))  # Residual + norm

        return x

# --- Example usage ---
# Builds one encoder layer, feeds it a random batch together with a random
# attention mask, and prints the shape of the result.
if __name__ == "__main__":
    embed_size = 64
    num_heads = 8
    ff_hidden_size = 256
    dropout = 0.1
    seq_length = 10
    batch_size = 5

    model = TransformerEncoderLayer(embed_size, num_heads, ff_hidden_size, dropout)

    # Input: [seq_length, batch_size, embed_size]
    x = torch.rand(seq_length, batch_size, embed_size)

    # Generate a random "allowed" mask: shape [seq_length, seq_length]
    # True = allowed, False = masked
    mask = torch.rand(seq_length, seq_length) < 0.5
    # Always let a position attend to itself. Without this, a row can end up
    # with every key masked, which makes the attention softmax produce NaN.
    mask = mask | torch.eye(seq_length, dtype=torch.bool)
    # nn.MultiheadAttention uses the opposite convention for boolean masks
    # (True = masked), so invert before passing it in.
    attn_mask = ~mask

    output = model(x, attn_mask=attn_mask)
    print("Transformer Encoder Output Shape:", output.shape)