<div align="center">
  <img src="https://www.researchgate.net/profile/Ehsan-Amjadian/publication/352239001/figure/fig1/AS:1033334390013952@1623377525434/Detailed-view-of-a-transformer-encoder-block-It-first-passes-the-input-through-an.jpg" alt="Transformer Encoder" width="300">
</div>

In [12]:
import torch
import torch.nn as nn

In [None]:
# Execute the sibling notebooks inside this kernel so that the names they define
# become available here. The cells below rely on Embedding, PositionalEncoding
# and MultiHeadAttention, which are not defined in this notebook -- presumably
# they come from these three notebooks respectively (verify).
# NOTE(review): %run executes every cell of the target notebook, so any demo or
# print cells in them run here too; paths are given relative to the working dir.
%run 1_Embeddings.ipynb
%run 2_Positional_Encoding.ipynb
%run 4_Multihead_Attention.ipynb

In [14]:
class TransformerBlock(nn.Module):
    """Single post-norm Transformer encoder block.

    Runs self-attention followed by a position-wise feed-forward network.
    Each sub-layer's output is regularised with dropout, added to the
    sub-layer's input (residual connection) and then layer-normalised.

    Args:
        embed_dim: size of the feature (last) dimension; used as the width of
            both LayerNorms and as the in/out width of the feed-forward net.
        num_heads: forwarded to ``MultiHeadAttention``.
        ff_hidden_dim: width of the hidden layer of the feed-forward net.
        dropout: drop probability forwarded to ``MultiHeadAttention`` and used
            for the dropout on the attention output and on the feed-forward
            output.
    """

    def __init__(self, embed_dim, num_heads, ff_hidden_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        # Linear -> ReLU -> Linear, with the sub-layer's output dropout as the
        # final stage. Because the dropout lives here, forward() must not
        # apply another one to the feed-forward output.
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden_dim),
            nn.ReLU(),
            nn.Linear(ff_hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )
        self.norm2 = nn.LayerNorm(embed_dim)
        # Dropout applied to the attention sub-layer output only.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``.

        Args:
            x: input tensor whose last dimension is ``embed_dim``. The layout
                of the leading dimensions is whatever ``MultiHeadAttention``
                expects -- presumably (batch, seq_len, embed_dim); confirm.

        Returns:
            The output of ``norm2``. It has the same shape as ``x`` provided
            the attention module preserves the shape of its input.
        """
        # Attention sub-layer: dropout -> residual add -> LayerNorm.
        attn_out = self.attn(x)
        x = self.norm1(x + self.dropout(attn_out))

        # Feed-forward sub-layer. self.ff already ends with nn.Dropout, so the
        # output is added to the residual as-is. Passing it through
        # self.dropout as well would stack two dropouts and raise the
        # effective drop rate from p to 1 - (1 - p)**2.
        ff_out = self.ff(x)
        return self.norm2(x + ff_out)

In [15]:
class TransformerEncoder(nn.Module):
    """Embedding + positional encoding, a stack of TransformerBlocks, and a final LayerNorm.

    Args:
        max_seq_len: forwarded to ``PositionalEncoding`` as its second argument
        vocab_size: size of vocabulary, forwarded to ``Embedding``
        embed_dim: dimension of embedding
        num_heads: number of heads in multihead attention
        ff_hidden_dim: feed forward hidden layer dimension
        num_layers: number of encoder layers (``TransformerBlock`` instances)
        dropout: dropout probability forwarded to every ``TransformerBlock``

    Returns:
        out: output of the encoder, i.e. the result of the final LayerNorm
    """

    def __init__(self, max_seq_len, vocab_size, embed_dim, num_heads, ff_hidden_dim, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_seq_len)

        # Build the stack one block at a time; every block gets the same settings.
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                TransformerBlock(embed_dim, num_heads, ff_hidden_dim, dropout)
            )

        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Embed ``x``, add positional information, run all blocks, then normalise."""
        hidden = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)

In [16]:
# Hyper-parameters handed to the encoder constructor below.
vocab_size = 1000
max_seq_len = 200
embed_dim = 512
num_heads = 8
ffn_hidden = 2048
num_layers = 6
drop_prob = 0.2

# Not consumed by the constructor call in this cell.
batch_size = 32

# Keyword arguments make the mapping onto the constructor signature explicit.
encoder = TransformerEncoder(
    max_seq_len=max_seq_len,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_hidden_dim=ffn_hidden,
    num_layers=num_layers,
    dropout=drop_prob,
)

In [18]:
# Bare expression: the notebook displays the module's repr (its sub-module tree).
encoder

TransformerEncoder(
  (embedding): Embedding(
    (embed): Embedding(1000, 512)
  )
  (pos_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-5): 6 x TransformerBlock(
      (attn): MultiHeadAttention(
        (q_linear): Linear(in_features=512, out_features=512, bias=True)
        (k_linear): Linear(in_features=512, out_features=512, bias=True)
        (v_linear): Linear(in_features=512, out_features=512, bias=True)
        (out_proj): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=512, bias=True)
        (3): Dropout(p=0.2, inplace=False)
      )
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

In [20]:
# Add up the element counts of every parameter that has requires_grad set.
total_params = sum(
    map(torch.Tensor.numel, filter(lambda weight: weight.requires_grad, encoder.parameters()))
)
print(f"Total Trainable Parameters: {total_params}")

Total Trainable Parameters: 19427328
