In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
# Choose the compute backend in priority order: CUDA GPU, then Apple MPS, then CPU.
# The conditional expression short-circuits, so the MPS probe only runs when CUDA is absent.
device = (
  'cuda' if torch.cuda.is_available()
  else 'mps' if torch.backends.mps.is_available()
  else 'cpu'
)
print(device)

cpu


# Transformer Architecture

## Positional encodings

In [3]:
class PositionalEncodings(nn.Module):
  """Add learnable positional embeddings to token embeddings, then apply dropout.

  Args:
    max_len: number of rows in the positional table, i.e. the longest
      sequence length `forward` accepts.
    embed_dim: width of each positional vector; expected to equal the last
      dimension of the token embeddings passed to `forward`.
    dropout: probability handed to `nn.Dropout`, applied after the addition.
  """

  def __init__(self, max_len, embed_dim, dropout=0.1):
    super().__init__()
    # pos_embed: one trainable vector per position.
    # Shape = [max_len, embed_dim]
    # Example: if max_len=500 and embed_dim=512 → [500, 512]
    # Values are drawn from torch.randn and scaled by 0.02.
    self.pos_embed = nn.Parameter(torch.randn(max_len, embed_dim) * 0.02)
    self.dropout = nn.Dropout(dropout)

  def forward(self, X):
    """
    Add the first `seq_len` positional vectors to X and apply dropout.

    X: token embeddings
    Shape = [batch_size, seq_len, embed_dim]

    self.pos_embed[:seq_len]:
        - seq_len = X.size(1)
        - The first `seq_len` rows of pos_embed
        - Shape = [seq_len, embed_dim]

    Broadcasting when adding:
        - X: [batch_size, seq_len, embed_dim]
        - pos_embed[:seq_len]: [seq_len, embed_dim]
        - Broadcast as [1, seq_len, embed_dim] → [batch_size, seq_len, embed_dim]

    Returns:
        Tensor of shape [batch_size, seq_len, embed_dim]

    Raises:
        RuntimeError: if seq_len is larger than the number of rows in pos_embed.
    """
    seq_len = X.size(1)
    max_len = self.pos_embed.size(0)
    # Slicing past the end of the table silently truncates. Without this check a
    # 1-row table would broadcast over every position, and larger tables would
    # only fail later with a generic size-mismatch error.
    if seq_len > max_len:
      raise RuntimeError(
        f"sequence length {seq_len} exceeds max_len {max_len} of the positional embedding table"
      )
    return self.dropout(X + self.pos_embed[:seq_len])



In [4]:
# Shape check: run a random batch through the module and display the output shape.
max_len, embed_dim = 500, 512
pos_embedding = PositionalEncodings(max_len, embed_dim)

# Stand-in token embeddings: [batch_size=256, seq_len=max_len, embed_dim]
embeddings = torch.randn(256, max_len, embed_dim)
embeddings_with_pos = pos_embedding(embeddings)
embeddings_with_pos.shape




torch.Size([256, 500, 512])