# Transformer Architecture — Python Demo
This notebook demonstrates a minimal Transformer model implementation using PyTorch.

In [15]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


## Positional Encoding

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model)  <-- changed (was unsqueeze(0))
        self.register_buffer('pe', pe)

    def forward(self, x):  # x: (seq_len, batch, d_model)
        seq_len = x.size(0)
        return x + self.pe[:seq_len, :, :]  # (seq_len, 1, d_model) broadcasts over batch


## Minimal Transformer Model

In [17]:

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src):
        # src expected shape: (seq_len, batch_size)
        emb = self.embedding(src)  # (seq_len, batch_size, d_model)
        # scale embeddings by sqrt(d_model) as in the original transformer
        embedded = emb * math.sqrt(emb.size(-1))
        embedded = self.pos_encoder(embedded)
        output = self.transformer_encoder(embedded)
        output = self.fc_out(output)
        return output


## Example Usage

In [18]:

# Example: random token IDs
vocab_size = 100
src = torch.randint(0, vocab_size, (10, 32))  # sequence length 10, batch size 32

model = TransformerModel(vocab_size)
out = model(src)
print(out.shape)  # [seq_len, batch_size, vocab_size]


torch.Size([10, 32, 100])


In [None]:
import iwslt2017
import inspect
print(inspect.getsourcefile(iwslt2017))


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement iwslt2017 (from versions: none)
ERROR: No matching distribution found for iwslt2017


ModuleNotFoundError: No module named 'iwslt2017'