<a href="https://colab.research.google.com/github/Papa-Panda/random_thoughts/blob/main/Tranformer_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)`` with
    ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension. May be odd.
        max_len: Number of positions to precompute. Inputs whose dimension 1
            is longer than this are not supported.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # One column of angles per sine slot: (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to fit instead of failing on shape.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module across .to()/state_dict but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions, after dropout.

        Dimension 1 of ``x`` is treated as the position axis and the last
        dimension must equal ``d_model``; the table broadcasts over dimension 0.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
# Define the Transformer model
class Transformer(nn.Module):
    """Encoder-only token model producing per-position vocabulary logits.

    Pipeline: token embedding -> positional encoding -> stacked
    ``nn.TransformerEncoder`` layers -> linear projection to ``vocab_size``.

    Args:
        vocab_size: Number of distinct token ids (also the logit width).
        embedding_dim: Width of the token embeddings / encoder model dimension.
        hidden_dim: Width of the feed-forward sublayer inside each encoder layer.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per encoder layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.pos_encoding = PositionalEncoding(embedding_dim)
        layer_template = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
        )
        self.encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)
        # Despite the attribute name, this is only the output projection to logits.
        self.decoder = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        embedded = self.pos_encoding(self.embedding(x))
        # The encoder layer is built with its default (seq, batch, feature)
        # layout, so swap the first two axes going in and coming out.
        encoded = self.encoder(embedded.transpose(0, 1)).transpose(0, 1)
        return self.decoder(encoded)

In [None]:
# Toy dataset: three sequences of three token ids covering 1..9.
input_seq = torch.arange(1, 10).reshape(3, 3)
# Each target token is the corresponding input token shifted by 3.
output_seq = input_seq + 3
# Alternative targets kept for reference; their ids go up to 36, which exceeds
# the 13-class vocabulary the model below is built with.
# output_seq = torch.tensor([[12, 15, 18], [21, 24, 27], [30, 33, 36]])

In [None]:
# Initialize the model and optimizer
# Seed the global RNG first so weight initialisation and dropout masks are the
# same on every Restart & Run All; without it each run trains a different model.
torch.manual_seed(0)

model = Transformer(vocab_size=13, embedding_dim=16, hidden_dim=32, num_layers=2, num_heads=2)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 5000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    output = model(input_seq)  # logits, one row of class scores per token
    # Flatten to (tokens, classes) vs. (tokens,). The class count is read from
    # the logits so it cannot drift from the vocab_size passed to the model.
    loss = F.cross_entropy(output.reshape(-1, output.size(-1)), output_seq.reshape(-1))
    loss.backward()
    optimizer.step()

    # Report every 100th epoch to keep the cell output short.
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, loss: {loss.item():.4f}")

Epoch 0, loss: 2.9981
Epoch 100, loss: 0.7456
Epoch 200, loss: 0.3032
Epoch 300, loss: 0.1172
Epoch 400, loss: 0.0720
Epoch 500, loss: 0.0581
Epoch 600, loss: 0.0250
Epoch 700, loss: 0.0193
Epoch 800, loss: 0.0146
Epoch 900, loss: 0.0103
Epoch 1000, loss: 0.0086
Epoch 1100, loss: 0.0134
Epoch 1200, loss: 0.0083
Epoch 1300, loss: 0.0056
Epoch 1400, loss: 0.0050
Epoch 1500, loss: 0.0039
Epoch 1600, loss: 0.0032
Epoch 1700, loss: 0.0041
Epoch 1800, loss: 0.0027
Epoch 1900, loss: 0.0038
Epoch 2000, loss: 0.0030
Epoch 2100, loss: 0.0023
Epoch 2200, loss: 0.0016
Epoch 2300, loss: 0.0012
Epoch 2400, loss: 0.0013
Epoch 2500, loss: 0.0010
Epoch 2600, loss: 0.0011
Epoch 2700, loss: 0.0017
Epoch 2800, loss: 0.0009
Epoch 2900, loss: 0.0009
Epoch 3000, loss: 0.0008
Epoch 3100, loss: 0.0009
Epoch 3200, loss: 0.0007
Epoch 3300, loss: 0.0006
Epoch 3400, loss: 0.0010
Epoch 3500, loss: 0.0005
Epoch 3600, loss: 0.0005
Epoch 3700, loss: 0.0005
Epoch 3800, loss: 0.0016
Epoch 3900, loss: 0.0004
Epoch 4000, 

In [None]:
# Evaluate the model
# Switch to inference behaviour (dropout off) before running the forward pass.
model.eval()
test_input = torch.tensor([[6, 9, 8]])
# No gradients are needed for a prediction; take the highest-scoring class
# at every position.
with torch.no_grad():
    test_output = model(test_input).argmax(dim=-1)
print(f"Input: {test_input.tolist()}, Output: {test_output.tolist()}")

Input: [[6, 9, 8]], Output: [[9, 12, 11]]
