<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Transformers_in_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math

# Define Transformer Model
class Transformer(nn.Module):
    """Transformer-encoder sequence classifier.

    Token ids are embedded, summed with fixed sinusoidal positional
    encodings, passed through a stack of ``nn.TransformerEncoderLayer``
    blocks (``batch_first=True``), mean-pooled over the sequence dimension
    and projected to ``output_dim`` values by a linear layer.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        emb_dim: Embedding width, also used as the encoder ``d_model``.
        n_heads: Number of attention heads in each encoder layer.
        n_layers: Number of stacked encoder layers.
        output_dim: Output size of the final linear layer.
        max_seq_len: Number of positions covered by the positional table.
    """

    def __init__(
        self,
        input_dim: int,
        emb_dim: int,
        n_heads: int,
        n_layers: int,
        output_dim: int,
        max_seq_len: int = 5000,
    ) -> None:
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)

        # Register the table as a buffer so it follows ``model.to(...)`` /
        # ``model.half()`` instead of being copied to the input's device on
        # every forward pass. ``persistent=False`` keeps it out of the
        # state_dict, so checkpoint keys are the same as before.
        self.register_buffer(
            "positional_encoding",
            self.create_positional_encoding(emb_dim, max_seq_len),
            persistent=False,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=n_heads, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=n_layers
        )

        self.fc_out = nn.Linear(emb_dim, output_dim)

    def create_positional_encoding(self, emb_dim: int, max_seq_len: int) -> torch.Tensor:
        """Build the sinusoidal positional-encoding table.

        Even columns hold sines and odd columns hold cosines of
        ``position * 10000 ** (-2i / emb_dim)``.

        Args:
            emb_dim: Width of each encoding vector (may be odd).
            max_seq_len: Number of positions to generate.

        Returns:
            Float tensor of shape ``(1, max_seq_len, emb_dim)``.
        """
        pe = torch.zeros(max_seq_len, emb_dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim)
        )
        angles = position * div_term  # (max_seq_len, ceil(emb_dim / 2))

        pe[:, 0::2] = torch.sin(angles)
        # With an odd emb_dim there is one fewer cosine column than sine
        # columns, so trim the angles to the number of odd-indexed slots.
        pe[:, 1::2] = torch.cos(angles[:, : emb_dim // 2])
        return pe.unsqueeze(0)  # Add batch dimension

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the model output for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, output_dim)``.

        Raises:
            RuntimeError: If ``seq_len`` exceeds the positional table length.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            # Previously this surfaced as an opaque broadcasting error; the
            # exception type is kept, only the message is made explicit.
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds max_seq_len {max_len}"
            )

        # Embed input and add positional encoding
        embedded = self.embedding(x)  # (batch_size, seq_len, emb_dim)
        x = embedded + self.positional_encoding[:, :seq_len, :]

        # Pass through Transformer encoder
        x = self.transformer_encoder(x)

        # Pool over sequence (mean pooling) and pass through final layer
        return self.fc_out(x.mean(dim=1))  # (batch_size, output_dim)

# Hyperparameters for the demo run
input_dim = 1000  # Size of the token vocabulary
emb_dim = 256  # Width of each token embedding
n_heads = 8  # Attention heads per encoder layer
n_layers = 6  # Encoder layers in the stack
output_dim = 10  # Number of target classes
max_seq_len = 500  # Positions covered by the positional encoding

# Build the model, move it to the GPU when one is available, then create
# the optimizer and the loss function
model = Transformer(
    input_dim=input_dim,
    emb_dim=emb_dim,
    n_heads=n_heads,
    n_layers=n_layers,
    output_dim=output_dim,
    max_seq_len=max_seq_len,
)
# nn.Module.to moves the parameters in place and returns the same module
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Example DataLoader
from torch.utils.data import DataLoader, TensorDataset

# Synthetic classification data: random token ids paired with random class labels
batch_size = 32
sequence_length = 50
num_samples = 1000

# Token ids in [0, input_dim), shape (num_samples, sequence_length)
inputs = torch.randint(low=0, high=input_dim, size=(num_samples, sequence_length))
# Class labels in [0, output_dim), shape (num_samples,)
targets = torch.randint(low=0, high=output_dim, size=(num_samples,))

# Wrap the tensors so they can be iterated in shuffled mini-batches
dataset = TensorDataset(inputs, targets)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Training Loop
epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    # Use batch-local names so the module-level `inputs` / `targets` (the
    # full dataset tensors) are not rebound to the last mini-batch.
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = loss_fn(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the mean of the per-batch losses for this epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}")