<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Transformers_for_Sequence_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class TransformerModel(nn.Module):
    """Transformer-encoder classifier over sequences of token ids.

    Token ids are embedded, summed with a learned positional table, run
    through a stack of encoder layers, mean-pooled over the sequence axis and
    projected to ``num_classes`` logits.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        model_dim: Width of the embeddings and of every encoder layer.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Size of the output logit vector.
        max_seq_len: Number of positions in the learned positional table.
        dropout: Dropout probability used inside the encoder layers and on
            the pooled representation. Defaults to 0.1, the value that was
            previously fixed for the pooled representation.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, num_classes, max_seq_len,
                 dropout: float = 0.1):
        super().__init__()
        # Submodules are created in a fixed order so that a seeded RNG yields
        # the same initial weights from run to run.
        self.embedding = nn.Embedding(input_dim, model_dim)
        # Learned (not sinusoidal) positions; leading axis of 1 broadcasts over the batch.
        self.positional_encoding = nn.Parameter(torch.randn(1, max_seq_len, model_dim))
        # NOTE(review): nn.TransformerEncoder appears to clone the layer it is
        # given, so this attribute likely serves only as a template whose own
        # weights are not trained -- confirm before relying on them.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(model_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor indexed as ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.

        Raises:
            RuntimeError: If ``seq_len`` is larger than the positional table.
        """
        seq_len = x.size(1)
        table_len = self.positional_encoding.size(1)
        if seq_len > table_len:
            # Without this guard the addition below fails with an opaque
            # shape-mismatch error (or silently broadcasts when the table has
            # a single position). RuntimeError keeps the exception type that
            # the shape mismatch used to surface.
            raise RuntimeError(
                f"sequence length {seq_len} exceeds max_seq_len {table_len}"
            )
        hidden = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        hidden = self.transformer_encoder(hidden)
        # Mean-pool over the sequence axis to get one vector per example.
        pooled = hidden.mean(dim=1)
        return self.fc(self.dropout(pooled))

def generate_data(num_samples, max_seq_len, vocab_size, num_classes, seed=None):
    """Create a random sequence-classification dataset.

    Args:
        num_samples: Number of sequences (rows) to draw.
        max_seq_len: Length of every generated sequence.
        vocab_size: Exclusive upper bound for token ids; ids are drawn from
            ``[1, vocab_size)``, so id 0 is never produced.
        num_classes: Exclusive upper bound for labels, drawn from
            ``[0, num_classes)``.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the result is
            reproducible and NumPy's global random state is left untouched.
            When ``None`` (the default) the global ``np.random`` state is
            used, exactly as before.

    Returns:
        A ``(sequences, labels)`` tuple of integer arrays with shapes
        ``(num_samples, max_seq_len)`` and ``(num_samples,)``.
    """
    # RandomState exposes the same legacy ``randint`` as the module-level
    # functions, so both branches share one code path below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sequences = rng.randint(1, vocab_size, size=(num_samples, max_seq_len))
    labels = rng.randint(0, num_classes, size=(num_samples,))
    return sequences, labels

# Hyperparameters of the synthetic classification task.
vocab_size = 1000
max_seq_len = 50
num_classes = 10
num_samples = 1000

# Draw the random dataset and serve it in shuffled mini-batches of 32.
sequences, labels = generate_data(
    num_samples=num_samples,
    max_seq_len=max_seq_len,
    vocab_size=vocab_size,
    num_classes=num_classes,
)
dataset = TensorDataset(
    torch.tensor(sequences, dtype=torch.long),
    torch.tensor(labels, dtype=torch.long),
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

# Model, loss and optimizer shared by the training loop.
model = TransformerModel(
    input_dim=vocab_size,
    model_dim=128,
    num_heads=4,
    num_layers=2,
    num_classes=num_classes,
    max_seq_len=max_seq_len,
)
criterion = nn.CrossEntropyLoss()
# Adam with a step size of 1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train_model(num_epochs, *, net=None, loader=None, loss_fn=None, opt=None):
    """Train a model and report the mean batch loss of every epoch.

    Args:
        num_epochs: Number of full passes over the data.
        net: Model to train. Defaults to the module-level ``model``.
        loader: Iterable of ``(inputs, targets)`` batches. Defaults to the
            module-level ``dataloader``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor. Defaults to the module-level ``criterion``.
        opt: Optimizer stepping ``net``'s parameters. Defaults to the
            module-level ``optimizer``.

    Returns:
        A list with one float per epoch: the mean of that epoch's batch
        losses, or ``nan`` for an epoch in which ``loader`` yielded nothing.
    """
    # Fall back to the module-level objects only when no override is given,
    # so the function can also be driven with self-contained arguments.
    net = model if net is None else net
    loader = dataloader if loader is None else loader
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    # Ensure training-mode behaviour (e.g. dropout) even if the model was
    # switched to eval mode earlier.
    net.train()

    history = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for inputs, targets in loader:
            opt.zero_grad()
            loss = loss_fn(net(inputs), targets)
            loss.backward()
            opt.step()
            running_loss += loss.item()
            num_batches += 1
        # Average over the epoch instead of reporting only the last batch,
        # and avoid referencing an unbound loss when there were no batches.
        epoch_loss = running_loss / num_batches if num_batches else float("nan")
        history.append(epoch_loss)
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")
    return history

# Kick off training: 10 full passes over the dataloader, printing one loss line per epoch.
train_model(num_epochs=10)