In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Dummy dataset class
class MyDataset(Dataset):
    def __init__(self, num_samples, max_length=10, vocab_size=100):
        self.num_samples = num_samples
        self.max_length = max_length
        self.vocab_size = vocab_size

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        src = torch.randint(1, self.vocab_size, (self.max_length,))
        tgt = torch.randint(1, self.vocab_size, (self.max_length,))
        return src, tgt

# Dummy model
class Transformer(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(Transformer, self).__init__()

        self.embedding_encoder = nn.Embedding(input_vocab_size, d_model)
        self.embedding_decoder = nn.Embedding(output_vocab_size, d_model)
        self.pos_encoder = nn.Embedding(100, d_model)

        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)

        self.fc = nn.Linear(d_model, output_vocab_size)

    def forward(self, src, tgt):
        src = self.embedding_encoder(src)
        src = src + self.pos_encoder(torch.arange(src.size(1), device=src.device).unsqueeze(0))
        tgt = self.embedding_decoder(tgt)
        tgt = tgt + self.pos_encoder(torch.arange(tgt.size(1), device=tgt.device).unsqueeze(0))

        output = self.transformer(src.transpose(0, 1), tgt.transpose(0, 1))

        output = self.fc(output)

        return output.transpose(0, 1)

# Parameters
num_samples = 1000
max_length = 10
vocab_size = 100
batch_size = 32
num_epochs = 10

# Create dataset and dataloader
dataset = MyDataset(num_samples, max_length, vocab_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model initialization
model = Transformer(vocab_size, vocab_size, d_model=128, nhead=4, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=256, dropout=0.1)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    for batch_idx, (src_data, tgt_data) in enumerate(dataloader):
        optimizer.zero_grad()
        output = model(src_data, tgt_data[:, :-1])  # Target does not include the last token
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_data[:, 1:].reshape(-1))  # Shift target to exclude the start token
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 4.6558
Epoch [2/10], Loss: 4.6109
Epoch [3/10], Loss: 3.4258
Epoch [4/10], Loss: 1.1321
Epoch [5/10], Loss: 0.7108
Epoch [6/10], Loss: 0.5859
Epoch [7/10], Loss: 0.5503
Epoch [8/10], Loss: 0.5461
Epoch [9/10], Loss: 0.5745
Epoch [10/10], Loss: 0.5643
