In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import requests
import time
import math

# Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail fast on 4xx/5xx instead of silently training on an HTTP error page.
response.raise_for_status()
text = response.text

# Character mapping to integers
# Sorting makes the char -> index assignment deterministic across runs.
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))
vocab_size = len(chars)

# Dataset class
class CharDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target.

    ``sequences`` and ``targets`` are stored as given and indexed in
    parallel; no copying or validation is performed.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # The size is taken from the inputs only; targets are not consulted.
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, index):
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

# Transformer Model class
class TransformerModel(nn.Module):
    """Next-character classifier built on ``nn.Transformer``.

    Args:
        vocab_size: Number of distinct tokens; sizes the embedding table and
            the output layer.
        hidden_dim: Embedding width, passed to ``nn.Transformer`` as its
            model dimension.
        num_layers: Used for both the encoder and the decoder layer count.
        nhead: Number of attention heads.
        dropout: Dropout probability forwarded to ``nn.Transformer``.

    NOTE(review): no positional encoding and no attention masks are applied,
    and the same embedded sequence is fed as both source and target.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers, nhead, dropout=0.1):
        super().__init__()
        # Stored on the instance so forward() does not depend on a
        # module-level variable that merely happens to share the name.
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.transformer = nn.Transformer(
            hidden_dim,
            nhead,
            num_layers,
            num_layers,
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for the next token.

        ``x`` holds token indices laid out as (batch, seq_len).
        """
        x = self.embedding(x) * math.sqrt(self.hidden_dim)
        x = x.permute(1, 0, 2)  # Transformer expects seq_len, batch, input_dim
        x = self.transformer(x, x)
        x = x.permute(1, 0, 2)  # Back to batch, seq_len, input_dim
        # Only the final position is projected to vocabulary logits.
        x = self.fc_out(x[:, -1, :])
        return x

# Function to train and validate the model
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns a tuple ``(summed per-batch loss, correct predictions, samples seen)``.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        for sequences, targets in loader:
            sequences, targets = sequences.to(device), targets.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, targets)
            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
                optimizer.step()
            loss_sum += loss.item()
            predicted = outputs.argmax(dim=1)
            correct += (predicted == targets).sum().item()
            seen += targets.size(0)
    return loss_sum, correct, seen


def train_and_validate(model, train_loader, test_loader, criterion, optimizer, scheduler, epochs=5):
    """Train ``model`` for ``epochs`` epochs, validating and printing metrics after each.

    Args:
        model: Module mapping a batch of inputs to per-class logits.
        train_loader: Iterable of ``(sequences, targets)`` training batches.
        test_loader: Iterable of ``(sequences, targets)`` validation batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per epoch.
        epochs: Number of epochs to run.

    Returns:
        None. Results are reported through ``print``.
    """
    # Batches are moved to wherever the model lives, so the function does not
    # rely on a module-level ``device`` variable being defined.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    start_time = time.time()
    for epoch in range(epochs):
        train_loss, train_correct, train_samples = _run_epoch(
            model, train_loader, criterion, device, optimizer
        )
        scheduler.step()  # Learning rate scheduler step
        val_loss, val_correct, val_samples = _run_epoch(
            model, test_loader, criterion, device
        )

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_accuracy = 100 * train_correct / max(train_samples, 1)
        val_accuracy = 100 * val_correct / max(val_samples, 1)

        # Elapsed time is cumulative since the first epoch started and
        # includes the validation passes.
        training_time = time.time() - start_time
        print(f'Total Training Time: {training_time:.2f} seconds')
        print(f'Epoch {epoch+1}, Train Loss: {train_loss / max(len(train_loader), 1)}, '
              f'Train Accuracy: {train_accuracy:.2f}%, '
              f'Val Loss: {val_loss / max(len(test_loader), 1)}, '
              f'Val Accuracy: {val_accuracy:.2f}%')

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Return the single most likely character to follow ``initial_str``.

    Args:
        model: Module mapping a (1, seq_len) tensor of indices to (1, vocab) logits.
        char_to_ix: Mapping from character to integer index.
        ix_to_char: Mapping from integer index back to character.
        initial_str: Prompt text; only its last ``max_length`` characters are used.
        max_length: Maximum number of trailing prompt characters fed to the model.

    Returns:
        The character whose logit is highest.

    Raises:
        ValueError: If ``initial_str`` is empty.
    """
    if not initial_str:
        raise ValueError("initial_str must contain at least one character")

    # Run on the model's own device instead of relying on a module-level
    # ``device`` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        # Characters missing from the vocabulary fall back to index 0.
        sequence = [char_to_ix.get(c, 0) for c in initial_str[-max_length:]]
        sequence = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(target_device)
        prediction = model(sequence)
        predicted_index = torch.argmax(prediction, dim=1).item()
        return ix_to_char[predicted_index]

# Training configurations
# Hyperparameter grid: every sequence length is combined with every
# hidden size and layer count.
sequence_lengths = [20, 30, 50]
hidden_dims = [64, 128]
num_layers_list = [1, 2]
nheads = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Encode the corpus once; every sequence length reuses this index tensor
# instead of re-mapping each character for each sliding window.
encoded_text = torch.tensor([char_to_int[ch] for ch in text], dtype=torch.long)

for sequence_length in sequence_lengths:
    # unfold yields every length-`sequence_length` window with stride 1.
    # The final window has no following character to predict, so it is dropped.
    windows = encoded_text.unfold(0, sequence_length, 1)[:-1]
    sequences = windows.contiguous().to(device)
    # The target for window i is the character at position i + sequence_length.
    targets = encoded_text[sequence_length:].to(device)

    dataset = CharDataset(sequences, targets)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    for hidden_dim in hidden_dims:
        for num_layers in num_layers_list:
            print(f"Training with sequence_length={sequence_length}, hidden_dim={hidden_dim}, num_layers={num_layers}, nheads={nheads}")
            model = TransformerModel(vocab_size, hidden_dim, num_layers, nheads).to(device)
            model_complexity = sum(p.numel() for p in model.parameters() if p.requires_grad)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)  # Adjust learning rate
            train_and_validate(model, train_loader, test_loader, criterion, optimizer, scheduler, epochs=5)
            print(f'Model Complexity (Number of Trainable Parameters): {model_complexity}')
            # Prediction example after training
            test_str = "We are accounted poor citiz"
            predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str, sequence_length)
            print(f"Predicted next character for sequence length {sequence_length}: '{predicted_char}'")

Using device: cuda
Training with sequence_length=20, hidden_dim=64, num_layers=1, nheads=8




Total Training Time: 341.65 seconds
Epoch 1, Train Loss: 2.5167428719128813, Train Accuracy: 26.32%, Val Loss: 2.464784526989095, Val Accuracy: 26.80%
Total Training Time: 714.30 seconds
Epoch 2, Train Loss: 2.478046365462701, Train Accuracy: 26.87%, Val Loss: 2.480775039222004, Val Accuracy: 27.05%
Total Training Time: 1080.67 seconds
Epoch 3, Train Loss: 2.4789787109939105, Train Accuracy: 26.87%, Val Loss: 2.4566844531467984, Val Accuracy: 27.11%
Total Training Time: 2623.89 seconds
Epoch 4, Train Loss: 2.4702772465302076, Train Accuracy: 26.97%, Val Loss: 2.445901823399468, Val Accuracy: 27.46%
Total Training Time: 2961.44 seconds
Epoch 5, Train Loss: 2.461349281065093, Train Accuracy: 27.15%, Val Loss: 2.4327595496546994, Val Accuracy: 27.65%
Model Complexity (Number of Trainable Parameters): 125377
Predicted next character for sequence length 20: 'e'
Training with sequence_length=20, hidden_dim=64, num_layers=2, nheads=8




Total Training Time: 702.10 seconds
Epoch 1, Train Loss: 3.321979800107762, Train Accuracy: 15.22%, Val Loss: 3.3169700111798424, Val Accuracy: 15.26%
Total Training Time: 1405.53 seconds
Epoch 2, Train Loss: 3.3172908794654217, Train Accuracy: 15.23%, Val Loss: 3.315158271680122, Val Accuracy: 15.26%
Total Training Time: 12278.03 seconds
Epoch 3, Train Loss: 3.3164612290097324, Train Accuracy: 15.23%, Val Loss: 3.3149538918329395, Val Accuracy: 15.26%
Total Training Time: 13104.38 seconds
Epoch 4, Train Loss: 3.316258375971603, Train Accuracy: 15.23%, Val Loss: 3.31613293312366, Val Accuracy: 15.26%
Total Training Time: 17933.71 seconds
Epoch 5, Train Loss: 3.316087072701271, Train Accuracy: 15.23%, Val Loss: 3.3140558304625274, Val Accuracy: 15.26%
Model Complexity (Number of Trainable Parameters): 242113
Predicted next character for sequence length 20: ' '
Training with sequence_length=20, hidden_dim=128, num_layers=1, nheads=8
Total Training Time: 247.98 seconds
Epoch 1, Train Loss