In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [2]:
class Net(nn.Module):
    def __init__(self, input_dim: int, output_dim: int, num_layers_hidden1: int, hidden_dim1: int, num_layers_hidden2: int, hidden_dim2: int, dropout_rate:float) -> None:
        super(Net, self).__init__()
        layers = []
        layers.append(nn.Linear(input_dim, hidden_dim1))
        layers.append(nn.BatchNorm1d(hidden_dim1))
        layers.append(nn.ReLU())

        for _ in range(num_layers_hidden1 - 1):
            layers.append(nn.Linear(hidden_dim1, hidden_dim1))
            layers.append(nn.BatchNorm1d(hidden_dim1))
            layers.append(nn.ReLU())
        
        layers.append(nn.Linear(hidden_dim1, hidden_dim2))  # Ensure transition
        layers.append(nn.BatchNorm1d(hidden_dim2))
        layers.append(nn.ReLU())

        for _ in range(num_layers_hidden2 - 1):  # Add more layers dynamically
            layers.append(nn.Linear(hidden_dim2, hidden_dim2))
            layers.append(nn.BatchNorm1d(hidden_dim2))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))

        layers.append(nn.Linear(hidden_dim2, output_dim))  # Final output layer
        self.network = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.reshape(-1, 28 * 28)
        return self.network(x)

In [3]:
# Load dataset
transform = transforms.ToTensor()


In [12]:

def objective(trial):
    # Optimize number of hidden layers (1 to 5)
    num_layers_hidden1 = trial.suggest_int("num_layers_hidden1", 0, 5)
    num_layers_hidden2 = trial.suggest_int("num_layers_hidden2", 0, 5)
    hidden_dim1 = trial.suggest_int("hidden_dim1", 32, 64)
    hidden_dim2 = trial.suggest_int("hidden_dim2", 128, 256)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.8)
    epochs = trial.suggest_int("epochs", 10, 100, step=10)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    optimizer = trial.suggest_categorical("optimizer", ["Adam", "SGD"])
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3)
    
    train_dataset = datasets.MNIST(root="./data", train=True, transform=transform, download=True)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Create the model
    model = Net(input_dim=28*28, output_dim=10, num_layers_hidden1=num_layers_hidden1, hidden_dim1=hidden_dim1,
                num_layers_hidden2=num_layers_hidden2, hidden_dim2=hidden_dim2, dropout_rate=dropout_rate)
    criterion = nn.CrossEntropyLoss()
    
    if optimizer == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Training loop (1 epoch)
    model.train()
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch_idx >= epochs:  # Limit iterations for speed
            break

    return total_loss / len(train_loader)


In [15]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=500)

[I 2025-03-27 11:20:12,502] A new study created in memory with name: no-name-ed07a21c-a4a9-4d83-b0b8-08de35aeb8e2
[I 2025-03-27 11:20:13,906] Trial 0 finished with value: 0.05398325472752423 and parameters: {'num_layers_hidden1': 3, 'num_layers_hidden2': 5, 'hidden_dim1': 36, 'hidden_dim2': 138, 'lr': 0.006177528411071901, 'dropout_rate': 0.4195015303493391, 'epochs': 20, 'batch_size': 64, 'optimizer': 'SGD', 'weight_decay': 0.000768823725855512}. Best is trial 0 with value: 0.05398325472752423.
[I 2025-03-27 11:20:19,570] Trial 1 finished with value: 0.635339799333126 and parameters: {'num_layers_hidden1': 2, 'num_layers_hidden2': 2, 'hidden_dim1': 49, 'hidden_dim2': 181, 'lr': 0.0060464901273785815, 'dropout_rate': 0.6718242895544551, 'epochs': 80, 'batch_size': 256, 'optimizer': 'SGD', 'weight_decay': 0.0006245907736908421}. Best is trial 0 with value: 0.05398325472752423.
[I 2025-03-27 11:20:23,453] Trial 2 finished with value: 0.13907874698069558 and parameters: {'num_layers_hidde

In [16]:
# Print the best hyperparameters
print("Best number of hidden layers in group 1:", study.best_params["num_layers_hidden1"])
print("Best number of hidden layers in group 2:", study.best_params["num_layers_hidden2"])
print("Best hidden dimensions: hidden_dim1 =", study.best_params["hidden_dim1"], 
"hidden_dim2 =", study.best_params["hidden_dim2"])
print("Best learning rate:", study.best_params["lr"])
print("Best dropout rate:", study.best_params["dropout_rate"])
print("Best number of epochs:", study.best_params["epochs"])
print("Best batch size:", study.best_params["batch_size"])
print("Best optimizer:", study.best_params["optimizer"])
print("Best weight decay:", study.best_params["weight_decay"])
print("Best loss:", study.best_value)

Best number of hidden layers in group 1: 0
Best number of hidden layers in group 2: 0
Best hidden dimensions: hidden_dim1 = 63 hidden_dim2 = 240
Best learning rate: 0.008986837688628745
Best dropout rate: 0.5742811446212042
Best number of epochs: 10
Best batch size: 32
Best optimizer: Adam
Best weight decay: 0.000844057424084093
Best loss: 0.0065862980524698895
