In [1]:
import torch
import torchvision
import torchvision.transforms.v2 as T

toTensor = T.Compose([T.ToImage(), T.ToDtype(torch.float32, scale=True)])
train_and_valid_set = torchvision.datasets.CIFAR10(
    root="datasets", train=True, download=True, transform=toTensor)
test_set = torchvision.datasets.CIFAR10(
    root="datasets", train=False, download=True, transform=toTensor)

100%|██████████| 170M/170M [00:03<00:00, 53.8MB/s]


In [2]:
torch.manual_seed(42)
train_set, valid_set = torch.utils.data.random_split(
    train_and_valid_set, [45_000, 5_000]
)

In [3]:
from torch.utils.data import DataLoader

batch_size = 128
train_loader = DataLoader(train_set, batch_size = batch_size, shuffle=True )
valid_loader = DataLoader(valid_set, batch_size = batch_size, shuffle=False )
test_loader = DataLoader(test_set, batch_size = batch_size, shuffle=False )

In [4]:
import torch.nn as nn

def use_he_init(module):
  if isinstance(module, nn.Linear):
    nn.init.kaiming_uniform_(module.weight)
    nn.init.zeros_(module.bias)

In [5]:
def deep_model(n_hidden, n_neurons, n_inputs, n_outputs):

  layers = [nn.Flatten(), nn.Linear(n_inputs, n_neurons), nn.SiLU() ]

  for _ in range(n_hidden - 1):
    layers += [nn.Linear(n_neurons, n_neurons), nn.SiLU()]

  layers += [nn.Linear(n_neurons, n_outputs)]
  model = torch.nn.Sequential(*layers)
  model.apply(use_he_init)
  return model

# Creating deep neural network using function

In [6]:
torch.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

model = deep_model(
    n_hidden = 20, n_neurons = 100, n_inputs = 3 * 32 * 32, n_outputs = 10
).to(device)

In [7]:
def evaluate_tm(model, data_loader, metric):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()

# Evaluation Function

In [8]:
!pip install optuna
!pip install torchmetrics

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0
Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected

In [9]:
import time
import optuna

def train_with_early_stopping(model, optimizer, loss_fn, metric, train_loader, valid_loader,
                              n_epochs, device, patience=10, checkpoint_path=None, scheduler=None, trial=None):

  checkpoint_path = checkpoint_path or "my_checkpoint.pt"
  history = {"train_losses": [], "train_metrics":[], "valid_metrics":[]}
  best_metric = 0.0
  patience_counter = 0
  file_saved = False

  for epoch in range(n_epochs):

    total_loss = 0.0
    metric.reset()
    model.train()
    t0 = time.time()

    for x_batch, y_batch in train_loader:
      x_batch, y_batch = x_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      y_pred = model(x_batch)
      loss = loss_fn(y_pred, y_batch)
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
      metric.update(y_pred, y_batch)

    train_metric = metric.compute()
    if isinstance(train_metric, torch.Tensor):
        train_metric = train_metric.item()

    valid_metric = evaluate_tm(model, valid_loader, metric)

    if trial is not None:
        trial.report(valid_metric, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    if valid_metric > best_metric:
      best_metric = valid_metric
      best = " (best)"
      patience_counter = 0

      if trial is None:
        torch.save(model.state_dict(), checkpoint_path)
        file_saved = True
    else:
      patience_counter += 1
      best = ""

    t1 = time.time()
    history["train_losses"].append(total_loss / len(train_loader))
    history["train_metrics"].append(train_metric)
    history["valid_metrics"].append(valid_metric)

    print(f"Epoch {epoch + 1}/{n_epochs}, "
          f"train loss: {history['train_losses'][-1]:.4f}, "
          f"train metric: {history['train_metrics'][-1]:.4f}, "
          f"valid metric: {history['valid_metrics'][-1]:.4f}{best}"
          f" in {t1 - t0:.1f}s"
    )

    if scheduler is not None:
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(valid_metric)
        else:
            scheduler.step()

    if patience_counter >= patience:
        print("Early stopping!")
        break

  if trial is None and file_saved:
      model.load_state_dict(torch.load(checkpoint_path))

  return best_metric


In [10]:
import torch.optim as optim
import torchmetrics

def objective(trial):

    # Hyperparams
    n_hidden = trial.suggest_int("n_hidden", 1, 5)
    n_neurons = trial.suggest_int("n_neurons", 256, 1024)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["AdamW", "NAdam"])

    model = deep_model(n_hidden, n_neurons, 3072, 10).to(device)

    optimizer_class = getattr(optim, optimizer_name)
    optimizer = optimizer_class(model.parameters(), lr=lr)

    criterion = nn.CrossEntropyLoss()
    metric = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)

    best_accuracy = train_with_early_stopping(
        model=model,
        optimizer=optimizer,
        loss_fn=criterion,
        metric=metric,
        train_loader=train_loader,
        valid_loader=valid_loader,
        n_epochs=10,
        patience=3,
        device=device,
        trial=trial
    )

    return best_accuracy

In [11]:
import optuna

pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)

study = optuna.create_study(direction="maximize", pruner=pruner)

print("--- Starting Hyperparameter Search ---")

study.optimize(objective, n_trials=20)

print("\n --- Optimization Complete ---")
print(f"Best Accuracy Found: {study.best_trial.value:.4f}")
print("Best Hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

[I 2026-02-04 06:25:58,241] A new study created in memory with name: no-name-ff176aae-809b-43e2-ac77-c2339f796369


--- Starting Hyperparameter Search ---
Epoch 1/10, train loss: 1.8636, train metric: 0.3432, valid metric: 0.3916 (best) in 28.7s
Epoch 2/10, train loss: 1.6471, train metric: 0.4132, valid metric: 0.4018 (best) in 24.0s
Epoch 3/10, train loss: 1.5687, train metric: 0.4440, valid metric: 0.4732 (best) in 24.7s
Epoch 4/10, train loss: 1.4972, train metric: 0.4742, valid metric: 0.4742 (best) in 24.3s
Epoch 5/10, train loss: 1.4525, train metric: 0.4834, valid metric: 0.4836 (best) in 24.3s
Epoch 6/10, train loss: 1.4147, train metric: 0.5009, valid metric: 0.4894 (best) in 24.3s
Epoch 7/10, train loss: 1.3846, train metric: 0.5106, valid metric: 0.5006 (best) in 24.2s
Epoch 8/10, train loss: 1.3487, train metric: 0.5235, valid metric: 0.4978 in 24.6s
Epoch 9/10, train loss: 1.3339, train metric: 0.5296, valid metric: 0.4942 in 24.3s


[I 2026-02-04 06:30:05,906] Trial 0 finished with value: 0.5148000121116638 and parameters: {'n_hidden': 1, 'n_neurons': 759, 'lr': 0.000474962015711461, 'optimizer': 'AdamW'}. Best is trial 0 with value: 0.5148000121116638.


Epoch 10/10, train loss: 1.3059, train metric: 0.5394, valid metric: 0.5148 (best) in 24.1s
Epoch 1/10, train loss: 2.7742, train metric: 0.2523, valid metric: 0.2860 (best) in 24.3s
Epoch 2/10, train loss: 2.6469, train metric: 0.2496, valid metric: 0.1932 in 26.2s
Epoch 3/10, train loss: 1.9874, train metric: 0.2792, valid metric: 0.2976 (best) in 27.5s
Epoch 4/10, train loss: 1.8154, train metric: 0.3436, valid metric: 0.3608 (best) in 27.8s
Epoch 5/10, train loss: 1.7595, train metric: 0.3641, valid metric: 0.3620 (best) in 28.2s
Epoch 6/10, train loss: 1.7385, train metric: 0.3707, valid metric: 0.3846 (best) in 29.6s
Epoch 7/10, train loss: 1.6966, train metric: 0.3853, valid metric: 0.3892 (best) in 29.8s
Epoch 8/10, train loss: 1.6613, train metric: 0.3973, valid metric: 0.3728 in 32.3s
Epoch 9/10, train loss: 1.6472, train metric: 0.4023, valid metric: 0.3940 (best) in 32.9s


[I 2026-02-04 06:34:58,530] Trial 1 finished with value: 0.4050000011920929 and parameters: {'n_hidden': 4, 'n_neurons': 458, 'lr': 0.003595689013615736, 'optimizer': 'NAdam'}. Best is trial 0 with value: 0.5148000121116638.


Epoch 10/10, train loss: 1.6356, train metric: 0.4086, valid metric: 0.4050 (best) in 34.0s
Epoch 1/10, train loss: 1.8898, train metric: 0.3278, valid metric: 0.3536 (best) in 19.4s
Epoch 2/10, train loss: 1.6662, train metric: 0.4059, valid metric: 0.4142 (best) in 18.7s
Epoch 3/10, train loss: 1.5686, train metric: 0.4438, valid metric: 0.4244 (best) in 19.8s
Epoch 4/10, train loss: 1.4981, train metric: 0.4714, valid metric: 0.4614 (best) in 18.8s
Epoch 5/10, train loss: 1.4483, train metric: 0.4878, valid metric: 0.4736 (best) in 19.7s
Epoch 6/10, train loss: 1.4043, train metric: 0.5054, valid metric: 0.4704 in 19.2s
Epoch 7/10, train loss: 1.3580, train metric: 0.5226, valid metric: 0.4838 (best) in 19.5s
Epoch 8/10, train loss: 1.3247, train metric: 0.5332, valid metric: 0.4452 in 19.8s
Epoch 9/10, train loss: 1.2925, train metric: 0.5433, valid metric: 0.4406 in 18.7s


[I 2026-02-04 06:38:11,711] Trial 2 finished with value: 0.510200023651123 and parameters: {'n_hidden': 1, 'n_neurons': 415, 'lr': 0.0006266794874801956, 'optimizer': 'NAdam'}. Best is trial 0 with value: 0.5148000121116638.


Epoch 10/10, train loss: 1.2568, train metric: 0.5553, valid metric: 0.5102 (best) in 19.7s
Epoch 1/10, train loss: 1.8716, train metric: 0.3352, valid metric: 0.3682 (best) in 23.9s
Epoch 2/10, train loss: 1.6672, train metric: 0.4116, valid metric: 0.4348 (best) in 24.1s
Epoch 3/10, train loss: 1.5776, train metric: 0.4457, valid metric: 0.4440 (best) in 24.1s
Epoch 4/10, train loss: 1.5156, train metric: 0.4708, valid metric: 0.4462 (best) in 23.7s
Epoch 5/10, train loss: 1.4664, train metric: 0.4839, valid metric: 0.4712 (best) in 23.9s
Epoch 6/10, train loss: 1.4171, train metric: 0.5038, valid metric: 0.4852 (best) in 24.0s
Epoch 7/10, train loss: 1.3782, train metric: 0.5202, valid metric: 0.4676 in 24.1s
Epoch 8/10, train loss: 1.3408, train metric: 0.5318, valid metric: 0.5074 (best) in 24.2s
Epoch 9/10, train loss: 1.3057, train metric: 0.5458, valid metric: 0.5042 in 23.8s


[I 2026-02-04 06:42:11,432] Trial 3 finished with value: 0.5073999762535095 and parameters: {'n_hidden': 1, 'n_neurons': 724, 'lr': 0.00019650434701492584, 'optimizer': 'NAdam'}. Best is trial 0 with value: 0.5148000121116638.


Epoch 10/10, train loss: 1.2782, train metric: 0.5541, valid metric: 0.4686 in 23.9s
Epoch 1/10, train loss: 1.8997, train metric: 0.3253, valid metric: 0.3654 (best) in 21.7s
Epoch 2/10, train loss: 1.7073, train metric: 0.3946, valid metric: 0.4114 (best) in 21.0s
Epoch 3/10, train loss: 1.6237, train metric: 0.4267, valid metric: 0.4228 (best) in 21.7s
Epoch 4/10, train loss: 1.5754, train metric: 0.4439, valid metric: 0.4424 (best) in 21.9s
Epoch 5/10, train loss: 1.5395, train metric: 0.4577, valid metric: 0.4396 in 22.3s
Epoch 6/10, train loss: 1.5016, train metric: 0.4716, valid metric: 0.4668 (best) in 22.1s
Epoch 7/10, train loss: 1.4738, train metric: 0.4832, valid metric: 0.4654 in 22.0s
Epoch 8/10, train loss: 1.4499, train metric: 0.4903, valid metric: 0.4692 (best) in 22.7s
Epoch 9/10, train loss: 1.4216, train metric: 0.4996, valid metric: 0.4804 (best) in 22.7s


[I 2026-02-04 06:45:52,253] Trial 4 finished with value: 0.4828000068664551 and parameters: {'n_hidden': 4, 'n_neurons': 381, 'lr': 3.894889966929681e-05, 'optimizer': 'AdamW'}. Best is trial 0 with value: 0.5148000121116638.


Epoch 10/10, train loss: 1.3972, train metric: 0.5092, valid metric: 0.4828 (best) in 22.6s
Epoch 1/10, train loss: 1.9779, train metric: 0.3173, valid metric: 0.3656 (best) in 19.1s
Epoch 2/10, train loss: 1.7109, train metric: 0.3956, valid metric: 0.4106 (best) in 20.1s


[I 2026-02-04 06:46:51,372] Trial 5 pruned. 


Epoch 1/10, train loss: 1.8714, train metric: 0.3393, valid metric: 0.3888 (best) in 19.0s
Epoch 2/10, train loss: 1.6913, train metric: 0.4054, valid metric: 0.4020 (best) in 19.4s
Epoch 3/10, train loss: 1.6141, train metric: 0.4360, valid metric: 0.4412 (best) in 18.5s
Epoch 4/10, train loss: 1.5588, train metric: 0.4520, valid metric: 0.4574 (best) in 19.4s


[I 2026-02-04 06:48:26,876] Trial 6 pruned. 


Epoch 1/10, train loss: 1.9010, train metric: 0.3270, valid metric: 0.3456 (best) in 23.6s
Epoch 2/10, train loss: 1.7496, train metric: 0.3901, valid metric: 0.3916 (best) in 23.7s


[I 2026-02-04 06:49:37,764] Trial 7 pruned. 


Epoch 1/10, train loss: 1.8580, train metric: 0.3437, valid metric: 0.3962 (best) in 22.4s
Epoch 2/10, train loss: 1.6506, train metric: 0.4186, valid metric: 0.4252 (best) in 22.2s
Epoch 3/10, train loss: 1.5733, train metric: 0.4441, valid metric: 0.4466 (best) in 21.8s
Epoch 4/10, train loss: 1.5130, train metric: 0.4706, valid metric: 0.4682 (best) in 22.3s


[I 2026-02-04 06:51:28,564] Trial 8 pruned. 


Epoch 1/10, train loss: 1.9364, train metric: 0.3290, valid metric: 0.3606 (best) in 21.3s
Epoch 2/10, train loss: 1.6541, train metric: 0.4097, valid metric: 0.4168 (best) in 21.2s
Epoch 3/10, train loss: 1.5639, train metric: 0.4404, valid metric: 0.4436 (best) in 21.5s
Epoch 4/10, train loss: 1.5100, train metric: 0.4617, valid metric: 0.4630 (best) in 21.4s


[I 2026-02-04 06:53:14,773] Trial 9 pruned. 


Epoch 1/10, train loss: 1.9186, train metric: 0.3229, valid metric: 0.3842 (best) in 55.1s
Epoch 2/10, train loss: 1.7220, train metric: 0.3981, valid metric: 0.4160 (best) in 54.6s


[I 2026-02-04 06:55:59,930] Trial 10 pruned. 


Epoch 1/10, train loss: 1.9112, train metric: 0.3187, valid metric: 0.3626 (best) in 31.8s
Epoch 2/10, train loss: 1.6528, train metric: 0.4065, valid metric: 0.3980 (best) in 32.6s
Epoch 3/10, train loss: 1.5221, train metric: 0.4540, valid metric: 0.4610 (best) in 32.2s
Epoch 4/10, train loss: 1.4345, train metric: 0.4870, valid metric: 0.4350 in 32.2s
Epoch 5/10, train loss: 1.3592, train metric: 0.5136, valid metric: 0.4952 (best) in 32.3s
Epoch 6/10, train loss: 1.2983, train metric: 0.5360, valid metric: 0.4638 in 31.6s
Epoch 7/10, train loss: 1.2391, train metric: 0.5585, valid metric: 0.5190 (best) in 32.4s
Epoch 8/10, train loss: 1.1722, train metric: 0.5799, valid metric: 0.5090 in 31.8s
Epoch 9/10, train loss: 1.1474, train metric: 0.5965, valid metric: 0.2166 in 32.4s


[I 2026-02-04 07:01:22,036] Trial 11 finished with value: 0.5189999938011169 and parameters: {'n_hidden': 2, 'n_neurons': 885, 'lr': 0.000614336171320846, 'optimizer': 'NAdam'}. Best is trial 11 with value: 0.5189999938011169.


Epoch 10/10, train loss: 1.1850, train metric: 0.5800, valid metric: 0.5172 in 32.7s
Early stopping!
Epoch 1/10, train loss: 13.1068, train metric: 0.1097, valid metric: 0.1008 (best) in 32.8s
Epoch 2/10, train loss: 206.1664, train metric: 0.1073, valid metric: 0.1166 (best) in 50.2s


[I 2026-02-04 07:03:36,883] Trial 12 pruned. 


Epoch 1/10, train loss: 2.0291, train metric: 0.2960, valid metric: 0.2950 (best) in 36.0s
Epoch 2/10, train loss: 1.9450, train metric: 0.3570, valid metric: 0.3842 (best) in 36.4s


[I 2026-02-04 07:05:26,302] Trial 13 pruned. 


Epoch 1/10, train loss: 1.8484, train metric: 0.3354, valid metric: 0.3216 (best) in 34.8s
Epoch 2/10, train loss: 1.6151, train metric: 0.4226, valid metric: 0.4332 (best) in 35.3s
Epoch 3/10, train loss: 1.4798, train metric: 0.4711, valid metric: 0.4388 (best) in 35.1s
Epoch 4/10, train loss: 1.3869, train metric: 0.5023, valid metric: 0.4926 (best) in 35.4s
Epoch 5/10, train loss: 1.3098, train metric: 0.5320, valid metric: 0.4782 in 34.8s
Epoch 6/10, train loss: 1.2355, train metric: 0.5582, valid metric: 0.5208 (best) in 34.9s
Epoch 7/10, train loss: 1.1577, train metric: 0.5837, valid metric: 0.5004 in 35.1s
Epoch 8/10, train loss: 1.0850, train metric: 0.6111, valid metric: 0.5024 in 34.7s


[I 2026-02-04 07:10:40,916] Trial 14 finished with value: 0.520799994468689 and parameters: {'n_hidden': 3, 'n_neurons': 804, 'lr': 0.00030369011661163464, 'optimizer': 'NAdam'}. Best is trial 14 with value: 0.520799994468689.


Epoch 9/10, train loss: 1.0096, train metric: 0.6415, valid metric: 0.5178 in 34.6s
Early stopping!
Epoch 1/10, train loss: 1.8370, train metric: 0.3446, valid metric: 0.4050 (best) in 26.6s
Epoch 2/10, train loss: 1.6140, train metric: 0.4266, valid metric: 0.4186 (best) in 26.6s


[I 2026-02-04 07:12:01,093] Trial 15 pruned. 


Epoch 1/10, train loss: 9.7727, train metric: 0.2250, valid metric: 0.2704 (best) in 43.8s
Epoch 2/10, train loss: 1.8060, train metric: 0.3476, valid metric: 0.3848 (best) in 42.6s


[I 2026-02-04 07:14:11,308] Trial 16 pruned. 


Epoch 1/10, train loss: 1.8181, train metric: 0.3517, valid metric: 0.3990 (best) in 41.2s
Epoch 2/10, train loss: 1.6039, train metric: 0.4313, valid metric: 0.4022 (best) in 41.1s


[I 2026-02-04 07:16:15,152] Trial 17 pruned. 


Epoch 1/10, train loss: 1.8545, train metric: 0.3406, valid metric: 0.3736 (best) in 32.8s
Epoch 2/10, train loss: 1.6144, train metric: 0.4286, valid metric: 0.4196 (best) in 33.3s
Epoch 3/10, train loss: 1.5061, train metric: 0.4647, valid metric: 0.4414 (best) in 33.2s
Epoch 4/10, train loss: 1.4156, train metric: 0.4954, valid metric: 0.5008 (best) in 33.6s
Epoch 5/10, train loss: 1.3453, train metric: 0.5211, valid metric: 0.4898 in 33.5s
Epoch 6/10, train loss: 1.2862, train metric: 0.5426, valid metric: 0.5192 (best) in 33.5s
Epoch 7/10, train loss: 1.2230, train metric: 0.5655, valid metric: 0.5084 in 32.6s
Epoch 8/10, train loss: 1.1704, train metric: 0.5853, valid metric: 0.5046 in 33.1s
Epoch 9/10, train loss: 1.1156, train metric: 0.6051, valid metric: 0.5310 (best) in 32.9s


[I 2026-02-04 07:21:46,596] Trial 18 finished with value: 0.531000018119812 and parameters: {'n_hidden': 2, 'n_neurons': 914, 'lr': 0.0002577197496050418, 'optimizer': 'NAdam'}. Best is trial 18 with value: 0.531000018119812.


Epoch 10/10, train loss: 1.0569, train metric: 0.6272, valid metric: 0.5262 in 32.8s
Epoch 1/10, train loss: 1.8839, train metric: 0.3320, valid metric: 0.3686 (best) in 43.1s
Epoch 2/10, train loss: 1.6828, train metric: 0.4088, valid metric: 0.4166 (best) in 42.2s
Epoch 3/10, train loss: 1.5991, train metric: 0.4400, valid metric: 0.4420 (best) in 42.0s


[I 2026-02-04 07:24:36,740] Trial 19 pruned. 



 --- Optimization Complete ---
Best Accuracy Found: 0.5310
Best Hyperparameters:
  n_hidden: 2
  n_neurons: 914
  lr: 0.0002577197496050418
  optimizer: NAdam


In [12]:
params = study.best_params
params

{'n_hidden': 2,
 'n_neurons': 914,
 'lr': 0.0002577197496050418,
 'optimizer': 'NAdam'}

In [18]:
# Reusing the best model hyperparameter to train more epochs

final_model = deep_model(
    n_hidden=params["n_hidden"],
    n_neurons=params["n_neurons"],
    n_inputs=3072,
    n_outputs=10
).to(device)

optimizer_class = getattr(optim, params["optimizer"])
final_optimizer = optimizer_class(
    final_model.parameters(),
    lr=params["lr"],
    weight_decay=params.get("weight_decay", 1e-4)
)

criterion = nn.CrossEntropyLoss()

metric = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    final_optimizer, mode='max', factor=0.1, patience=5
)

print("\n--- Starting Training (50 Epochs) ---")

final_history = train_with_early_stopping(
    model=final_model,
    optimizer=final_optimizer,
    loss_fn=criterion,
    metric=metric,
    train_loader=train_loader,
    valid_loader=valid_loader,
    n_epochs=50,
    patience=10,
    device=device,
    scheduler=scheduler,
    checkpoint_path="best_cifar_mlp.pth",
    trial=None
)

print("Full training complete. Best model saved to 'best_cifar_mlp.pth'")


--- Starting Training (50 Epochs) ---
Epoch 1/50, train loss: 1.8524, train metric: 0.3389, valid metric: 0.3842 (best) in 34.8s
Epoch 2/50, train loss: 1.6251, train metric: 0.4241, valid metric: 0.4172 (best) in 34.5s
Epoch 3/50, train loss: 1.5140, train metric: 0.4626, valid metric: 0.4634 (best) in 33.9s
Epoch 4/50, train loss: 1.4256, train metric: 0.4944, valid metric: 0.4622 in 34.5s
Epoch 5/50, train loss: 1.3563, train metric: 0.5198, valid metric: 0.4838 (best) in 35.0s
Epoch 6/50, train loss: 1.2930, train metric: 0.5409, valid metric: 0.4554 in 34.6s
Epoch 7/50, train loss: 1.2387, train metric: 0.5601, valid metric: 0.5266 (best) in 33.7s
Epoch 8/50, train loss: 1.1838, train metric: 0.5815, valid metric: 0.5262 in 34.4s
Epoch 9/50, train loss: 1.1371, train metric: 0.5963, valid metric: 0.5050 in 34.5s
Epoch 10/50, train loss: 1.0864, train metric: 0.6158, valid metric: 0.5058 in 34.8s
Epoch 11/50, train loss: 1.0392, train metric: 0.6318, valid metric: 0.5276 (best) in

In [19]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2470, 0.2435, 0.2616]
    ),
    transforms.Lambda(lambda x: x.view(-1))
])

In [33]:
class DeepMLP_SELU(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_neurons, n_outputs):
        super().__init__()
        layers = []

        in_features = n_inputs
        for _ in range(n_hidden):
            linear = nn.Linear(in_features, n_neurons)
            nn.init.normal_(linear.weight, mean=0.0, std=(1 / in_features) ** 0.5)
            nn.init.zeros_(linear.bias)

            layers.append(linear)
            layers.append(nn.SELU())
            in_features = n_neurons

        out = nn.Linear(in_features, n_outputs)
        nn.init.normal_(out.weight, mean=0.0, std=(1 / in_features) ** 0.5)
        nn.init.zeros_(out.bias)

        layers.append(out)
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.net(x)


In [34]:
from torchvision import transforms

selu_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2470, 0.2435, 0.2616]
    ),
])


In [35]:
train_and_valid_set = torchvision.datasets.CIFAR10(
    root="datasets", train=True, download=True, transform=selu_transform)
test_set = torchvision.datasets.CIFAR10(
    root="datasets", train=False, download=True, transform=selu_transform)

In [36]:
final_model = DeepMLP_SELU(
    n_inputs=3072,
    n_hidden=params["n_hidden"],
    n_neurons=params["n_neurons"],
    n_outputs=10
).to(device)

In [37]:
final_history_selu = train_with_early_stopping(
    model=final_model,
    optimizer=final_optimizer,
    loss_fn=criterion,
    metric=metric,
    train_loader=train_loader,
    valid_loader=valid_loader,
    n_epochs=50,
    patience=10,
    device=device,
    scheduler=scheduler,
    checkpoint_path="best_cifar_mlp_selu.pth",
    trial=None
)

Epoch 1/50, train loss: 2.4382, train metric: 0.0981, valid metric: 0.0998 (best) in 26.8s
Epoch 2/50, train loss: 2.4382, train metric: 0.0981, valid metric: 0.0998 in 26.9s
Epoch 3/50, train loss: 2.4381, train metric: 0.0981, valid metric: 0.0998 in 27.1s
Epoch 4/50, train loss: 2.4381, train metric: 0.0981, valid metric: 0.0998 in 26.4s
Epoch 5/50, train loss: 2.4381, train metric: 0.0981, valid metric: 0.0998 in 26.9s
Epoch 6/50, train loss: 2.4382, train metric: 0.0981, valid metric: 0.0998 in 26.4s
Epoch 7/50, train loss: 2.4382, train metric: 0.0981, valid metric: 0.0998 in 26.5s
Epoch 8/50, train loss: 2.4381, train metric: 0.0981, valid metric: 0.0998 in 26.5s
Epoch 9/50, train loss: 2.4382, train metric: 0.0981, valid metric: 0.0998 in 26.6s
Epoch 10/50, train loss: 2.4381, train metric: 0.0981, valid metric: 0.0998 in 27.6s
Epoch 11/50, train loss: 2.4382, train metric: 0.0981, valid metric: 0.0998 in 26.8s
Early stopping!


In [38]:
def deep_model_dropout(n_hidden, n_neurons, n_inputs, n_outputs, p=0.2):

    layers = [
        nn.Flatten(),
        nn.Linear(n_inputs, n_neurons),
        nn.SiLU(),
        nn.Dropout(p)
    ]

    for _ in range(n_hidden - 1):
        layers += [
            nn.Linear(n_neurons, n_neurons),
            nn.SiLU(),
            nn.Dropout(p)
        ]

    layers += [nn.Linear(n_neurons, n_outputs)]

    model = nn.Sequential(*layers)
    model.apply(use_he_init)
    return model


In [40]:
optimizer_class = getattr(torch.optim, params["optimizer"])

optimizer = optimizer_class(
    model.parameters(),
    lr=params["lr"]
)

In [41]:
params = {
    'n_hidden': 2,
    'n_neurons': 914,
    'lr': 0.0002577197496050418,
    'optimizer': 'NAdam'
}

model = deep_model_dropout(
    n_hidden=params["n_hidden"],
    n_neurons=params["n_neurons"],
    n_inputs=3072,
    n_outputs=10,
    p=0.2
).to(device)

optimizer_class = getattr(torch.optim, params["optimizer"])
optimizer = optimizer_class(
    model.parameters(),
    lr=params["lr"]
)

best_acc = train_with_early_stopping(
    model=model,
    optimizer=optimizer,
    loss_fn=criterion,
    metric=metric,
    train_loader=train_loader,
    valid_loader=valid_loader,
    n_epochs=50,
    patience=10,
    device=device,
    checkpoint_path="best_mlp_dropout.pth"
)


Epoch 1/50, train loss: 1.8931, train metric: 0.3195, valid metric: 0.4062 (best) in 35.2s
Epoch 2/50, train loss: 1.6828, train metric: 0.4000, valid metric: 0.4356 (best) in 35.3s
Epoch 3/50, train loss: 1.5872, train metric: 0.4338, valid metric: 0.3934 in 34.8s
Epoch 4/50, train loss: 1.5175, train metric: 0.4599, valid metric: 0.4300 in 35.3s
Epoch 5/50, train loss: 1.4582, train metric: 0.4778, valid metric: 0.4782 (best) in 34.2s
Epoch 6/50, train loss: 1.4055, train metric: 0.4977, valid metric: 0.4670 in 35.0s
Epoch 7/50, train loss: 1.3629, train metric: 0.5117, valid metric: 0.4838 (best) in 34.9s
Epoch 8/50, train loss: 1.3247, train metric: 0.5260, valid metric: 0.5122 (best) in 35.5s
Epoch 9/50, train loss: 1.2875, train metric: 0.5409, valid metric: 0.5094 in 35.3s
Epoch 10/50, train loss: 1.2569, train metric: 0.5527, valid metric: 0.5256 (best) in 34.6s
Epoch 11/50, train loss: 1.2215, train metric: 0.5652, valid metric: 0.5250 in 35.2s
Epoch 12/50, train loss: 1.1937,

In [42]:
@torch.no_grad()
def standard_accuracy(model, dataloader, device):
    model.eval()
    correct, total = 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        preds = model(x).argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    return correct / total


In [43]:
std_acc = standard_accuracy(model, valid_loader, device)
print(f"Standard accuracy: {std_acc:.4f}")

Standard accuracy: 0.5716


In [44]:
def enable_dropout(model):
    for m in model.modules():
        if isinstance(m, nn.Dropout):
            m.train()


In [45]:
import torch.nn.functional as F

@torch.no_grad()
def mc_dropout_accuracy(model, dataloader, device, n_samples=30):
    model.eval()
    enable_dropout(model)

    probs_mc = []

    for _ in range(n_samples):
        probs = []
        targets = []

        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            probs.append(F.softmax(logits, dim=1))
            targets.append(y)

        probs_mc.append(torch.cat(probs))

    mean_probs = torch.stack(probs_mc).mean(dim=0)
    preds = mean_probs.argmax(dim=1)
    targets = torch.cat(targets)

    return (preds == targets).float().mean().item()


In [46]:
mc_acc = mc_dropout_accuracy(model, valid_loader, device, n_samples=30)

print(f"Standard accuracy : {std_acc:.4f}")
print(f"MC Dropout accuracy: {mc_acc:.4f}")


Standard accuracy : 0.5716
MC Dropout accuracy: 0.5714
