In [1]:
import torch
import torch.optim as optim
from torch import nn

from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer import imagemodels, Trainer, TrainerSettings, ReportTypes, metrics
from mltrainer.preprocessors import BasePreprocessor
from tomlserializer import TOMLSerializer

In [2]:
# Build the Fashion dataset factory and the train/valid streamers (batch size 64).
fashionfactory = DatasetFactoryProvider.create_factory(DatasetType.FASHION)
preprocessor = BasePreprocessor()
streamers = fashionfactory.create_datastreamer(
    batchsize=64,
    preprocessor=preprocessor,
)
train, valid = streamers["train"], streamers["valid"]
# The stream() results are what the Trainer cells receive as data loaders.
trainstreamer, validstreamer = train.stream(), valid.stream()

[32m2026-02-21 12:33:21.245[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist[0m
[32m2026-02-21 12:33:21.246[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


In [51]:
# Number of batches in one pass over the training streamer at batch size 64.
len(train)

937

In [3]:
# Model settings: accuracy as metric, cross-entropy as loss, and a short first
# run of 3 epochs with 100 train / 100 valid steps per epoch.
# (`torch` is imported in the import cell at the top of the notebook.)
accuracy = metrics.Accuracy()

loss_fn = torch.nn.CrossEntropyLoss()

settings = TrainerSettings(
    epochs=3,
    metrics=[accuracy],
    logdir="modellogs",
    train_steps=100,
    valid_steps=100,
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.TOML],
)

In [4]:
# Model creating class

class NeuralNetwork(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    The input is flattened and passed through
    ``Linear -> ReLU -> Linear -> ReLU -> Linear``; the result is returned as
    raw logits (no softmax is applied here).

    Args:
        num_classes: Number of output logits.
        units1: Width of the first hidden layer.
        units2: Width of the second hidden layer.
        input_size: Number of features per sample after flattening. Defaults
            to ``28 * 28``, the value that used to be hard-coded.
    """

    def __init__(
        self,
        num_classes: int,
        units1: int,
        units2: int,
        input_size: int = 28 * 28,
    ) -> None:
        super().__init__()
        # Hyperparameters are kept as plain attributes on the module.
        self.num_classes = num_classes
        self.units1 = units1
        self.units2 = units2
        self.input_size = input_size
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, units1),
            nn.ReLU(),
            nn.Linear(units1, units2),
            nn.ReLU(),
            nn.Linear(units2, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape ``(batch, num_classes)`` for the batch ``x``."""
        # nn.Flatten keeps dim 0 and merges all remaining dims into one.
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [5]:
# Instantiate the baseline network: 10 output classes, two hidden layers of 256 units.
model = NeuralNetwork(num_classes=10, units1=256, units2=256)

In [6]:
# Setting up the trainer.
# Optimizer and scheduler are handed over as classes (not instances); the data
# loaders are the batch-64 streams created earlier.
trainer = Trainer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    settings=settings,
)

[32m2026-02-21 12:33:21.306[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123321[0m
[32m2026-02-21 12:33:21.844[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [7]:
# Train the model with the current `settings`: 3 epochs of 100 train / 100 valid steps each.
trainer.loop()

100%|[38;2;30;71;6m██████████[0m| 100/100 [00:00<00:00, 759.21it/s]
[32m2026-02-21 12:33:22.185[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.9073 test 0.6600 metric ['0.7741'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:00<00:00, 924.04it/s]
[32m2026-02-21 12:33:22.344[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.5880 test 0.5949 metric ['0.7852'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:00<00:00, 927.17it/s]
[32m2026-02-21 12:33:22.502[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.5084 test 0.5152 metric ['0.8191'][0m
100%|[38;2;30;71;6m██████████[0m| 3/3 [00:00<00:00,  5.95it/s]


In [8]:
# Using the entire train and valid set: one epoch now covers every batch.
# This mutates the shared `settings` object in place, so the already-built
# `trainer` and every Trainer created with `settings` later use these step counts.

settings.train_steps = len(train)
settings.valid_steps = len(valid)


In [9]:
# Second call on the same trainer: training continues with the already-trained
# model, and the epoch counter resumes at 3 (see the log output of this cell).
trainer.loop()

100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 907.23it/s]
[32m2026-02-21 12:33:23.622[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m175[0m - [1mResuming epochs from previous training at 3[0m
[32m2026-02-21 12:33:23.622[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.4124 test 0.4151 metric ['0.8456'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 941.31it/s]
[32m2026-02-21 12:33:24.693[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 4 train 0.3547 test 0.3702 metric ['0.8673'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 949.54it/s]
[32m2026-02-21 12:33:25.757[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 5 train 0.3181 test 0.3506 metric ['0.8757'][0m
100%|[38;2;30;71;6m██████████[0m| 3/3 [00:03<00:00,  1.08s/it]


### Using the entire training and validation set.

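The trainer remembers the previous epochs (the log shows "Resuming epochs from previous training at 3"), so we need to create a new trainer.

Is the same true for the model? Yes: the model object keeps its trained weights (the loss continues from where the first run ended), so a fresh model is needed as well.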

In [10]:
# Fresh model and fresh trainer, so nothing is carried over from the first run.
model2 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer2 = Trainer(
    model=model2,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    settings=settings,
)

[32m2026-02-21 12:33:25.763[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123325[0m
[32m2026-02-21 12:33:25.763[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [11]:
# Train the fresh model2; `settings` now uses the full streamer lengths as step counts.
trainer2.loop()

100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 795.01it/s]
[32m2026-02-21 12:33:27.027[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5126 test 0.4172 metric ['0.8490'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 931.97it/s]
[32m2026-02-21 12:33:28.108[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3671 test 0.3844 metric ['0.8612'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 919.51it/s]
[32m2026-02-21 12:33:29.202[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3302 test 0.3867 metric ['0.8621'][0m
[32m2026-02-21 12:33:29.202[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__call__[0m:[36m252[0m - [1mbest loss: 0.3844, current loss 0.3867.Counter 1/10.[0m
100%|[38;2;30;71;6m██████████[0m| 3/3 [00:03<00:00,  1.14s/it]


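It is interesting to compare trainer 2 (a fresh model trained on the complete dataset from the start) with trainer 1 (first trained on only 100 batches per epoch, then continued on the complete dataset). After three epochs on the complete dataset, trainer 2 reaches an accuracy of 0.8621, clearly better than the 0.8191 that trainer 1 reached after its first three epochs on the incomplete dataset. Note that, in total, trainer 1 has seen more data after its second run and ends slightly higher (0.8757), so the comparison is only fair per epoch. How good can we make the model when we give it the same budget? The amount of data the model sees is determined by the number of steps within an epoch (the train and valid steps) and by the number of epochs. In these runs the amount of data per epoch has a bigger impact than the number of epochs, which is reasonable to assume: the more data the model sees, the more information it gets.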

### Question:

How does the streamer work?

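The streamer shuffles the index it serves from. When not all images are served within an epoch, the next epoch does not continue with the unseen images but is served a new random set of images. If you want to serve the entire dataset in every epoch, the number of training steps must be large enough to cover it (`len(train)` steps). TODO: verify this against the `stream()` implementation in `mads_datasets`; if the index is only reshuffled once every batch has been served, a short epoch would instead continue with the still-unseen batches.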

In [12]:
# Changing the amount of epochs: raise it from 3 to 10 on the shared `settings`,
# then pair a fresh model with a fresh trainer.
settings.epochs = 10

model3 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer3 = Trainer(
    model=model3,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    settings=settings,
)

[32m2026-02-21 12:33:29.208[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123329[0m
[32m2026-02-21 12:33:29.209[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [13]:
# Train model3 with `settings.epochs` raised to 10, on the complete dataset.
trainer3.loop()

100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 845.48it/s]
[32m2026-02-21 12:33:30.451[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5076 test 0.4343 metric ['0.8381'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 911.70it/s]
[32m2026-02-21 12:33:31.562[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3631 test 0.4017 metric ['0.8485'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 815.43it/s]
[32m2026-02-21 12:33:32.788[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3303 test 0.3986 metric ['0.8589'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 832.94it/s]
[32m2026-02-21 12:33:33.995[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3031 test 0.3464 metric ['0.8774'][0m
100%|[38;2;30;71;6m██████████[0m| 

Increasing the number of epochs does not keep improving performance, in either accuracy or loss. Most probably the model starts to overfit the training data by memorizing it: the training loss keeps going down towards zero while the test loss increases.

The question is how to stop the model early.

Using patience:

In [14]:
# Early stopping with a patience of 1.
# NOTE(review): this replaces the whole kwargs dict on the shared `settings`;
# any other early-stopping keys that were set before are dropped - confirm intended.
settings.earlystop_kwargs = {"patience": 1}

model4 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer4 = Trainer(
    model=model4,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    settings=settings,
)

[32m2026-02-21 12:33:41.609[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123341[0m
[32m2026-02-21 12:33:41.610[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [15]:
# Train model4; early stopping is now configured with a patience of 1.
trainer4.loop()

100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 731.69it/s]
[32m2026-02-21 12:33:42.988[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5122 test 0.4301 metric ['0.8444'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 877.49it/s]
[32m2026-02-21 12:33:44.133[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3653 test 0.3680 metric ['0.8698'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 854.62it/s]
[32m2026-02-21 12:33:45.310[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3290 test 0.3493 metric ['0.8711'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 743.60it/s]
[32m2026-02-21 12:33:46.665[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3017 test 0.3486 metric ['0.8728'][0m
100%|[38;2;30;71;6m██████████[0m| 

### Changing the batch size



In [16]:
# Same dataset and preprocessor, now served in batches of 4.
streamers2 = fashionfactory.create_datastreamer(
    batchsize=4,
    preprocessor=preprocessor,
)
train2, valid2 = streamers2["train"], streamers2["valid"]
trainstreamer2, validstreamer2 = train2.stream(), valid2.stream()

[32m2026-02-21 12:33:48.152[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist[0m
[32m2026-02-21 12:33:48.153[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


In [17]:
# Batches per pass over the train and valid streamers at batch size 4.
len(train2), len(valid2)

(15000, 2500)

In [18]:
# Keep full data exposure at batch size 4 by matching the step counts to the
# new streamer lengths. This mutates the shared `settings` in place; it still
# carries epochs=10 and the early-stopping patience of 1 set in earlier cells.
settings.train_steps = len(train2)
settings.valid_steps = len(valid2)

In [19]:
# Fresh model and trainer fed by the batch-size-4 streams.
model5 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer5 = Trainer(
    model=model5,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer2,
    validdataloader=validstreamer2,
    settings=settings,
)

[32m2026-02-21 12:33:48.214[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123348[0m
[32m2026-02-21 12:33:48.215[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [20]:
# Train model5 with batch size 4 (15000 train steps per epoch).
trainer5.loop()

100%|[38;2;30;71;6m██████████[0m| 15000/15000 [00:10<00:00, 1381.56it/s]
[32m2026-02-21 12:33:59.357[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.4907 test 0.4364 metric ['0.8361'][0m
100%|[38;2;30;71;6m██████████[0m| 15000/15000 [00:10<00:00, 1363.66it/s]
[32m2026-02-21 12:34:10.636[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3863 test 0.4508 metric ['0.8350'][0m
[32m2026-02-21 12:34:10.637[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__call__[0m:[36m252[0m - [1mbest loss: 0.4364, current loss 0.4508.Counter 1/1.[0m
[32m2026-02-21 12:34:10.637[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mloop[0m:[36m103[0m - [1mInterrupting loop due to early stopping patience.[0m
[32m2026-02-21 12:34:10.637[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mloop[0m:[36m108[0m - [1mearly_stopping_save was false, using latest model.Se

### Why does the training take longer?

The training of the model with batch size 4 takes considerably longer, because there are far more training steps per epoch (15,000 instead of 937), while the matrix multiplication for a batch of 4 images takes roughly the same amount of time as for a batch of 64 images. The computational overhead per image is therefore drastically increased.

### Why does the model perform worse?

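This is due to the way the optimizer works and the number of times the weights get updated: with only 4 images per batch every gradient estimate is noisy, and the weights are updated 15,000 times per epoch.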

### Changing the batch size to 128

In [21]:
# Same dataset and preprocessor, now served in batches of 128.
streamers3 = fashionfactory.create_datastreamer(
    batchsize=128,
    preprocessor=preprocessor,
)
train3, valid3 = streamers3["train"], streamers3["valid"]
trainstreamer3, validstreamer3 = train3.stream(), valid3.stream()

[32m2026-02-21 12:34:10.644[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist[0m
[32m2026-02-21 12:34:10.644[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


In [22]:
# Batches per pass over the train and valid streamers at batch size 128.
len(train3), len(valid3)

(468, 78)

In [23]:
# Match the step counts to the batch-128 streamers so one epoch covers all batches.
# Mutates the shared `settings` object in place.
settings.train_steps = len(train3)
settings.valid_steps = len(valid3)

In [24]:
# Fresh model and trainer fed by the batch-size-128 streams.
model6 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer6 = Trainer(
    model=model6,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer3,
    validdataloader=validstreamer3,
    settings=settings,
)

[32m2026-02-21 12:34:10.716[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123410[0m
[32m2026-02-21 12:34:10.717[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [25]:
# Train model6 with batch size 128 (468 train steps per epoch).
trainer6.loop()

  0%|[38;2;30;71;6m          [0m| 0/10 [00:00<?, ?it/s]

100%|[38;2;30;71;6m██████████[0m| 468/468 [00:00<00:00, 536.89it/s]
[32m2026-02-21 12:34:11.713[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5514 test 0.4560 metric ['0.8367'][0m
100%|[38;2;30;71;6m██████████[0m| 468/468 [00:00<00:00, 599.57it/s]
[32m2026-02-21 12:34:12.565[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3808 test 0.4143 metric ['0.8503'][0m
100%|[38;2;30;71;6m██████████[0m| 468/468 [00:00<00:00, 633.10it/s]
[32m2026-02-21 12:34:13.378[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3397 test 0.3662 metric ['0.8692'][0m
100%|[38;2;30;71;6m██████████[0m| 468/468 [00:00<00:00, 605.76it/s]
[32m2026-02-21 12:34:14.222[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3124 test 0.3550 metric ['0.8690'][0m
100%|[38;2;30;71;6m██████████[0m| 

With batch size 128 the model performs slightly better, in both accuracy and loss. Furthermore, it needed an additional epoch before early stopping was triggered. Will doubling the batch size once more (to 256) increase performance further?

In [26]:
# Same dataset and preprocessor, now served in batches of 256.
streamers4 = fashionfactory.create_datastreamer(
    batchsize=256,
    preprocessor=preprocessor,
)
train4, valid4 = streamers4["train"], streamers4["valid"]
trainstreamer4, validstreamer4 = train4.stream(), valid4.stream()

[32m2026-02-21 12:34:16.315[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist[0m
[32m2026-02-21 12:34:16.316[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


In [27]:
# Match the step counts to the batch-256 streamers so one epoch covers all batches.
# Mutates the shared `settings` object in place.
settings.train_steps = len(train4)
settings.valid_steps = len(valid4)

In [28]:
# Fresh model and trainer fed by the batch-size-256 streams.
model7 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer7 = Trainer(
    model=model7,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer4,
    validdataloader=validstreamer4,
    settings=settings,
)

[32m2026-02-21 12:34:16.339[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-123416[0m
[32m2026-02-21 12:34:16.340[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [29]:
# Train model7 with batch size 256 (234 train steps per epoch).
trainer7.loop()

100%|[38;2;30;71;6m██████████[0m| 234/234 [00:00<00:00, 322.31it/s]
[32m2026-02-21 12:34:17.188[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.6146 test 0.4532 metric ['0.8383'][0m
100%|[38;2;30;71;6m██████████[0m| 234/234 [00:00<00:00, 336.90it/s]
[32m2026-02-21 12:34:17.952[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4059 test 0.4089 metric ['0.8508'][0m
100%|[38;2;30;71;6m██████████[0m| 234/234 [00:00<00:00, 383.88it/s]
[32m2026-02-21 12:34:18.631[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3580 test 0.3790 metric ['0.8675'][0m
100%|[38;2;30;71;6m██████████[0m| 234/234 [00:00<00:00, 396.54it/s]
[32m2026-02-21 12:34:19.290[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3319 test 0.3795 metric ['0.8612'][0m
[32m2026-02-21 12:34:19.291[0m | 

There appears to be a sweet spot for the batch size. With batch size 256 the model gets fewer weight updates per epoch (234 instead of 468). There appears to be an optimum that depends on dataset size, model size and batch size together.

### The Winning Metrics (Trainer 3)

Best Test Loss: 0.3318 (achieved at Epoch 8)

Best Test Accuracy: 0.8861 (88.61%)

Configuration: batch size 64, no early stopping triggered, full data exposure. The open question is whether the other trainers would have performed better if they had not been stopped early. But spending 6 weeks checking every combination is not feasible right now.

In [36]:

# Recreate the batch-64 streamers and build a separate settings object for the
# best configuration so far: 10 epochs, full data exposure, early stopping off.
fashionfactory = DatasetFactoryProvider.create_factory(DatasetType.FASHION)
preprocessor = BasePreprocessor()
streamers = fashionfactory.create_datastreamer(
    batchsize=64,  # batch size 64
    preprocessor=preprocessor,
)
train, valid = streamers["train"], streamers["valid"]
trainstreamer, validstreamer = train.stream(), valid.stream()

settings2 = TrainerSettings(
    epochs=10,
    metrics=[accuracy],
    logdir="modellogs",
    train_steps=len(train),  # full data exposure
    valid_steps=len(valid),  # full data exposure
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.TOML],
    earlystop_kwargs=None,  # no early stopping
)

[32m2026-02-21 13:22:36.977[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist[0m
[32m2026-02-21 13:22:36.979[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


In [37]:
# Fresh model trained with `settings2` (no early stopping) on the batch-64 streams.
model8 = NeuralNetwork(num_classes=10, units1=256, units2=256)

trainer8 = Trainer(
    model=model8,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    settings=settings2,
)

[32m2026-02-21 13:22:38.758[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-132238[0m


In [38]:
# Train model8 with settings2: 10 epochs on the complete dataset, early stopping disabled.
trainer8.loop()

100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 859.53it/s]
[32m2026-02-21 13:22:45.222[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5035 test 0.4573 metric ['0.8272'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 910.74it/s]
[32m2026-02-21 13:22:46.329[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3665 test 0.3800 metric ['0.8593'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 901.53it/s]
[32m2026-02-21 13:22:47.462[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3304 test 0.3825 metric ['0.8623'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 800.75it/s]
[32m2026-02-21 13:22:48.710[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3072 test 0.3456 metric ['0.8736'][0m
100%|[38;2;30;71;6m██████████[0m| 

In [42]:
# Learning-rate sweep: train a fresh model for each value below.
learning_rates = [1e-2, 1e-3, 1e-4]

for lr in learning_rates:
    # NOTE(review): this writes into the shared `settings2` object, so after the
    # loop `settings2` appears to keep the last learning rate - confirm intended.
    settings2.optimizer_kwargs["lr"] = lr

    lr_model = NeuralNetwork(num_classes=10, units1=256, units2=256)

    lr_trainer = Trainer(
        model=lr_model,
        loss_fn=loss_fn,
        optimizer=optim.Adam,
        scheduler=optim.lr_scheduler.ReduceLROnPlateau,
        traindataloader=trainstreamer,
        validdataloader=validstreamer,
        settings=settings2,
    )

    lr_trainer.loop()

[32m2026-02-21 13:33:10.255[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-133310[0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 839.91it/s]
[32m2026-02-21 13:33:11.454[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5299 test 0.4901 metric ['0.8268'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 914.33it/s]
[32m2026-02-21 13:33:12.558[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4176 test 0.5718 metric ['0.8239'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 921.19it/s]
[32m2026-02-21 13:33:13.653[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.4004 test 0.4380 metric ['0.8464'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 851.44it/s]
[32m2026-02-21 13:33:14.838[0m | [1mI

- The first learning rate (1e-2) is too high: the test loss goes up and down, and the accuracy is not that great.
- The second learning rate (1e-3) seems to be the sweet spot: train and test loss decrease together and the accuracy is the best.
- The third learning rate (1e-4) seems too low: learning is slow, and 10 epochs are not enough to reach the bottom of the loss function.

So the best parameters up to now are:

- batch size: 64
- epochs: 10, without early stopping
- learning rate: 1e-3

### Checking model size

In [44]:
# Settings for the model-size experiments: the best configuration so far, with
# the learning rate fixed explicitly at 1e-3 and early stopping disabled.
settings3 = TrainerSettings(
    epochs=10,
    metrics=[accuracy],
    logdir="modellogs",
    train_steps=len(train),  # full data exposure
    valid_steps=len(valid),  # full data exposure
    optimizer_kwargs={"lr": 1e-3},
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.TOML],
    earlystop_kwargs=None,  # no early stopping
)

In [45]:
# Width sweep: the first hidden layer stays at 256 units while the second one
# shrinks from 256 down to 32.
network_configs = [
    {"units1": 256, "units2": 256},
    {"units1": 256, "units2": 128},
    {"units1": 256, "units2": 64},
    {"units1": 256, "units2": 32},
]

for config in network_configs:
    # Loop-local names: the original loop reassigned the notebook-wide `model`
    # (still referenced by `trainer`) and `trainer8` on every iteration.
    size_model = NeuralNetwork(num_classes=10, **config)

    size_trainer = Trainer(
        model=size_model,
        settings=settings3,
        loss_fn=loss_fn,
        optimizer=optim.Adam,
        traindataloader=trainstreamer,
        validdataloader=validstreamer,
        scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    )

    size_trainer.loop()

[32m2026-02-21 13:53:02.161[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-135302[0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 920.53it/s]
[32m2026-02-21 13:53:03.265[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5142 test 0.4325 metric ['0.8457'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 964.42it/s]
[32m2026-02-21 13:53:04.315[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3667 test 0.3976 metric ['0.8576'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 926.16it/s]
[32m2026-02-21 13:53:05.404[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3293 test 0.3577 metric ['0.8715'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 902.74it/s]
[32m2026-02-21 13:53:06.520[0m | [1mI

The largest model performs marginally better, with the lowest loss and the highest accuracy. However, the differences are very small.

### Creating a deeper network

In [48]:
class NeuralNetwork_3_deep(nn.Module):
    """Fully connected classifier with three hidden ReLU layers.

    The input is flattened and passed through Linear layers of width
    ``units1``, ``units2`` and ``units3`` (each followed by ReLU); a final
    Linear layer maps to ``num_classes`` raw scores. No softmax is applied.

    Args:
        num_classes: Size of the output layer.
        units1: Width of the first hidden layer.
        units2: Width of the second hidden layer.
        units3: Width of the third hidden layer.
    """

    def __init__(self, num_classes: int, units1: int, units2: int, units3: int) -> None:
        super().__init__()
        # Keep the hyperparameters on the instance so they can be inspected later.
        self.num_classes, self.units1 = num_classes, units1
        self.units2, self.units3 = units2, units3
        self.flatten = nn.Flatten()

        # First Linear layer takes 28 * 28 input features, then each hidden
        # width in turn; every hidden Linear layer is followed by a ReLU.
        widths = [28 * 28, units1, units2, units3]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output layer has no activation: it returns logits.
        layers.append(nn.Linear(units3, num_classes))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten ``x`` and return the logits produced by the layer stack."""
        return self.linear_relu_stack(self.flatten(x))

In [49]:
# Three hidden layers that narrow towards the 10-class output: 256 -> 128 -> 64.
model_3_deep = NeuralNetwork_3_deep(
    num_classes=10,
    units1=256,
    units2=128,
    units3=64,
)

In [50]:
# Train the three-hidden-layer model with the same settings, loss, optimizer
# and scheduler as the two-hidden-layer runs, so only the architecture differs.
trainer_3_deep = Trainer(
    model=model_3_deep,
    settings=settings3,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)
trainer_3_deep.loop()

[32m2026-02-21 14:05:18.595[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/20260221-140518[0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 930.30it/s]
[32m2026-02-21 14:05:19.690[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.5611 test 0.4437 metric ['0.8409'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 998.38it/s]
[32m2026-02-21 14:05:20.707[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.3785 test 0.3849 metric ['0.8612'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1018.53it/s]
[32m2026-02-21 14:05:21.704[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3376 test 0.3787 metric ['0.8662'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 988.43it/s]
[32m2026-02-21 14:05:22.729[0m | [1m

Adding an additional layer does not automatically result in better performance. It also appears that the model is overfitting. Train loss gets lower but test loss increases.

### Hypothesis

Larger networks have more weights, meaning each gradient update changes more parameters simultaneously. I expect that large networks are more sensitive to high learning rates, requiring a smaller learning rate to train stably compared to smaller networks.

### The experiment

Testing how network size influences sensitivity to the learning rate. The following three networks are compared:

Small = 64-64 ≈ 55k parameters
Medium = 256-256 ≈ 270k parameters
Large = 1024-1024 ≈ 1,860k parameters

Batch size = 64 -> 937 batches per epoch

Number of epochs = 50

Learning rates 1 = [0.01, 0.001, 0.0001, 0.00001]
Learning rates 2 = [1.0, 0.5, 0.1, 0.01]

Optimizer = SGD, so that the learning rate remains a controlled independent variable. Adam and other adaptive optimizers rescale the effective step size per parameter internally.

Scheduler = None, the scheduler also changes the learning rate, therefore it is disabled.

Loss function = CrossEntropyLoss

### Results

Testing with the initial learning rate range (0.01 to 0.00001) revealed that 20 epochs was insufficient for the models to converge, so this was increased to 50 epochs. At 50 epochs, most configurations converged, with the exception of the very lowest learning rate (0.00001), which was still progressing but too slow to be practically useful. Importantly, none of the configurations in this range showed unstable training, meaning this range was too conservative to test the hypothesis.

A second set of learning rates (1.0, 0.5, 0.1, 0.01) was therefore used to push the models into a regime where instability could occur and the effect of network size on learning rate sensitivity could be observed.

At a learning rate of 1.0, all three networks failed to learn meaningfully. The large network collapsed entirely, outputting NaN for every epoch. The small network stagnated at a loss of 2.3 across all epochs, which corresponds to random chance for a 10-class classification problem (log(10) ≈ 2.3), meaning it learned nothing. The medium network showed some movement, reaching a loss of 1.7, but this is still far from useful performance. A learning rate of 1.0 is too aggressive for all network sizes.

At a learning rate of 0.1, all three network sizes show meaningful learning. Training loss improves steadily across all models. However, test loss decreases initially but begins rising after around epoch 20 and does not recover, a classic sign of overfitting. The effect is most pronounced in the largest network, which ends with the highest test loss of the three at around 0.43, compared to 0.40 for the small network. This is expected behavior: larger networks have higher capacity and are therefore more prone to memorizing the training data when no regularization is applied. The unsmoothed loss curves also reveal more erratic behavior in the larger network, with more pronounced spikes throughout training. This could be an early indication of learning rate sensitivity.

At lr=0.01 the picture is considerably cleaner. Test loss decreases smoothly for all three networks without the upward turn seen at lr=0.1, and while some minor spiking is still visible in the unsmoothed curves, it is far less pronounced than at the higher learning rate. All three network sizes perform very similarly, with final test losses clustered tightly between 0.33 and 0.36, and the larger network is marginally the best of the three. 

### Conclusion

The results do not support the hypothesis. All three network sizes failed at lr=1.0 and showed overfitting at lr=0.1, regardless of the number of parameters. At lr=0.01, all three converged to roughly similar performance. This suggests that the learning rate thresholds were similar across network sizes, meaning larger networks did not require a meaningfully lower learning rate than smaller ones.

In [64]:
# Experiment grid: every network size below is trained once per learning rate.
learning_rates = [1.0, 0.5, 0.1, 0.01]

network_configs = [
    {"units1": 64, "units2": 64},
    {"units1": 256, "units2": 256},
    {"units1": 1024, "units2": 1024},
]

# Rebuild the data streamers for this experiment (batch size 64).
fashionfactory = DatasetFactoryProvider.create_factory(DatasetType.FASHION)
preprocessor = BasePreprocessor()
streamers = fashionfactory.create_datastreamer(batchsize=64, preprocessor=preprocessor)
train = streamers["train"]
valid = streamers["valid"]
trainstreamer = train.stream()
validstreamer = valid.stream()

accuracy = metrics.Accuracy()

loss_fn = torch.nn.CrossEntropyLoss()

settings = TrainerSettings(
    epochs=50,
    metrics=[accuracy],
    logdir="modellogs",
    # Step counts equal the number of batches, i.e. one full pass per epoch.
    train_steps=len(train),
    valid_steps=len(valid),
    # Placeholder value. The original read the loop variable `lr`, which is not
    # bound when this cell runs, so it failed with NameError on a fresh kernel.
    # The training loop overwrites optimizer_kwargs["lr"] for every run.
    optimizer_kwargs={"lr": learning_rates[0]},
    earlystop_kwargs=None,
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.TOML],
)

[32m2026-02-21 16:42:24.939[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist[0m
[32m2026-02-21 16:42:24.941[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/stevenbontius/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


In [65]:
# Grid sweep: each network size is trained once per learning rate (size-major
# order). One shared `settings` object is re-pointed at a new log directory and
# learning rate before every run.
for config in network_configs:
    for lr in learning_rates:
        u1 = config["units1"]
        u2 = config["units2"]

        # The run name encodes both hidden widths and the learning rate so the
        # log folders can be told apart.
        run_name = f"u1_{u1}_u2_{u2}_lr_{lr}"
        print(f"Running: {run_name}")
        logdir = f"modellogs/{run_name}"

        # Fresh weights for every run.
        model = NeuralNetwork(num_classes=10, units1=u1, units2=u2)

        settings.logdir = logdir
        settings.optimizer_kwargs.update({"lr": lr})

        # Plain SGD without a scheduler, so `lr` is the only learning-rate knob.
        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=loss_fn,
            optimizer=optim.SGD,
            traindataloader=trainstreamer,
            validdataloader=validstreamer,
            scheduler=None,
        )
        trainer.loop()
        
        

[32m2026-02-21 16:42:27.123[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to modellogs/u1_64_u2_64_lr_1.0/20260221-164227[0m


Running: u1_64_u2_64_lr_1.0


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1449.43it/s]
[32m2026-02-21 16:42:27.849[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.2963 test 2.3058 metric ['0.1000'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1719.40it/s]
[32m2026-02-21 16:42:28.463[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 2.3065 test 2.3086 metric ['0.1001'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1724.44it/s]
[32m2026-02-21 16:42:29.077[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 2.3065 test 2.3057 metric ['0.1001'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1699.25it/s]
[32m2026-02-21 16:42:29.699[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 2.3065 test 2.3060 metric ['0.1001'][0m
100%|[38;2;30;71;6m██████████[

Running: u1_64_u2_64_lr_0.5


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1175.51it/s]
[32m2026-02-21 16:43:04.043[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.7509 test 0.5087 metric ['0.8181'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1342.36it/s]
[32m2026-02-21 16:43:04.815[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4768 test 0.4685 metric ['0.8361'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1453.76it/s]
[32m2026-02-21 16:43:05.537[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.4268 test 0.4202 metric ['0.8505'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1567.95it/s]
[32m2026-02-21 16:43:06.207[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.4038 test 0.4811 metric ['0.8162'][0m
100%|[38;2;30;71;6m██████████[

Running: u1_64_u2_64_lr_0.1


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1163.48it/s]
[32m2026-02-21 16:43:42.080[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.7010 test 0.5455 metric ['0.7944'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 971.36it/s]
[32m2026-02-21 16:43:43.141[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4463 test 0.4338 metric ['0.8442'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1319.89it/s]
[32m2026-02-21 16:43:43.936[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3967 test 0.4077 metric ['0.8538'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1337.46it/s]
[32m2026-02-21 16:43:44.713[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3736 test 0.4109 metric ['0.8504'][0m
100%|[38;2;30;71;6m██████████[0

Running: u1_64_u2_64_lr_0.01


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1643.28it/s]
[32m2026-02-21 16:44:21.226[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 1.4663 test 0.8739 metric ['0.6846'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1521.62it/s]
[32m2026-02-21 16:44:21.922[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.7362 test 0.6695 metric ['0.7575'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1538.79it/s]
[32m2026-02-21 16:44:22.610[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.6076 test 0.5922 metric ['0.7881'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1513.00it/s]
[32m2026-02-21 16:44:23.309[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.5461 test 0.5471 metric ['0.8050'][0m
100%|[38;2;30;71;6m██████████[

Running: u1_256_u2_256_lr_1.0


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1014.41it/s]
[32m2026-02-21 16:44:58.028[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 1.9131 test 1.6537 metric ['0.3034'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 852.03it/s]
[32m2026-02-21 16:44:59.230[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 1.8202 test 1.7164 metric ['0.2004'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 990.72it/s]
[32m2026-02-21 16:45:00.277[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 1.8120 test 1.7026 metric ['0.2003'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1148.84it/s]
[32m2026-02-21 16:45:01.195[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 1.8215 test 1.7119 metric ['0.1995'][0m
100%|[38;2;30;71;6m██████████[0m

Running: u1_256_u2_256_lr_0.5


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1275.33it/s]
[32m2026-02-21 16:45:43.769[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.7175 test 0.4853 metric ['0.8235'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1120.88it/s]
[32m2026-02-21 16:45:44.697[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4393 test 0.4829 metric ['0.8225'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1192.54it/s]
[32m2026-02-21 16:45:45.575[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3845 test 0.4327 metric ['0.8435'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1067.90it/s]
[32m2026-02-21 16:45:46.546[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3573 test 0.3836 metric ['0.8617'][0m
100%|[38;2;30;71;6m██████████[

Running: u1_256_u2_256_lr_0.1


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1281.97it/s]
[32m2026-02-21 16:46:29.555[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.6738 test 0.4726 metric ['0.8344'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1126.62it/s]
[32m2026-02-21 16:46:30.478[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4319 test 0.4447 metric ['0.8393'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1283.28it/s]
[32m2026-02-21 16:46:31.305[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3817 test 0.4336 metric ['0.8421'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1308.46it/s]
[32m2026-02-21 16:46:32.106[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3536 test 0.3862 metric ['0.8599'][0m
100%|[38;2;30;71;6m██████████[

Running: u1_256_u2_256_lr_0.01


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1156.75it/s]
[32m2026-02-21 16:47:12.015[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 1.3361 test 0.8084 metric ['0.7030'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1327.79it/s]
[32m2026-02-21 16:47:12.805[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.6931 test 0.6387 metric ['0.7735'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1326.47it/s]
[32m2026-02-21 16:47:13.594[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.5792 test 0.5644 metric ['0.8042'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:00<00:00, 1313.07it/s]
[32m2026-02-21 16:47:14.391[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.5238 test 0.5314 metric ['0.8133'][0m
100%|[38;2;30;71;6m██████████[

Running: u1_1024_u2_1024_lr_1.0


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:02<00:00, 428.19it/s]
[32m2026-02-21 16:47:55.684[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train nan test nan metric ['0.1000'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 473.09it/s]
[32m2026-02-21 16:47:57.822[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train nan test nan metric ['0.1001'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:02<00:00, 465.52it/s]
[32m2026-02-21 16:48:00.124[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train nan test nan metric ['0.1002'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:02<00:00, 372.39it/s]
[32m2026-02-21 16:48:02.788[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train nan test nan metric ['0.0999'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:02<00:00, 41

Running: u1_1024_u2_1024_lr_0.5


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 647.68it/s]
[32m2026-02-21 16:49:29.372[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.6561 test 0.5040 metric ['0.8210'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 527.09it/s]
[32m2026-02-21 16:49:31.279[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.5859 test 0.4826 metric ['0.8230'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 650.21it/s]
[32m2026-02-21 16:49:32.849[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.4090 test 0.4076 metric ['0.8540'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 574.78it/s]
[32m2026-02-21 16:49:34.609[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3570 test 0.3834 metric ['0.8641'][0m
100%|[38;2;30;71;6m██████████[0m| 

Running: u1_1024_u2_1024_lr_0.1


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 538.15it/s]
[32m2026-02-21 16:50:57.103[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.6322 test 0.4599 metric ['0.8341'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 618.64it/s]
[32m2026-02-21 16:50:58.750[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.4151 test 0.4262 metric ['0.8451'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 589.98it/s]
[32m2026-02-21 16:51:00.480[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3655 test 0.4059 metric ['0.8534'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 474.65it/s]
[32m2026-02-21 16:51:02.607[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.3392 test 0.3688 metric ['0.8669'][0m
100%|[38;2;30;71;6m██████████[0m| 

Running: u1_1024_u2_1024_lr_0.01


100%|[38;2;30;71;6m██████████[0m| 937/937 [00:02<00:00, 468.48it/s]
[32m2026-02-21 16:52:34.432[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 1.2112 test 0.7573 metric ['0.7247'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 517.33it/s]
[32m2026-02-21 16:52:36.411[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.6555 test 0.6063 metric ['0.7872'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:01<00:00, 506.75it/s]
[32m2026-02-21 16:52:38.410[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.5523 test 0.5512 metric ['0.8075'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:02<00:00, 361.00it/s]
[32m2026-02-21 16:52:41.229[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.5054 test 0.5145 metric ['0.8183'][0m
100%|[38;2;30;71;6m██████████[0m| 