In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fl_g13.config import RAW_DATA_DIR
from torchvision import datasets, transforms

from fl_g13.modeling import train
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

[32m2025-04-18 09:37:18.151[0m | [1mINFO    [0m | [36mfl_g13.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/massimiliano/Projects/fl-g13[0m


### Load data

In [3]:
transform = transforms.Compose([
    transforms.ToTensor()
])
cifar100_train = datasets.CIFAR100(root=RAW_DATA_DIR, train=True, download=True, transform=transform)
cifar100_test = datasets.CIFAR100(root=RAW_DATA_DIR, train=False, download=True, transform=transform)

### Train and save model

In [4]:
class TinyCNN(nn.Module):
    def __init__(self, num_classes=100):
        super(TinyCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.fc1 = nn.Linear(32 * 8 * 8, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))     # -> [B, 16, 32, 32]
        x = F.max_pool2d(x, 2)        # -> [B, 16, 16, 16]
        x = F.relu(self.conv2(x))     # -> [B, 32, 16, 16]
        x = F.max_pool2d(x, 2)        # -> [B, 32, 8, 8]
        x = x.view(x.size(0), -1)     # -> [B, 32*8*8]
        x = self.fc1(x)               # -> [B, 100]
        return x

In [5]:
checkpoint_dir = "/home/massimiliano/Projects/fl-g13/checkpoints"

# Parameters
batch_size  = 32
start_epoch = 1
num_epochs  = 2
save_every  = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = torch.utils.data.DataLoader(cifar100_train, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(cifar100_test, batch_size=batch_size, shuffle=True)

model = TinyCNN(100)
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.04)
criterion = torch.nn.CrossEntropyLoss()

In [6]:
# This will train the model, using an automatically generated name
train(
    checkpoint_dir=checkpoint_dir,
    prefix="", # Will automatically generate a name for the model
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,
    criterion=criterion,
    start_epoch=start_epoch,
    num_epochs=num_epochs,
    save_every=save_every,
    model=model,
    optimizer=optimizer,
    scheduler=None,
    verbose=False,
)

No prefix/name for the model was provided, choosen prefix/name: cheeky_charmander_43

🚀 Epoch 1/2 (50.00%) Completed
	📊 Training Loss: 4.1400
	✅ Training Accuracy: 8.50%
	⏳ Elapsed Time: 4.18s | ETA: 4.18s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.8421
	🎯 Validation Accuracy: 13.36%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/cheeky_charmander_43_epoch_1.pth

🚀 Epoch 2/2 (100.00%) Completed
	📊 Training Loss: 3.6664
	✅ Training Accuracy: 16.58%
	⏳ Elapsed Time: 3.97s | ETA: 0.00s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.5802
	🎯 Validation Accuracy: 18.12%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/cheeky_charmander_43_epoch_2.pth



([4.140020949216661, 3.6664106693316634],
 [3.8421174946684427, 3.5801682068517033],
 [0.08496, 0.16576],
 [0.1336, 0.1812])

In [7]:
# This will train again the same model, but with a custom name
# The epochs will still start from 1
train(
    checkpoint_dir=checkpoint_dir,
    prefix="TinyCNN", # Setting a name for the model
    start_epoch=start_epoch,
    num_epochs=num_epochs,
    save_every=save_every,
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,
    model=model, # Use the same model as before (partially pre-trained)
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    verbose=False,
)

🚀 Epoch 1/2 (50.00%) Completed
	📊 Training Loss: 3.4445
	✅ Training Accuracy: 20.49%
	⏳ Elapsed Time: 3.71s | ETA: 3.71s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.4278
	🎯 Validation Accuracy: 21.30%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN_epoch_1.pth

🚀 Epoch 2/2 (100.00%) Completed
	📊 Training Loss: 3.2926
	✅ Training Accuracy: 23.35%
	⏳ Elapsed Time: 3.88s | ETA: 0.00s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.3096
	🎯 Validation Accuracy: 23.37%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN_epoch_2.pth



([3.444468788511846, 3.292574101354705],
 [3.4278288519801423, 3.309554140407818],
 [0.20488, 0.2335],
 [0.213, 0.2337])

### Resume training

In [8]:
from fl_g13.modeling import load

# Generate untrained model and optimizer
model2 = TinyCNN(num_classes=100)
optimizer2 = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.04)
criterion2 = torch.nn.CrossEntropyLoss()

# Load the model from the latest checkpoint (a specific file!)
path = checkpoint_dir + "/TinyCNN_epoch_2.pth"
start_epoch = load(path=path, model=model2, optimizer=optimizer2, scheduler=None)

✅ Loaded checkpoint from /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN_epoch_2.pth, resuming at epoch 3


In [9]:
num_epochs = 4
save_every = 2

# Now we can continue training the model (model2 now!) from the last checkpoint (which has been loaded)
# Note again: loading is done by the user in the cell above! It is not done automatically!
# The start_epoch is now 2 (value returned by the load function)
train(
    checkpoint_dir=checkpoint_dir,
    prefix="TinyCNN", # Use the same name as before to continue training!
    start_epoch=start_epoch, # Now start epoch is not 1 (will resume from where it was left)
    num_epochs=num_epochs, # This is not the number of epochs to reach, but how many to do starting from now!
    save_every=save_every,
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,
    model=model2,
    criterion=criterion2,
    optimizer=optimizer2,
    scheduler=None,
    verbose=False,
)

🚀 Epoch 3/6 (50.00%) Completed
	📊 Training Loss: 3.1965
	✅ Training Accuracy: 25.18%
	⏳ Elapsed Time: 5.94s | ETA: 17.81s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.3110
	🎯 Validation Accuracy: 23.37%

🚀 Epoch 4/6 (66.67%) Completed
	📊 Training Loss: 3.1967
	✅ Training Accuracy: 25.18%
	⏳ Elapsed Time: 5.91s | ETA: 11.82s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.3111
	🎯 Validation Accuracy: 23.37%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN_epoch_4.pth

🚀 Epoch 5/6 (83.33%) Completed
	📊 Training Loss: 3.1965
	✅ Training Accuracy: 25.18%
	⏳ Elapsed Time: 5.85s | ETA: 5.85s
	🕒 Completed At: 09:37
🔍 Validation Results:
	📉 Validation Loss: 3.3107
	🎯 Validation Accuracy: 23.37%

🚀 Epoch 6/6 (100.00%) Completed
	📊 Training Loss: 3.1966
	✅ Training Accuracy: 25.18%
	⏳ Elapsed Time: 5.89s | ETA: 0.00s
	🕒 Completed At: 09:38
🔍 Validation Results:
	📉 Validation Loss: 3.3099
	🎯 Validation Accuracy: 23.37%

💾 Save

([3.19649176893521,
  3.1966562527383817,
  3.1965467147314617,
  3.1965918338077617],
 [3.3109634326288875, 3.31106799707626, 3.310722441718982, 3.309893375768448],
 [0.25182, 0.25182, 0.25182, 0.25182],
 [0.2337, 0.2337, 0.2337, 0.2337])