In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fl_g13.config import RAW_DATA_DIR
from torchvision import datasets, transforms

from fl_g13.modeling import train
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

[32m2025-04-18 16:01:33.783[0m | [1mINFO    [0m | [36mfl_g13.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/massimiliano/Projects/fl-g13[0m


### Load data

In [3]:
transform = transforms.Compose([
    transforms.ToTensor()
])
cifar100_train = datasets.CIFAR100(root=RAW_DATA_DIR, train=True, download=True, transform=transform)
cifar100_test = datasets.CIFAR100(root=RAW_DATA_DIR, train=False, download=True, transform=transform)

### Train and save model

In [4]:
class TinyCNN(nn.Module):
    def __init__(self, num_classes=100):
        super(TinyCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.fc1 = nn.Linear(32 * 8 * 8, num_classes)

        # Store configuration for later loading
        # This is a bit of a hack, but we need to store the number of classes
        self._config = {
            "num_classes": num_classes,
            # In fact, the followings could be avoided as from_config loads only the num_classes
            "conv1_out_channels": 16,
            "conv2_out_channels": 32,
            "fc1_in_features": 32 * 8 * 8,
        }

    def forward(self, x):
        x = F.relu(self.conv1(x))     # -> [B, 16, 32, 32]
        x = F.max_pool2d(x, 2)        # -> [B, 16, 16, 16]
        x = F.relu(self.conv2(x))     # -> [B, 32, 16, 16]
        x = F.max_pool2d(x, 2)        # -> [B, 32, 8, 8]
        x = x.view(x.size(0), -1)     # -> [B, 32*8*8]
        x = self.fc1(x)               # -> [B, 100]
        return x

    # Now we need to be careful to define how to load from config
    @classmethod
    def from_config(cls, config):
        return cls(num_classes=config["num_classes"])

In [5]:
checkpoint_dir = "/home/massimiliano/Projects/fl-g13/checkpoints"

# Parameters
batch_size  = 32
start_epoch = 1
num_epochs  = 2
save_every  = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = torch.utils.data.DataLoader(cifar100_train, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(cifar100_test, batch_size=batch_size, shuffle=True)

model = TinyCNN(100)
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.04)
criterion = torch.nn.CrossEntropyLoss()

In [6]:
# This will train the model, using an automatically generated name
_, _, _, _ = train(
    checkpoint_dir=checkpoint_dir,
    name="", # Will automatically generate a name for the model
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,
    criterion=criterion,
    start_epoch=start_epoch,
    num_epochs=num_epochs,
    save_every=save_every,
    model=model,
    optimizer=optimizer,
    scheduler=None,
    verbose=False,
)

No prefix/name for the model was provided, choosen prefix/name: chirpy_charizard_32

🚀 Epoch 1/2 (50.00%) Completed
	📊 Training Loss: 4.0760
	✅ Training Accuracy: 9.07%
	⏳ Elapsed Time: 4.09s | ETA: 4.09s
	🕒 Completed At: 16:01
🔍 Validation Results:
	📉 Validation Loss: 3.7944
	🎯 Validation Accuracy: 14.14%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/chirpy_charizard_32_TinyCNN_epoch_1.pth

🚀 Epoch 2/2 (100.00%) Completed
	📊 Training Loss: 3.6538
	✅ Training Accuracy: 16.70%
	⏳ Elapsed Time: 3.86s | ETA: 0.00s
	🕒 Completed At: 16:01
🔍 Validation Results:
	📉 Validation Loss: 3.5973
	🎯 Validation Accuracy: 17.58%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/chirpy_charizard_32_TinyCNN_epoch_2.pth



In [7]:
# This will train again the same model, but with a custom name
# The epochs will still start from 1
# NOTE: If a checkpoint with the exact same model name, 
# model class and epoch number exists, it will be overwritten!!!
_, _, _, _ = train(
    checkpoint_dir=checkpoint_dir,
    name="ditto", # Setting a name for the model
    start_epoch=start_epoch,
    num_epochs=num_epochs,
    save_every=save_every,
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,
    model=model, # Use the same model as before (partially pre-trained)
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    verbose=False,
)

Prefix/name for the model was provided: ditto

🚀 Epoch 1/2 (50.00%) Completed
	📊 Training Loss: 3.4679
	✅ Training Accuracy: 20.11%
	⏳ Elapsed Time: 4.01s | ETA: 4.01s
	🕒 Completed At: 16:01
🔍 Validation Results:
	📉 Validation Loss: 3.4528
	🎯 Validation Accuracy: 20.69%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/ditto_TinyCNN_epoch_1.pth

🚀 Epoch 2/2 (100.00%) Completed
	📊 Training Loss: 3.3233
	✅ Training Accuracy: 22.62%
	⏳ Elapsed Time: 3.87s | ETA: 0.00s
	🕒 Completed At: 16:01
🔍 Validation Results:
	📉 Validation Loss: 3.3344
	🎯 Validation Accuracy: 22.54%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/ditto_TinyCNN_epoch_2.pth



### Resume training

In [8]:
from fl_g13.modeling import load

# Load the model from the latest checkpoint (a specific file!)
path = f"{checkpoint_dir}/TinyCNN/ditto_TinyCNN_epoch_2.pth"
# Note: make sure to define a _config parameter and a from_config method in the model class
model2, start_epoch = load(path=path, model_class=TinyCNN, device=device, verbose=True)

🔍 Loading checkpoint from /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/ditto_TinyCNN_epoch_2.pth
📦 Model class in checkpoint: TinyCNN
⚙️  Optimizer class in checkpoint: AdamW
🔧 Model configuration: {'num_classes': 100, 'conv1_out_channels': 16, 'conv2_out_channels': 32, 'fc1_in_features': 2048}
✅ Loaded checkpoint from /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/ditto_TinyCNN_epoch_2.pth, resuming at epoch 3


In [None]:
num_epochs = 4
save_every = 2

# Now we can continue training the model (model2 now!) from the last checkpoint (which has been loaded)
# NOTE again: loading is done by the user in the cell above! It is not done automatically!
# The start_epoch is now 2 (value returned by the load function)
_, _, _, _ = train(
    checkpoint_dir=checkpoint_dir,
    name="ditto", # Use the same name as before to continue training!
    start_epoch=start_epoch, # Now start epoch is not 1 (will resume from where it was left)
    num_epochs=num_epochs, # This is not the number of epochs to reach, but how many to do starting from now!
    save_every=save_every,
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,
    model=model2,
    criterion=criterion, # Use the same criterion as before
    optimizer=optimizer, # Use the same optimizer as before
    scheduler=None,
    verbose=False,
)

Prefix/name for the model was provided: ditto

🚀 Epoch 3/6 (50.00%) Completed
	📊 Training Loss: 3.2314
	✅ Training Accuracy: 24.69%
	⏳ Elapsed Time: 6.29s | ETA: 18.87s
	🕒 Completed At: 16:02
🔍 Validation Results:
	📉 Validation Loss: 3.3345
	🎯 Validation Accuracy: 22.54%

🚀 Epoch 4/6 (66.67%) Completed
	📊 Training Loss: 3.2314
	✅ Training Accuracy: 24.69%
	⏳ Elapsed Time: 9.58s | ETA: 19.16s
	🕒 Completed At: 16:02
🔍 Validation Results:
	📉 Validation Loss: 3.3341
	🎯 Validation Accuracy: 22.54%

💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/TinyCNN/ditto_TinyCNN_epoch_4.pth

🚀 Epoch 5/6 (83.33%) Completed
	📊 Training Loss: 3.2316
	✅ Training Accuracy: 24.69%
	⏳ Elapsed Time: 10.15s | ETA: 10.15s
	🕒 Completed At: 16:02
🔍 Validation Results:
	📉 Validation Loss: 3.3345
	🎯 Validation Accuracy: 22.54%

🚀 Epoch 6/6 (100.00%) Completed
	📊 Training Loss: 3.2316
	✅ Training Accuracy: 24.69%
	⏳ Elapsed Time: 10.25s | ETA: 0.00s
	🕒 Completed At: 16:02
🔍 Validation Results:
	