In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project code on disk are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import CosineAnnealingLR

import flwr
from flwr.simulation import run_simulation

from fl_g13.architectures import BaseDino
from fl_g13.editing.sparseSGDM import SparseSGDM
from fl_g13.fl_pytorch import get_client_app, get_server_app
from fl_g13.fl_pytorch import build_fl_dependencies

# Record the framework versions next to the results so a run can be traced back
# to the environment that produced it.
print("Flower {} / PyTorch {}".format(flwr.__version__, torch.__version__))

#! Always keep this call: it fetches the Dino dependencies the clients need.
build_fl_dependencies()

[32m2025-05-15 16:49:57.423[0m | [1mINFO    [0m | [36mfl_g13.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/massimiliano/Projects/fl-g13[0m


Flower 1.17.0 / PyTorch 2.6.0+cu124
'vision_transformer.py' already exists.
'utils.py' already exists.


In [3]:
# Settings
# Directory handed to the server app for checkpoints (see `get_server_app`).
# The location is machine-specific, so it can be overridden by exporting
# FL_G13_CHECKPOINT_DIR before starting the kernel; when the variable is unset
# or empty the original path is used, so existing runs behave exactly as before.
CHECKPOINT_DIR = (
    os.environ.get("FL_G13_CHECKPOINT_DIR")
    or "/home/massimiliano/Projects/fl-g13/checkpoints"
)

# Model hyper-parameters (forwarded unchanged to the `BaseDino` constructor)
head_layers = 3
head_hidden_size = 512
dropout_rate = 0.0
unfreeze_blocks = 1

# Training hyper-parameters
# starting_lr / momentum / weight_decay configure the SparseSGDM optimizer;
# T_max / eta_min configure the CosineAnnealingLR scheduler.
starting_lr = 1e-3
momentum = 0.9
weight_decay = 1e-5
T_max = 8
eta_min = 1e-5

# Federated Training setting
batch_size = 64  # Batch size for training #! Let's stick to 64 to make training fit also on RTX 3070
local_epochs = 2  # Number of local epochs per client
number_of_rounds = 5  # Total number of federated learning rounds
fraction_fit = 1  # Fraction of clients participating in training per round
fraction_evaluate = 0.1  # Fraction of clients participating in evaluation per round
number_of_clients = 3  # Total number of clients in the simulation
min_num_clients = 2  # Minimum number of clients required for training and evaluation
partition_type = "iid"  # Partitioning strategy for the dataset (e.g., "iid" or "shard")
num_shards_per_partition = 6  # Number of shards per partition (used when partition_type is "shard")
use_wandb = False  # Whether to use Weights & Biases (wandb) for experiment tracking (#!TODO, double check it works)

# Fail fast on an impossible setup: a round cannot require more clients than
# the simulation creates.
assert 0 < min_num_clients <= number_of_clients, (
    "min_num_clients must be between 1 and number_of_clients"
)

# Device settings
device = "cuda" if torch.cuda.is_available() else "cpu"

# Resources reserved for every simulated client. On CPU no GPU is requested;
# on GPU an entire GPU is assigned to each client.
# Refer to Flower framework documentation for more details about Flower simulations
# and how to set up the `backend_config`
backend_config = {
    "client_resources": {
        "num_cpus": 1,
        "num_gpus": 1 if device == "cuda" else 0,
    }
}

print(f"Training on {device}")

Training on cuda


In [4]:
# Model
# Build the BaseDino model from the model hyper-parameters in the settings cell
# and move it to the selected device.
model = BaseDino(
    head_layers=head_layers,
    head_hidden_size=head_hidden_size,
    dropout_rate=dropout_rate,
    unfreeze_blocks=unfreeze_blocks,
)
model.to(device)

# One all-ones mask tensor per model parameter. `ones_like` inherits each
# parameter's shape, dtype and device, so this must be done AFTER the model is
# moved to CUDA or the masks would live on a different device than the weights.
# NOTE(review): presumably an all-ones mask leaves every parameter updatable
# (no sparsification) — confirm against SparseSGDM.
mask = [torch.ones_like(p) for p in model.parameters()]
optimizer = SparseSGDM(
    model.parameters(),
    mask=mask,
    lr=starting_lr,
    momentum=momentum,
    weight_decay=weight_decay,
)
scheduler = CosineAnnealingLR(
    optimizer=optimizer,
    T_max=T_max,
    eta_min=eta_min,
)
criterion = CrossEntropyLoss()

# Client side: the same model/criterion/optimizer/scheduler objects are handed
# to the client app together with the data-partitioning and local-training settings.
client_app = get_client_app(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    partition_type=partition_type,
    batch_size=batch_size,
    num_shards_per_partition=num_shards_per_partition,
    local_epochs=local_epochs,
    model_editing=False,
)

# Server side: receives the model class and its config rather than the model
# instance, plus the checkpointing and client-sampling settings.
server_app = get_server_app(
    checkpoint_dir=CHECKPOINT_DIR,
    prefix='aron',  #! Introduced: you are forced to pass this to avoid overwrites; if you pass an already used name it will load the most recent checkpoint
    model_class=model.__class__,
    model_config=model.get_config(),
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
    device=device,
    save_every=1,
    save_with_model_dir=False,  #! Introduced: will save under {checkpoint path provided}/BaseDino/ dir if set to True
    num_rounds=number_of_rounds,
    fraction_fit=fraction_fit,
    fraction_evaluate=fraction_evaluate,
    min_fit_clients=min_num_clients,
    min_evaluate_clients=min_num_clients,
    min_available_clients=number_of_clients,
    # Honour the `use_wandb` switch from the settings cell instead of a literal
    # False, so toggling it there actually takes effect.
    use_wandb=use_wandb,
    # NOTE(review): wandb_config is still None — check what get_server_app
    # expects here before enabling use_wandb.
    wandb_config=None,
)

Using cache found in /home/massimiliano/.cache/torch/hub/facebookresearch_dino_main


⚠️ No checkpoint found at /home/massimiliano/Projects/fl-g13/checkpoints. Creating a new model.


Using cache found in /home/massimiliano/.cache/torch/hub/facebookresearch_dino_main


In [None]:
# Start the Flower simulation with the server and client apps built above.
run_simulation(
    server_app=server_app,
    client_app=client_app,
    backend_config=backend_config,  # per-client CPU/GPU reservation from the settings cell
    num_supernodes=number_of_clients,  # number of supernodes = number of configured clients
)

[Server] Server on device: cuda:0
[Server] CUDA available in client: True


[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=5, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters


Using strategy 'CustomFedAvg' (default option)
[Server Eval Round 0] Model device: cuda:0
[Server Eval Round 0] CUDA available in server eval: True


Eval progress: 100%|██████████| 313/313 [00:22<00:00, 13.69batch/s]
[92mINFO [0m:      [Round 0] Centralized Evaluation - Loss: 6.5607, Metrics: {'centralized_accuracy': 0.0094}
[92mINFO [0m:      initial parameters (loss, other metrics): 6.560712768627813, {'centralized_accuracy': 0.0094}
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 3)
[36m(ClientAppActor pid=86603)[0m 2025-05-15 16:50:24.786 | INFO     | fl_g13.config:<module>:11 - PROJ_ROOT path is: /home/massimiliano/Projects/fl-g13


[36m(ClientAppActor pid=86603)[0m [Client] Client on device: cuda:0
[36m(ClientAppActor pid=86603)[0m [Client] CUDA available in client: True
[36m(ClientAppActor pid=86603)[0m No prefix/name for the model was provided, choosen prefix/name: quirky_pidgey_43
[36m(ClientAppActor pid=86603)[0m 




[36m(ClientAppActor pid=86603)[0m 🚀 Epoch 1/2 (50.00%) Completed
[36m(ClientAppActor pid=86603)[0m 	📊 Training Loss: 2.7556
[36m(ClientAppActor pid=86603)[0m 	✅ Training Accuracy: 34.44%
[36m(ClientAppActor pid=86603)[0m 	⏳ Elapsed Time: 36.72s | ETA: 36.72s
[36m(ClientAppActor pid=86603)[0m 	🕒 Completed At: 16:51
[36m(ClientAppActor pid=86603)[0m 
[36m(ClientAppActor pid=86603)[0m 🚀 Epoch 2/2 (100.00%) Completed
[36m(ClientAppActor pid=86603)[0m 	📊 Training Loss: 1.2625
[36m(ClientAppActor pid=86603)[0m 	✅ Training Accuracy: 63.92%
[36m(ClientAppActor pid=86603)[0m 	⏳ Elapsed Time: 37.46s | ETA: 0.00s
[36m(ClientAppActor pid=86603)[0m 	🕒 Completed At: 16:51
[36m(ClientAppActor pid=86603)[0m 
[36m(ClientAppActor pid=86603)[0m [Client] Client on device: cuda:0
[36m(ClientAppActor pid=86603)[0m [Client] CUDA available in client: True
[36m(ClientAppActor pid=86603)[0m No prefix/name for the model was provided, choosen prefix/name: funky_caterpie_29
[36m(Clie

[92mINFO [0m:      aggregate_fit: received 3 results and 0 failures


[36m(ClientAppActor pid=86603)[0m 🚀 Epoch 2/2 (100.00%) Completed
[36m(ClientAppActor pid=86603)[0m 	📊 Training Loss: 1.2510
[36m(ClientAppActor pid=86603)[0m 	✅ Training Accuracy: 64.41%
[36m(ClientAppActor pid=86603)[0m 	⏳ Elapsed Time: 36.80s | ETA: 0.00s
[36m(ClientAppActor pid=86603)[0m 	🕒 Completed At: 16:54
[36m(ClientAppActor pid=86603)[0m 


[92mINFO [0m:      [Round 1] Avg Drift: 3.3525 | Relative Drift: 0.0065
[92mINFO [0m:      [Round 1] Saving aggregated model at epoch 1...


💾 Saved checkpoint at: /home/massimiliano/Projects/fl-g13/checkpoints/fl_aron_BaseDino_epoch_1.pth
[Server Eval Round 1] Model device: cuda:0
[Server Eval Round 1] CUDA available in server eval: True


Eval progress: 100%|██████████| 313/313 [00:22<00:00, 13.71batch/s]
[92mINFO [0m:      [Round 1] Centralized Evaluation - Loss: 1.2075, Metrics: {'centralized_accuracy': 0.6586}
[92mINFO [0m:      fit progress: (1, 1.2075117809323077, {'centralized_accuracy': 0.6586}, 252.56478277600036)
[92mINFO [0m:      configure_evaluate: strategy sampled 2 clients (out of 3)


[36m(ClientAppActor pid=86603)[0m [Client] Client on device: cuda:0
[36m(ClientAppActor pid=86603)[0m [Client] CUDA available in client: True


Eval progress:   0%|          | 0/53 [00:00<?, ?batch/s]
Eval progress:   2%|▏         | 1/53 [00:00<00:08,  6.01batch/s]
Eval progress:   4%|▍         | 2/53 [00:00<00:08,  5.89batch/s]
Eval progress:   6%|▌         | 3/53 [00:00<00:08,  5.98batch/s]
Eval progress:   8%|▊         | 4/53 [00:00<00:08,  6.02batch/s]
Eval progress:   9%|▉         | 5/53 [00:00<00:07,  6.05batch/s]
Eval progress:  11%|█▏        | 6/53 [00:00<00:07,  6.05batch/s]
Eval progress:  13%|█▎        | 7/53 [00:01<00:07,  6.07batch/s]
Eval progress:  15%|█▌        | 8/53 [00:01<00:07,  6.10batch/s]
Eval progress:  17%|█▋        | 9/53 [00:01<00:07,  6.12batch/s]
Eval progress:  19%|█▉        | 10/53 [00:01<00:07,  6.10batch/s]
Eval progress:  21%|██        | 11/53 [00:01<00:06,  6.09batch/s]
Eval progress:  23%|██▎       | 12/53 [00:01<00:06,  6.13batch/s]
Eval progress:  25%|██▍       | 13/53 [00:02<00:06,  6.13batch/s]
Eval progress:  26%|██▋       | 14/53 [00:02<00:06,  6.13batch/s]
Eval progress:  28%|██▊     

[36m(ClientAppActor pid=86603)[0m [Client] Client on device: cuda:0
[36m(ClientAppActor pid=86603)[0m [Client] CUDA available in client: True


Eval progress:   2%|▏         | 1/53 [00:00<00:08,  6.10batch/s]
Eval progress:   4%|▍         | 2/53 [00:00<00:08,  6.15batch/s]
Eval progress:   6%|▌         | 3/53 [00:00<00:08,  6.07batch/s]
Eval progress:   8%|▊         | 4/53 [00:00<00:08,  6.12batch/s]
Eval progress:   9%|▉         | 5/53 [00:00<00:07,  6.15batch/s]
Eval progress:  11%|█▏        | 6/53 [00:00<00:07,  6.18batch/s]
Eval progress:  13%|█▎        | 7/53 [00:01<00:07,  6.16batch/s]
Eval progress:  15%|█▌        | 8/53 [00:01<00:07,  6.10batch/s]
Eval progress:  17%|█▋        | 9/53 [00:01<00:07,  6.11batch/s]
Eval progress:  19%|█▉        | 10/53 [00:01<00:07,  6.09batch/s]
Eval progress:  21%|██        | 11/53 [00:01<00:06,  6.06batch/s]
Eval progress:  23%|██▎       | 12/53 [00:01<00:06,  6.08batch/s]
Eval progress:  25%|██▍       | 13/53 [00:02<00:06,  6.08batch/s]
Eval progress:  26%|██▋       | 14/53 [00:02<00:06,  5.96batch/s]
Eval progress:  28%|██▊       | 15/53 [00:02<00:06,  5.93batch/s]
Eval progress:  30%

[36m(ClientAppActor pid=86603)[0m [Client] Client on device: cuda:0
[36m(ClientAppActor pid=86603)[0m [Client] CUDA available in client: True
[36m(ClientAppActor pid=86603)[0m No prefix/name for the model was provided, choosen prefix/name: silly_nidoran_21
[36m(ClientAppActor pid=86603)[0m 
