In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
from pathlib import Path

import flwr
import torch
from flwr.simulation import run_simulation
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR

from fl_g13.architectures import BaseDino
from fl_g13.fl_pytorch.client_app import get_client_app
from fl_g13.fl_pytorch.server_app import get_server_app

[32m2025-05-13 11:36:52.087[0m | [1mINFO    [0m | [36mfl_g13.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13[0m


In [4]:
# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Report the device and the library versions this run uses.
print(f"Training on {DEVICE}")
print(f"Flower {flwr.__version__} / PyTorch {torch.__version__}")

Training on cuda
Flower 1.17.0 / PyTorch 2.6.0+cu118


# Log in to Weights & Biases (wandb)

In [5]:
!pip install wandb




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Read the project's .env file. The `dotenv` module imported here is reused by
# the wandb login cell below to look up WANDB_API_KEY.
import dotenv

dotenv.load_dotenv()


True

In [7]:
import wandb

# Look the API key up in the .env file first (the original behaviour) and fall
# back to the process environment, so the notebook also works when the key is
# provided by the shell / CI instead of a .env file.
WANDB_API_KEY = dotenv.dotenv_values().get("WANDB_API_KEY") or os.environ.get("WANDB_API_KEY")
if not WANDB_API_KEY:
    # Fail with an actionable message instead of a bare KeyError.
    raise RuntimeError(
        "WANDB_API_KEY is not set: add it to the .env file or export it in the environment."
    )
wandb.login(key=WANDB_API_KEY)

  return LooseVersion(v) >= LooseVersion(check)
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\ADMIN\_netrc
wandb: Currently logged in as: thanhnv-it23 (stefano-gamba-social-politecnico-di-torino) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

# FL

## Configs

In [8]:
# Debug switch read by the configuration cell below: when True, that cell
# shortens the run by overriding num_rounds and J with small values.
DEBUG = True

In [9]:
# Model config

## Model Hyper-parameters
head_layers = 3
head_hidden_size = 512
dropout_rate = 0.0
unfreeze_blocks = 1

## Training Hyper-parameters
batch_size = 128
lr = 1e-3
momentum = 0.9
weight_decay = 1e-5
T_max = 8
eta_min = 1e-5

# FL config
K = 100  # number of simulated clients (also used as NUM_CLIENTS below)
C = 0.1  # fraction of clients sampled for training in each round
J = 4  # local epochs run by each sampled client
num_rounds = 30
partition_type = 'iid'

## only for partition_type = 'shard'
num_shards_per_partition = 10

## Wandb switch
use_wandb = True

## Debug overrides
# Applied BEFORE wandb_config is built so that the values logged to wandb are
# the ones the run actually uses (previously J was logged as 4 while the debug
# run trained with J = 2).
if DEBUG:
    use_wandb = True  # debug runs are still logged to wandb
    num_rounds = 2
    J = 2

## Server App config
save_every = 1
fraction_fit = C  # Sample of available clients for training
fraction_evaluate = 0.1  # Sample 10% of available clients for evaluation
min_fit_clients = 10  # Never sample less than 10 clients for training
min_evaluate_clients = 5  # Never sample less than 5 clients for evaluation
min_available_clients = 10  # Wait until at least 10 clients are available
device = DEVICE

## checkpoints directory
current_path = Path.cwd()
model_save_path = current_path / f"../models/fl_dino_baseline/{partition_type}"
checkpoint_dir = model_save_path.resolve()
checkpoint_dir.mkdir(parents=True, exist_ok=True)

## Wandb config
wandb_config = {
    # wandb param
    'name': f'FL_Dino_Baseline_{partition_type}',
    'project_name': "FL_test_chart",
    # model config param
    "fraction_fit": fraction_fit,
    "lr": lr,
    "momentum": momentum,
    'partition_type': partition_type,
    'K': K,
    'C': C,
    'J': J,
}

## simulation run config
NUM_CLIENTS = K  # single source of truth for the number of simulated clients
MAX_PARALLEL_CLIENTS = 10


## Define model, optimizer and loss function

In [10]:
# Model: DINO backbone with a classification head, built from the
# hyper-parameters in the configuration cell.
model = BaseDino(
    head_layers=head_layers,
    head_hidden_size=head_hidden_size,
    dropout_rate=dropout_rate,
    unfreeze_blocks=unfreeze_blocks
)
model.to(DEVICE)

# Optimizer: pass weight_decay explicitly; it is declared in the configuration
# cell but was previously never forwarded, so SGD silently used its default.
optimizer = SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay
)

# Loss function
criterion = torch.nn.CrossEntropyLoss()

# Learning-rate schedule: cosine annealing from lr down to eta_min over T_max steps.
scheduler = CosineAnnealingLR(
    optimizer=optimizer,
    T_max=T_max,
    eta_min=eta_min
)

Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_dino_main


## Define the ClientApp

## Build the local module

Install the local `fl_g13` package in editable mode so that the ClientApp can import it.

In [11]:
!pip install -e ..



Obtaining file:///C:/Users/ADMIN/Desktop/BACKUP/study/Italy/polito/classes/20242/deep%20learning/project/source_code/fl-g13









  Installing build dependencies: started









  Installing build dependencies: finished with status 'done'






  Checking if build backend supports build_editable: started

[notice] A new release of pip is available: 25.0.1 -> 25.1.1







  Checking if build backend supports build_editable: finished with status 'done'


[notice] To update, run: python.exe -m pip install --upgrade pip

  Getting requirements to build editable: started





  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: fl_g13
  Building editable for fl_g13 (pyproject.toml): started
  Building editable for fl_g13 (pyproject.toml): finished with status 'done'
  Created wheel for fl_g13: filename=fl_g13-0.0.1-py3-none-any.whl size=4649 sha256=a588eeccbccdbea56dbfe92e55bd612babd81e07366f39e361ccc99a5ffdec08
  Stored in directory: C:\Users\ADMIN\AppData\Local\Temp\pip-ephem-wheel-cache-h6tvvxa0\wheels\b7\e0\6d\5d22ced2ef400b314cfe74883357cc37e1e1d5275e7ba9175e
Successfully built fl_g13
Installing collected packages: fl_g13
  Attempting uninstall: fl_g13


## Create FlowerClient instances  

### Create an instance of ClientApp

In [12]:
# Everything the client app factory receives: the shared training objects
# defined above plus the data-partitioning and local-training settings.
client_settings = {
    "model": model,
    "optimizer": optimizer,
    "criterion": criterion,
    "scheduler": scheduler,
    "device": DEVICE,
    "partition_type": partition_type,
    "num_shards_per_partition": num_shards_per_partition,
    "local_epochs": J,
    "batch_size": batch_size,
    "verbose": 0,
}
client = get_client_app(**client_settings)

# Define the Flower ServerApp

Customize Flower's built-in Federated Averaging (FedAvg) strategy to combine hyperparameters on the server side and to save the model every k epochs.

The strategy also supports incremental training (resuming from a saved checkpoint).

## Create an instance of ServerApp

In [18]:
# Training objects handed to the server app (model_class receives the model
# instance built above).
server_training = dict(
    model_class=model,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
    device=device,
)

# Number of rounds and client-sampling settings from the configuration cell.
server_rounds = dict(
    num_rounds=num_rounds,
    fraction_fit=fraction_fit,
    fraction_evaluate=fraction_evaluate,
    min_fit_clients=min_fit_clients,
    min_evaluate_clients=min_evaluate_clients,
    min_available_clients=min_available_clients,
)

# Checkpointing and wandb logging settings.
server_tracking = dict(
    checkpoint_dir=checkpoint_dir,
    save_every=save_every,
    prefix='fl_baseline',
    use_wandb=use_wandb,
    wandb_config=wandb_config,
)

server = get_server_app(**server_training, **server_rounds, **server_tracking)

🔍 Loading checkpoint from C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_dino_baseline\iid\fl_fl_baseline_BaseDino_epoch_2.pth
📦 Model class in checkpoint: BaseDino
🔧 Model configuration: {'variant': 'dino_vits16', 'dropout_rate': 0.0, 'head_hidden_size': 512, 'head_layers': 3, 'num_classes': 100, 'unfreeze_blocks': 1, 'activation_fn': 'GELU', 'pretrained': True}


Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_dino_main


➡️ Moved model to device: cuda
✅ Loaded checkpoint from C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_dino_baseline\iid\fl_fl_baseline_BaseDino_epoch_2.pth, resuming at epoch 3


# Run the training


In [19]:
# Resources reserved for each simulated client: always one CPU, plus one whole
# GPU when training on CUDA and no GPU otherwise.
# NOTE(review): reserving a full GPU per client presumably makes clients run one
# at a time on a single-GPU machine - confirm against the Flower simulation docs
# if parallel clients are wanted.
gpus_per_client = 1 if DEVICE == "cuda" else 0.0
backend_config = {
    "client_resources": {"num_cpus": 1, "num_gpus": gpus_per_client},
}

### Download missing module for clients

The DINO model, which is serialized and sent to the clients by the server, requires some modules that have to be downloaded from the DINO source code.


In [20]:
import os
import urllib.request


def download_if_not_exists(file_path: str, file_url: str) -> bool:
    """
    Ensure a file exists at the given path, downloading it from a URL when it is missing.

    The download is written to a temporary ``<file_path>.part`` file and only moved
    to ``file_path`` once it has completed, so an interrupted transfer never leaves
    a truncated file that a later call would mistake for a valid one.

    Parameters:
    - file_path (str): The local path to check and save the file.
    - file_url (str): The URL from which to download the file.

    Returns:
    - bool: True if the file is present after the call (it already existed or was
      downloaded successfully), False if the download failed.
    """
    if os.path.exists(file_path):
        print(f"'{file_path}' already exists.")
        return True

    print(f"'{file_path}' not found. Downloading from {file_url}...")
    partial_path = f"{file_path}.part"
    try:
        urllib.request.urlretrieve(file_url, partial_path)
        # Move into place only after the transfer has fully succeeded.
        os.replace(partial_path, file_path)
    except Exception as e:
        # Best effort: report the failure instead of raising, but never leave a
        # half-written file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download file: {e}")
        return False

    print("Download complete.")
    return True

In [21]:
# DINO source files needed next to the notebook, mapped to their download URLs.
dino_module_urls = {
    "vision_transformer.py": "https://raw.githubusercontent.com/facebookresearch/dino/refs/heads/main/vision_transformer.py",
    "utils.py": "https://raw.githubusercontent.com/facebookresearch/dino/refs/heads/main/utils.py",
}

# Fetch each file that is not already present (same order as listed above).
for module_file, module_url in dino_module_urls.items():
    download_if_not_exists(module_file, module_url)


'vision_transformer.py' already exists.
'utils.py' already exists.


In [22]:
# Launch the Flower simulation: NUM_CLIENTS simulated supernodes running the
# client app against the server app, with the per-client resources from backend_config.
run_simulation(
    backend_config=backend_config,
    num_supernodes=NUM_CLIENTS,
    client_app=client,
    server_app=server,
)

[Server] Server on device: cuda:0
[Server] CUDA available in client: True


0,1
centralized_accuracy,▁▄█
centralized_eval_loss,█▂▁
decentralized_avg_eval_accuracy,▁█
decentralized_avg_eval_loss,█▁
decentralized_avg_train_accuracy,▁█
decentralized_avg_train_loss,█▁

0,1
centralized_accuracy,0.0312
centralized_eval_loss,4.52635
decentralized_avg_eval_accuracy,0.038
decentralized_avg_eval_loss,4.52311
decentralized_avg_train_accuracy,0.051
decentralized_avg_train_loss,4.47232


  self.scope.user = {"email": email}  # noqa


[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=2, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters


[Server Eval Round 0] Model device: cuda:0
[Server Eval Round 0] CUDA available in server eval: True


Eval progress: 100%|██████████| 313/313 [00:32<00:00,  9.69batch/s]
[92mINFO [0m:      [Round 0] Centralized Evaluation - Loss: 4.5264, Metrics: {'centralized_accuracy': 0.0312}
[92mINFO [0m:      initial parameters (loss, other metrics): 4.526353092620167, {'centralized_accuracy': 0.0312}
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 100)
(ClientAppActor pid=25380) 2025-05-13 11:46:22.626 | INFO     | fl_g13.config:<module>:11 - PROJ_ROOT path is: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True
(ClientAppActor pid=25380) No prefix/name for the model was provided, choosen prefix/name: groovy_rattata_17
(ClientAppActor pid=25380) 




(ClientAppActor pid=25380) 🚀 Epoch 1/2 (50.00%) Completed
(ClientAppActor pid=25380) 	📊 Training Loss: 4.6042
(ClientAppActor pid=25380) 	✅ Training Accuracy: 4.00%
(ClientAppActor pid=25380) 	⏳ Elapsed Time: 1.80s | ETA: 1.80s
(ClientAppActor pid=25380) 	🕒 Completed At: 11:46
(ClientAppActor pid=25380) 
(ClientAppActor pid=25380) 🚀 Epoch 2/2 (100.00%) Completed
(ClientAppActor pid=25380) 	📊 Training Loss: 4.2007
(ClientAppActor pid=25380) 	✅ Training Accuracy: 6.25%
(ClientAppActor pid=25380) 	⏳ Elapsed Time: 1.37s | ETA: 0.00s
(ClientAppActor pid=25380) 	🕒 Completed At: 11:46
(ClientAppActor pid=25380) 
(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True
(ClientAppActor pid=25380) No prefix/name for the model was provided, choosen prefix/name: loopy_caterpie_56
(ClientAppActor pid=25380) 
(ClientAppActor pid=25380) 🚀 Epoch 1/2 (50.00%) Completed
(ClientAppActor pid=25380) 	📊 Training Loss: 4.4961
(ClientAppAc

[92mINFO [0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO [0m:      [Round 1] Avg Drift: 0.1745 | Relative Drift: 0.0003
[92mINFO [0m:      [Round 1] Saving aggregated model at epoch 3...


💾 Saved checkpoint at: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_dino_baseline\iid\fl_fl_baseline_BaseDino_epoch_3.pth
[Server Eval Round 1] Model device: cuda:0
[Server Eval Round 1] CUDA available in server eval: True


Eval progress: 100%|██████████| 313/313 [00:31<00:00,  9.79batch/s]
[92mINFO [0m:      [Round 1] Centralized Evaluation - Loss: 4.3370, Metrics: {'centralized_accuracy': 0.0589}
[92mINFO [0m:      fit progress: (1, 4.33696732810511, {'centralized_accuracy': 0.0589}, 95.60808470001211)
[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 100)


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  2.79batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.26batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.21batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.34batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.34batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.56batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.30batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.28batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.41batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 2]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 100)
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.34batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True
(ClientAppActor pid=25380) No prefix/name for the model was provided, choosen prefix/name: breezy_pidgey_56
(ClientAppActor pid=25380) 
(ClientAppActor pid=25380) 🚀 Epoch 1/2 (50.00%) Completed
(ClientAppActor pid=25380) 	📊 Training Loss: 4.4020
(ClientAppActor pid=25380) 	✅ Training Accuracy: 6.25%
(ClientAppActor pid=25380) 	⏳ Elapsed Time: 1.31s | ETA: 1.31s
(ClientAppActor pid=25380) 	🕒 Completed At: 11:48
(ClientAppActor pid=25380) 
(ClientAppActor pid=25380) 🚀 Epoch 2/2 (100.00%) Completed
(ClientAppActor pid=25380) 	📊 Training Loss: 4.2122
(ClientAppActor pid=25380) 	✅ Training Accuracy: 9.25%
(ClientAppActor pid=25380) 	⏳ Elapsed Time: 1.26s | ETA: 0.00s
(ClientAppActor pid=25380) 	🕒 Completed At: 11:48
(ClientAppActor pid=25380) 
(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: 

[92mINFO [0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO [0m:      [Round 2] Avg Drift: 0.1834 | Relative Drift: 0.0004
[92mINFO [0m:      [Round 2] Saving aggregated model at epoch 4...


💾 Saved checkpoint at: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_dino_baseline\iid\fl_fl_baseline_BaseDino_epoch_4.pth
[Server Eval Round 2] Model device: cuda:0
[Server Eval Round 2] CUDA available in server eval: True


Eval progress: 100%|██████████| 313/313 [00:32<00:00,  9.49batch/s]
[92mINFO [0m:      [Round 2] Centralized Evaluation - Loss: 4.1277, Metrics: {'centralized_accuracy': 0.0899}
[92mINFO [0m:      fit progress: (2, 4.127712368584288, {'centralized_accuracy': 0.0899}, 189.99000480002724)
[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 100)


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  2.96batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.16batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.13batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.44batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.24batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.16batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.06batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.09batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.08batch/s]


(ClientAppActor pid=25380) [Client] Client on device: cuda:0
(ClientAppActor pid=25380) [Client] CUDA available in client: True


Eval progress:   0%|          | 0/1 [00:00<?, ?batch/s]
[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
(ClientAppActor pid=25380)[92mINFO [0m:      [SUMMARY]
Eval progress: 100%|██████████| 1/1 [00:00<00:00,  3.10batch/s][92mINFO [0m:      Run finished 2 round(s) in 201.32s

[92mINFO [0m:      	History (loss, distributed):
[92mINFO [0m:      		round 1: 4.357826566696167
[92mINFO [0m:      		round 2: 4.157745313644409
[92mINFO [0m:      	History (loss, centralized):
[92mINFO [0m:      		round 0: 4.526353092620167
[92mINFO [0m:      		round 1: 4.33696732810511
[92mINFO [0m:      		round 2: 4.127712368584288
[92mINFO [0m:      	History (metrics, distributed, fit):
[92mINFO [0m:      	{'avg_drift': [(1, 0.17454500049352645), (2, 0.18344796150922776)],
[92mINFO [0m:      	 'avg_train_loss': [(1, 4.276829260587692), (2, 4.0683279693126675)]}
[92mINFO [0m:      	History (metrics, distributed, evaluate):
[92mINFO [0