In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from pathlib import Path

import flwr
import torch
from flwr.common import Context
from flwr.simulation import run_simulation
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import Compose, Resize, CenterCrop, RandomCrop, RandomHorizontalFlip, Normalize, ToTensor

from fl_g13 import dataset as dataset_handler
from fl_g13.architectures import BaseDino
from fl_g13.config import RAW_DATA_DIR
from fl_g13.dataset import train_test_split
from fl_g13.fl_pytorch.server_app import get_server_app

[32m2025-05-03 09:43:41.654[0m | [1mINFO    [0m | [36mfl_g13.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13[0m


In [4]:
# Prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU.
# (Assign DEVICE = "cpu" by hand to force CPU execution.)
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Report the selected device together with the framework versions in use.
print(f"Training on {DEVICE}")
print(f"Flower {flwr.__version__} / PyTorch {torch.__version__}")

Training on cuda
Flower 1.17.0 / PyTorch 2.6.0+cu118


# Load data

In [5]:
# Define preprocessing pipelines.
# Training pipeline: upscale, then apply a random crop and a random horizontal flip.
train_transform = Compose([
    Resize(256),  # CIFAR-100 is originally 32x32
    RandomCrop(224),  # But DINO works on 224x224
    RandomHorizontalFlip(),
    ToTensor(),
    # Per-channel mean/std — presumably the CIFAR-100 statistics; TODO confirm
    Normalize(mean=[0.5071, 0.4866, 0.4409], std=[0.2673, 0.2564, 0.2762]),
])

# Evaluation pipeline: same resize and normalization, but a center crop and no flip.
eval_transform = Compose([
    Resize(256),  # CIFAR-100 is originally 32x32
    CenterCrop(224),  # But DINO works on 224x224
    ToTensor(),
    Normalize(mean=[0.5071, 0.4866, 0.4409], std=[0.2673, 0.2564, 0.2762]),
])

# Download (if needed) the CIFAR-100 train and test splits into RAW_DATA_DIR.
cifar100_train = datasets.CIFAR100(root=RAW_DATA_DIR, train=True, download=True, transform=train_transform)
cifar100_test = datasets.CIFAR100(root=RAW_DATA_DIR, train=False, download=True, transform=eval_transform)

# Split the training set; the printed sizes below (40000 / 10000) match an 80/20 split.
# NOTE(review): random_state=None presumably makes the split differ between runs — confirm.
# NOTE(review): val_dataset is carved out of cifar100_train, so it presumably keeps
# train_transform (random crop/flip) rather than eval_transform — confirm.
train_dataset, val_dataset = train_test_split(cifar100_train, 0.8, random_state=None)
test_dataset = cifar100_test

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 40000
Validation dataset size: 10000
Test dataset size: 10000


In [6]:
# Centralized dataloaders over the full train / validation / test datasets.
# Only the training loader shuffles its samples.
BATCH_SIZE = 128
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
# I.I.D. sharding split
# k is the number of shards requested from iid_sharding; each simulated client
# later picks its shard by partition id.
k = 10
clients_dataset_train = dataset_handler.iid_sharding(train_dataset, k)
clients_dataset_val = dataset_handler.iid_sharding(val_dataset, k)

In [8]:
# One DataLoader per client shard.
# Training shards are reshuffled every epoch. Validation shards are read in a fixed
# order (shuffle=False), consistent with the centralized `val_dataloader`, so that
# evaluation is deterministic.
clients_dataloader_train = [DataLoader(d, batch_size=BATCH_SIZE, shuffle=True) for d in clients_dataset_train]
clients_dataloader_val = [DataLoader(d, batch_size=BATCH_SIZE, shuffle=False) for d in clients_dataset_val]

## Model

## Init model, optimizer and loss function

In [9]:
# Hyper-parameters
LR = 1e-2  # initial learning rate handed to SGD

# Model: instantiate BaseDino with its default arguments and move it to the selected device
model = BaseDino()
model.to(DEVICE)
print(f"Model: {model}")

# Optimizer, scheduler, and loss function
# These objects are later passed to both the client app and the server app.
optimizer = SGD(model.parameters(), lr=LR)
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=8,  # First restart after 8 epochs
    T_mult=2,  # Double the interval between restarts each time
    eta_min=1e-5  # Minimum learning rate after annealing
)
criterion = CrossEntropyLoss()

Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_dino_main


Model: BaseDino(
  (net): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (attn_drop): Dropout(p=0.1, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.1, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (norm): Layer

# Define the ClientApp

## Install the local package

Install the local `fl_g13` package in editable mode so that the ClientApp can import it.

In [10]:
!pip install -e ..

Obtaining file:///C:/Users/ADMIN/Desktop/BACKUP/study/Italy/polito/classes/20242/deep%20learning/project/source_code/fl-g13
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: fl_g13
  Building editable for fl_g13 (pyproject.toml): started
  Building editable for fl_g13 (pyproject.toml): finished with status 'done'
  Created wheel for fl_g13: filename=fl_g13-0.0.1-py3-none-any.whl size=4649 sha256=a588eeccbccdbea56dbfe92e55bd612babd81e07366f39e361ccc99a5ffdec08
  Stored in directory: C:\Users\ADMIN\AppData\Loca


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Create FlowerClient instances

In [11]:
def load_data_client(context: Context, **kwargs):
    """
    Returns the train and validation dataloaders of the client described by `context`.

    This simulates the distribution of the data across clients: every simulated client
    is handed one of the pre-built shards. In a real deployment each client would have
    its own dataset instead.

    Parameters:
    - context (Context): The Flower context of the client;
      `context.node_config["partition-id"]` selects the shard.
    - **kwargs: Accepted for interface compatibility and ignored.

    Returns:
    - tuple: `(clients_dataloader_train[partition_id], clients_dataloader_val[partition_id])`.
    """
    partition_id = context.node_config["partition-id"]
    print(f"Client {partition_id} is ready to train")
    return clients_dataloader_train[partition_id], clients_dataloader_val[partition_id]

### Create an instance of ClientApp

In [12]:
from fl_g13.fl_pytorch.client_app import get_client_app

# Value forwarded to the client app as `local_epochs`.
local_epochs = 2

# Build the ClientApp from the data-loading callback and the shared
# model / optimizer / criterion objects defined earlier in the notebook.
client = get_client_app(load_data_fn=load_data_client,
                        model=model, optimizer=optimizer, criterion=criterion,
                        device=DEVICE,
                        local_epochs=local_epochs
                        )

# Define the Flower ServerApp

Customize Flower's built-in Federated Averaging (FedAvg) strategy to combine hyperparameters on the server side and to save the model every k epochs.

The strategy also supports incremental training.

## Create an instance of ServerApp

In [13]:
def get_datatest_fn(context: Context):
    """Returns the centralized test dataloader; `context` is accepted but not used."""
    return test_dataloader


## checkpoints directory
# Resolve the path once at assignment. A bare `model_test_path.resolve()` statement
# returns a new Path and leaves the variable untouched, so it had no effect.
current_path = Path.cwd()
model_test_path = (current_path / "../models/fl_baseline").resolve()

num_rounds = 2
save_every = 1
fraction_fit = 1.0  # Sample 100% of available clients for training
fraction_evaluate = 0.5  # Sample 50% of available clients for evaluation
min_fit_clients = 10  # Never sample less than 10 clients for training
min_evaluate_clients = 5  # Never sample less than 5 clients for evaluation
min_available_clients = 10  # Wait until all 10 clients are available
device = DEVICE
use_wandb = False

# Build the ServerApp; checkpoints are read from and written to `model_test_path`.
server = get_server_app(checkpoint_dir=model_test_path,
                        model_class=BaseDino,
                        optimizer=optimizer,
                        criterion=criterion,
                        scheduler=scheduler,
                        get_datatest_fn=get_datatest_fn,
                        num_rounds=num_rounds,
                        fraction_fit=fraction_fit,
                        fraction_evaluate=fraction_evaluate,
                        min_fit_clients=min_fit_clients,
                        min_evaluate_clients=min_evaluate_clients,
                        min_available_clients=min_available_clients,
                        device=device,
                        use_wandb=use_wandb,
                        save_every=save_every
                        )

🔍 Loading checkpoint from C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_baseline\FL_BaseDino_epoch_1.pth
📦 Model class in checkpoint: BaseDino
🔧 Model configuration: {'variant': 'dino_vits16', 'dropout_rate': 0.1, 'head_hidden_size': 1024, 'head_layers': 5, 'num_classes': 100, 'unfreeze_blocks': 3, 'activation_fn': 'GELU', 'pretrained': True}


Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_dino_main
Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_dino_main


✅ Loaded checkpoint from C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_baseline\FL_BaseDino_epoch_1.pth, resuming at epoch 2


# Run the training


In [14]:
# Resources reserved for every simulated client.
# Each client always gets one CPU. On a CUDA machine it also gets one whole GPU;
# otherwise it gets no GPU at all.
# See the Flower simulation documentation for the full set of `backend_config` options.
backend_config = {
    "client_resources": {
        "num_cpus": 1,
        "num_gpus": 1 if DEVICE == "cuda" else 0.0,
    }
}

### Download missing modules for clients

The DINO model, which is serialized and sent to the clients by the server, requires some modules that have to be downloaded from the DINO source code.


In [15]:
import os
import urllib.request


def download_if_not_exists(file_path: str, file_url: str):
    """
    Checks if a file exists at the given path. If it does not, downloads it from the specified URL.

    The download is written to a temporary `<file_path>.part` file and only renamed to
    `file_path` once it has completed. A failed or interrupted download therefore never
    leaves a truncated file behind that a later call would mistake for a valid one.
    Download errors are reported on stdout and not re-raised (best-effort behavior).

    Parameters:
    - file_path (str): The local path to check and save the file.
    - file_url (str): The URL from which to download the file.

    Returns:
    - None
    """
    if os.path.exists(file_path):
        print(f"'{file_path}' already exists.")
        return

    print(f"'{file_path}' not found. Downloading from {file_url}...")
    partial_path = f"{file_path}.part"
    try:
        urllib.request.urlretrieve(file_url, partial_path)
        # Atomic rename: file_path only appears once the content is complete.
        os.replace(partial_path, file_path)
        print("Download complete.")
    except Exception as e:
        print(f"Failed to download file: {e}")
    finally:
        # Drop whatever a failed or interrupted download left behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [16]:
# Fetch the DINO source modules into the current working directory (relative paths),
# skipping any file that is already present.
download_if_not_exists("vision_transformer.py",
                       "https://raw.githubusercontent.com/facebookresearch/dino/refs/heads/main/vision_transformer.py")
download_if_not_exists("utils.py",
                       "https://raw.githubusercontent.com/facebookresearch/dino/refs/heads/main/utils.py")


'vision_transformer.py' already exists.
'utils.py' already exists.


In [17]:
# One supernode per data shard: load_data_client indexes the per-client dataloader
# lists by partition id, so the number of simulated clients must match the number
# of shards. Deriving it avoids a second hard-coded 10 drifting out of sync.
NUM_CLIENTS = len(clients_dataloader_train)

In [18]:
# Run simulation
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_CLIENTS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=2, no round_timeout
[92mINFO [0m:      


Continue train model from epoch 2


[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      ROUND 0💡 New best global model found: 0.009400
[92mINFO [0m:      initial parameters (loss, other metrics): 5.01113680948185, {'centralized_accuracy': 0.0094}
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)
[36m(ClientAppActor pid=24912)[0m 2025-04-20 22:52:12.922 | INFO     | fl_g13.config:<module>:11 - PROJ_ROOT path is: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13


[36m(ClientAppActor pid=24912)[0m Client 0 is ready to train
[36m(ClientAppActor pid=24912)[0m No prefix/name for the model was provided, choosen prefix/name: bouncy_ivysaur_68
[36m(ClientAppActor pid=24912)[0m 
[36m(ClientAppActor pid=24912)[0m 🚀 Epoch 1/3 (33.33%) Completed
[36m(ClientAppActor pid=24912)[0m 	📊 Training Loss: 5.1139
[36m(ClientAppActor pid=24912)[0m 	✅ Training Accuracy: 1.07%
[36m(ClientAppActor pid=24912)[0m 	⏳ Elapsed Time: 16.03s | ETA: 32.07s
[36m(ClientAppActor pid=24912)[0m 	🕒 Completed At: 22:52
[36m(ClientAppActor pid=24912)[0m 
[36m(ClientAppActor pid=24912)[0m 🚀 Epoch 2/3 (66.67%) Completed
[36m(ClientAppActor pid=24912)[0m 	📊 Training Loss: 5.0673
[36m(ClientAppActor pid=24912)[0m 	✅ Training Accuracy: 1.00%
[36m(ClientAppActor pid=24912)[0m 	⏳ Elapsed Time: 15.22s | ETA: 15.22s
[36m(ClientAppActor pid=24912)[0m 	🕒 Completed At: 22:52
[36m(ClientAppActor pid=24912)[0m 
[36m(ClientAppActor pid=24912)[0m 🚀 Epoch 3/3 (100.00%) 

[92mINFO [0m:      aggregate_fit: received 10 results and 0 failures


Saving centralized model epoch 2 aggregated_parameters...
💾 Saved checkpoint at: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_baseline\FL_BaseDino_epoch_2.pth


[92mINFO [0m:      ROUND 1💡 New best global model found: 0.011000
[92mINFO [0m:      fit progress: (1, 4.689738472805748, {'centralized_accuracy': 0.011}, 786.1971884000013)
[92mINFO [0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


[36m(ClientAppActor pid=24912)[0m Client 1 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 2 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 3 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 6 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 7 is ready to train


[92mINFO [0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 2]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=24912)[0m Client 0 is ready to train
[36m(ClientAppActor pid=24912)[0m No prefix/name for the model was provided, choosen prefix/name: grumpy_venusaur_71
[36m(ClientAppActor pid=24912)[0m 
[36m(ClientAppActor pid=24912)[0m 🚀 Epoch 1/3 (33.33%) Completed
[36m(ClientAppActor pid=24912)[0m 	📊 Training Loss: 4.8740
[36m(ClientAppActor pid=24912)[0m 	✅ Training Accuracy: 1.25%
[36m(ClientAppActor pid=24912)[0m 	⏳ Elapsed Time: 15.32s | ETA: 30.65s
[36m(ClientAppActor pid=24912)[0m 	🕒 Completed At: 23:05
[36m(ClientAppActor pid=24912)[0m 
[36m(ClientAppActor pid=24912)[0m 🚀 Epoch 2/3 (66.67%) Completed
[36m(ClientAppActor pid=24912)[0m 	📊 Training Loss: 4.8465
[36m(ClientAppActor pid=24912)[0m 	✅ Training Accuracy: 1.25%
[36m(ClientAppActor pid=24912)[0m 	⏳ Elapsed Time: 15.31s | ETA: 15.31s
[36m(ClientAppActor pid=24912)[0m 	🕒 Completed At: 23:06
[36m(ClientAppActor pid=24912)[0m 
[36m(ClientAppActor pid=24912)[0m 🚀 Epoch 3/3 (100.00%)

[92mINFO [0m:      aggregate_fit: received 10 results and 0 failures


Saving centralized model epoch 3 aggregated_parameters...
💾 Saved checkpoint at: C:\Users\ADMIN\Desktop\BACKUP\study\Italy\polito\classes\20242\deep learning\project\source_code\fl-g13\models\fl_baseline\FL_BaseDino_epoch_3.pth


[92mINFO [0m:      ROUND 2💡 New best global model found: 0.018000
[92mINFO [0m:      fit progress: (2, 4.599909758266015, {'centralized_accuracy': 0.018}, 1594.3396579)
[92mINFO [0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


[36m(ClientAppActor pid=24912)[0m Client 3 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 4 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 5 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 7 is ready to train
[36m(ClientAppActor pid=24912)[0m Client 8 is ready to train


[92mINFO [0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [SUMMARY]
[92mINFO [0m:      Run finished 2 round(s) in 1618.09s
[92mINFO [0m:      	History (loss, distributed):
[92mINFO [0m:      		round 1: 5.153822147846222
[92mINFO [0m:      		round 2: 4.96987875699997
[92mINFO [0m:      	History (loss, centralized):
[92mINFO [0m:      		round 0: 5.01113680948185
[92mINFO [0m:      		round 1: 4.689738472805748
[92mINFO [0m:      		round 2: 4.599909758266015
[92mINFO [0m:      	History (metrics, distributed, evaluate):
[92mINFO [0m:      	{'federated_evaluate_accuracy': [(1, 0.0086), (2, 0.0116)]}
[92mINFO [0m:      	History (metrics, centralized):
[92mINFO [0m:      	{'centralized_accuracy': [(0, 0.0094), (1, 0.011), (2, 0.018)]}
[92mINFO [0m:      
