# Use a federated learning strategy
Welcome Back! to the Flower federated learning tutorial!

In this notebook, we’ll begin to customize the federated learning system we built in the introductory notebook again, using the Flower framework, Flower Datasets, and PyTorch.

## Step 0: Preparation

### Loading dependencies

In [1]:
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

import flwr
from flwr.client import Client, ClientApp, NumPyClient
from flwr.server import ServerApp, ServerConfig, ServerAppComponents
from flwr.server.strategy import FedAvg, FedAdagrad
from flwr.simulation import run_simulation
from flwr_datasets import FederatedDataset
from flwr.common import ndarrays_to_parameters, NDArrays, Scalar, Context

DEVICE = torch.device("cuda")  # Try "cuda" to train on GPU
print(f"Training on {DEVICE}")
print(f"Flower {flwr.__version__} / PyTorch {torch.__version__}")

  from .autonotebook import tqdm as notebook_tqdm
2025-08-12 15:26:54,257	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Training on cuda
Flower 1.20.0 / PyTorch 2.8.0+cu128


### Loading Data

In [2]:
NUM_PARTITIONS = 10
BATCH_SIZE = 32


def load_datasets(partition_id: int, num_partitions: int):
    fds = FederatedDataset(dataset="cifar10", partitioners={"train": num_partitions})
    partition = fds.load_partition(partition_id)
    # Divide data on each node: 80% train, 20% test
    partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
    pytorch_transforms = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    def apply_transforms(batch):
        # Instead of passing transforms to CIFAR10(..., transform=transform)
        # we will use this function to dataset.with_transform(apply_transforms)
        # The transforms object is exactly the same
        batch["img"] = [pytorch_transforms(img) for img in batch["img"]]
        return batch

    partition_train_test = partition_train_test.with_transform(apply_transforms)
    trainloader = DataLoader(
        partition_train_test["train"], batch_size=BATCH_SIZE, shuffle=True
    )
    valloader = DataLoader(partition_train_test["test"], batch_size=BATCH_SIZE)
    testset = fds.load_split("test").with_transform(apply_transforms)
    testloader = DataLoader(testset, batch_size=BATCH_SIZE)
    return trainloader, valloader, testloader

### Model training/evaluation

In [3]:
class Net(nn.Module):
    def __init__(self) -> None:
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def get_parameters(net) -> List[np.ndarray]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]


def set_parameters(net, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
    net.load_state_dict(state_dict, strict=True)


def train(net, trainloader, epochs: int):
    """Train the network on the training set."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        for batch in trainloader:
            images, labels = batch["img"], batch["label"]
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = net(images)
            loss = criterion(net(images), labels)
            loss.backward()
            optimizer.step()
            # Metrics
            epoch_loss += loss
            total += labels.size(0)
            correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total
        print(f"Epoch {epoch+1}: train loss {epoch_loss}, accuracy {epoch_acc}")


def test(net, testloader):
    """Evaluate the network on the entire test set."""
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    net.eval()
    with torch.no_grad():
        for batch in testloader:
            images, labels = batch["img"], batch["label"]
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = net(images)
            loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader.dataset)
    accuracy = correct / total
    return loss, accuracy

### Define the Flower ClientApp

The first step toward creating a ClientApp is to implement a subclasses of flwr.client.Client or flwr.client.NumPyClient. We use NumPyClient in this tutorial because it is easier to implement and requires us to write less boilerplate. To implement NumPyClient, we create a subclass that implements the three methods get_parameters, fit, and evaluate:

get_parameters: Return the current local model parameters

fit: Receive model parameters from the server, train the model on the local data, and return the updated model parameters to the server

evaluate: Receive model parameters from the server, evaluate the model on the local data, and return the evaluation result to the server

In [4]:
class FlowerClient(NumPyClient):
    def __init__(self, partition_id, net, trainloader, valloader):
        self.partition_id = partition_id
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self, config):
        print(f"[Client {self.partition_id}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        print(f"[Client {self.partition_id}] fit, config: {config}")
        set_parameters(self.net, parameters)
        train(self.net, self.trainloader, epochs=1)
        return get_parameters(self.net), len(self.trainloader), {}

    def evaluate(self, parameters, config):
        print(f"[Client {self.partition_id}] evaluate, config: {config}")
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        return float(loss), len(self.valloader), {"accuracy": float(accuracy)}


def client_fn(context: Context) -> Client:
    net = Net().to(DEVICE)

    # Read the node_config to fetch data partition associated to this node
    partition_id = context.node_config["partition-id"]
    num_partitions = context.node_config["num-partitions"]

    trainloader, valloader, _ = load_datasets(partition_id, num_partitions)
    return FlowerClient(partition_id, net, trainloader, valloader).to_client()


# Create the ClientApp
client = ClientApp(client_fn=client_fn)

## Strategy customization

### Server-side parameter initialization

In [5]:
# Create an instance of the model and get the parameters
params = get_parameters(Net())

In [6]:
def server_fn(context: Context) -> ServerAppComponents:
    # Create FedAvg strategy
    strategy = FedAvg(
        fraction_fit=0.3,
        fraction_evaluate=0.3,
        min_fit_clients=3,
        min_evaluate_clients=3,
        min_available_clients=NUM_PARTITIONS,
        initial_parameters=ndarrays_to_parameters(
            params
        ),  # Pass initial model parameters
    )

    # Configure the server for 3 rounds of training
    config = ServerConfig(num_rounds=3)
    return ServerAppComponents(strategy=strategy, config=config)

In [7]:
# Create ServerApp
server = ServerApp(server_fn=server_fn)

In [9]:
# Create a list of ClientConfig objects, one for each client
client_configs = [{"partition_id": i} for i in range(NUM_PARTITIONS)]

# Specify the resources each of your clients need
# By default, each client will be allocated 1x CPU and 0x GPUs
backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 0.0},
                      "client_configs": client_configs  # Pass the client configurations
                     }

# When running on GPU, assign an entire GPU for each client
if DEVICE == "cuda":
    backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 1.0},
                      "client_configs": client_configs  # Pass the client configurations
                     }
    # Refer to our Flower framework documentation for more details about Flower simulations
    # and how to set up the `backend_config`

# Run simulation
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_PARTITIONS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=3, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      Evaluation returned no results (`None`)
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)
[36m(ClientAppActor pid=43536)[0m free(): double free detected in tcache 2
[36m(ClientAppActor pid=43536)[0m *** SIGABRT received at time=1754983823 on cpu 10 ***
[36m(ClientAppActor pid=43536)[0m PC: @     0x7b847fe9eb2c  (unknown)  pthread_kill
[36m(ClientAppActor pid=43536)[0m     @     0x7b847fe45330  (unknown)  (unknown)
[36m(ClientAppActor pid=43536)[0m     @     0x7b847fe4527e         32  raise
[36m(ClientAppActor pid=43536)[0m     @     0x7b847fe288ff        192  abort
[36m(ClientAppActor pid=43536)[0m 

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff71920b4002a263f3be3747ac01000000 Worker ID: 5bdb96ee9991d1ee07fbfd1dbc31250036d75747b5d5236a507d1724 Node ID: f6bf7376cd0596f9f43c361f02d7d40dc466d7fae34e1cd842dcf471 Worker IP address: 172.30.170.62 Worker port: 35237 Worker PID: 43536 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 2ca271c4c3281e19cd8f9ee801000000
	pid: 43537
	namespace: 75f927c0-d138-47bb-994c-99369043e518
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff2fac4a35ce225ba3d1841ce501000000 Worker ID: a084e6da2cad19e8216af54a384a427ecf510df430202c412bb3ab70 Node ID: f6bf7376cd0596f9f43c361f02d7d40dc466d7fae34e1cd842dcf471 Worker IP address: 172.30.170.62 Worker port: 40021 Worker PID: 43538 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-dedup

[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 2fac4a35ce225ba3d1841ce501000000
	pid: 43538
	namespace: 75f927c0-d138-47bb-994c-99369043e518
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/

### Starting with a customized strategy

In [10]:
def server_fn(context: Context) -> ServerAppComponents:
    # Create FedAdagrad strategy
    strategy = FedAdagrad(
        fraction_fit=0.3,
        fraction_evaluate=0.3,
        min_fit_clients=3,
        min_evaluate_clients=3,
        min_available_clients=NUM_PARTITIONS,
        initial_parameters=ndarrays_to_parameters(params),
    )
    # Configure the server for 3 rounds of training
    config = ServerConfig(num_rounds=3)
    return ServerAppComponents(strategy=strategy, config=config)


# Create the ServerApp
server = ServerApp(server_fn=server_fn)

# Run simulation
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_PARTITIONS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=3, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      Evaluation returned no results (`None`)
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)
[36m(ClientAppActor pid=49675)[0m free(): double free detected in tcache 2
[36m(ClientAppActor pid=49675)[0m *** SIGABRT received at time=1754985234 on cpu 4 ***
[36m(ClientAppActor pid=49674)[0m PC: @     0x74d8fd89eb2c  (unknown)  pthread_kill
[36m(ClientAppActor pid=49675)[0m     @     0x7dd982a45330  1200500384  (unknown)
[36m(ClientAppActor pid=49675)[0m     @     0x7dd982a4527e         32  raise
[36m(ClientAppActor pid=49675)[0m     @     0x7dd982a288ff        192  abort
[36m(ClientAppActor pid=49675)[0m 

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff529b4d2434048c3bc28a469b01000000 Worker ID: a1e18a53cced6d2884973cea5756a86282b4da469981d263fb246659 Node ID: bc0e0f0df68077d479f807c51146128472547020d3edb614a677b305 Worker IP address: 172.30.170.62 Worker port: 35443 Worker PID: 49675 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 1761f6e84e7a052f81a9edab01000000
	pid: 49678
	namespace: d8ebe658-9f59-4dda-988c-4e0e41260ab7
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff1761f6e84e7a052f81a9edab01000000 Worker ID: 0aa64fe4b218bd8829d8334af52ef70eb15d399770fdb4652d7c9078 Node ID: bc0e0f0df68077d479f807c51146128472547020d3edb614a677b305 Worker IP address: 172.30.170.62 Worker port: 43411 Worker PID: 49678 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 87b79c62979f92bcebd0fa9701000000
	pid: 49674
	namespace: d8ebe658-9f59-4dda-988c-4e0e41260ab7
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff87b79c62979f92bcebd0fa9701000000 Worker ID: 4e5e80e843c1c17bd72336c1ca3869b2507c8fcd3673cfd33d235647 Node ID: bc0e0f0df68077d479f807c51146128472547020d3edb614a677b305 Worker IP address: 172.30.170.62 Worker port: 34333 Worker PID: 49674 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 529b4d2434048c3bc28a469b01000000
	pid: 49675
	namespace: d8ebe658-9f59-4dda-988c-4e0e41260ab7
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10

### Server-side parameter evaluation

In [11]:
# The `evaluate` function will be called by Flower after every round
def evaluate(
    server_round: int,
    parameters: NDArrays,
    config: Dict[str, Scalar],
) -> Optional[Tuple[float, Dict[str, Scalar]]]:
    net = Net().to(DEVICE)
    _, _, testloader = load_datasets(0, NUM_PARTITIONS)
    set_parameters(net, parameters)  # Update model with the latest parameters
    loss, accuracy = test(net, testloader)
    print(f"Server-side evaluation loss {loss} / accuracy {accuracy}")
    return loss, {"accuracy": accuracy}

In [12]:
def server_fn(context: Context) -> ServerAppComponents:
    # Create the FedAvg strategy
    strategy = FedAvg(
        fraction_fit=0.3,
        fraction_evaluate=0.3,
        min_fit_clients=3,
        min_evaluate_clients=3,
        min_available_clients=NUM_PARTITIONS,
        initial_parameters=ndarrays_to_parameters(params),
        evaluate_fn=evaluate,  # Pass the evaluation function
    )
    # Configure the server for 3 rounds of training
    config = ServerConfig(num_rounds=3)
    return ServerAppComponents(strategy=strategy, config=config)


# Create the ServerApp
server = ServerApp(server_fn=server_fn)

In [13]:
# Run simulation
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_PARTITIONS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=3, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      initial parameters (loss, other metrics): 0.07214355702400208, {'accuracy': 0.1}
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)


Server-side evaluation loss 0.07214355702400208 / accuracy 0.1


[36m(ClientAppActor pid=54633)[0m free(): double free detected in tcache 2
[36m(ClientAppActor pid=54633)[0m *** SIGABRT received at time=1754986241 on cpu 1 ***
[36m(ClientAppActor pid=54633)[0m PC: @     0x76681f69eb2c  (unknown)  pthread_kill
[36m(ClientAppActor pid=54633)[0m     @     0x76681f645330  254495344  (unknown)
[36m(ClientAppActor pid=54633)[0m     @     0x76681f64527e         32  raise
[36m(ClientAppActor pid=54633)[0m     @     0x76681f6288ff        192  abort
[36m(ClientAppActor pid=54633)[0m     @     0x76681f6297b6        288  (unknown)
[36m(ClientAppActor pid=54633)[0m     @     0x76681f6a8ff5         16  (unknown)
[36m(ClientAppActor pid=54633)[0m     @     0x76681f6ab55f         80  (unknown)
[36m(ClientAppActor pid=54633)[0m     @     0x76681f6addae         64  cfree
[36m(ClientAppActor pid=54633)[0m     @     0x7664b7715e45         48  (unknown)
[36m(ClientAppActor pid=54633)[0m     @     0x7664b7729889         48  (unknown)
[36m(ClientA

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffe2593d199eb5d8cbb86fb94501000000 Worker ID: d6bca6b62941d31e094998cd88ef7060bfe8b670fc4c155492364e44 Node ID: e150773a9c4cdb037a7098b3bf79c1902a90f407f93d62d69f3dae74 Worker IP address: 172.30.170.62 Worker port: 35897 Worker PID: 54632 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 4be77a62368ba46af030511e01000000
	pid: 54633
	namespace: a2e257e8-60de-4862-9b17-3932c75e731a
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff4be77a62368ba46af030511e01000000 Worker ID: 184d68fb9742009a006d511dce97179224b571ef336d7ef7f8f52349 Node ID: e150773a9c4cdb037a7098b3bf79c1902a90f407f93d62d69f3dae74 Worker IP address: 172.30.170.62 Worker port: 37633 Worker PID: 54633 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: f19c3045413d3dc5a3a50d4801000000
	pid: 54634
	namespace: a2e257e8-60de-4862-9b17-3932c75e731a
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: fffffffffffffffff19c3045413d3dc5a3a50d4801000000 Worker ID: f1e7e688901e60509787840207914c15ca3642d6171a3b35c11f1847 Node ID: e150773a9c4cdb037a7098b3bf79c1902a90f407f93d62d69f3dae74 Worker IP address: 172.30.170.62 Worker port: 45979 Worker PID: 54634 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[92mINFO [0m:      fit progress: (1, 0.07214355702400208, {'accuracy': 0.1}, 32.3603552329987)
[92mINFO [0m:      configure_evaluate: strategy sampled 3 clients (out of 10)
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: f19c3045413d3dc5a3a50d4801000000
	pid: 54634
	namespace: a2e257e8-60de-4862-9b17-3932c75e731a
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:  

Server-side evaluation loss 0.07214355702400208 / accuracy 0.1


[92mINFO [0m:      aggregate_evaluate: received 0 results and 3 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 2]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: f19c3045413d3dc5a3a50d4801000000
	pid: 54634
	namespace: a2e257e8-60de-4862-9b17-3932c75e731a
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force 

Server-side evaluation loss 0.07214355702400208 / accuracy 0.1


[92mINFO [0m:      aggregate_evaluate: received 0 results and 3 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 3]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: e2593d199eb5d8cbb86fb94501000000
	pid: 54632
	namespace: a2e257e8-60de-4862-9b17-3932c75e731a
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected 

Server-side evaluation loss 0.07214355702400208 / accuracy 0.1


[92mINFO [0m:      aggregate_evaluate: received 0 results and 3 failures
[92mINFO [0m:      
[92mINFO [0m:      [SUMMARY]
[92mINFO [0m:      Run finished 3 round(s) in 50.97s
[92mINFO [0m:      	History (loss, centralized):
[92mINFO [0m:      		round 0: 0.07214355702400208
[92mINFO [0m:      		round 1: 0.07214355702400208
[92mINFO [0m:      		round 2: 0.07214355702400208
[92mINFO [0m:      		round 3: 0.07214355702400208
[92mINFO [0m:      	History (metrics, centralized):
[92mINFO [0m:      	{'accuracy': [(0, 0.1), (1, 0.1), (2, 0.1), (3, 0.1)]}
[92mINFO [0m:      
[36m(ClientAppActor pid=54634)[0m free(): double free detected in tcache 2[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=54634)[0m *** SIGABRT received at time=1754986241 on cpu 2 ***[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=54634)[0m PC: @     0x70277dc9eb2c  (unknown)  pthread_kill[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=54634)[0m    

### Sending/receiving arbitrary values to/from clients

In [None]:
# Create a list of ClientConfig objects, one for each client
client_configs = [{"partition_id": i} for i in range(NUM_PARTITIONS)]

# Specify the resources each of your clients need
# By default, each client will be allocated 1x CPU and 0x GPUs
backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 0.0},
                      "client_configs": client_configs  # Pass the client configurations
                     }

# When running on GPU, assign an entire GPU for each client
if DEVICE == "cuda":
    backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 1.0},
                      "client_configs": client_configs  # Pass the client configurations
                     }
    # Refer to our Flower framework documentation for more details about Flower simulations
    # and how to set up the `backend_config`

In [None]:
def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics:
    # Multiply accuracy of each client by number of examples used
    accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics]
    examples = [num_examples for num_examples, _ in metrics]

    # Aggregate and return custom metric (weighted average)
    return {"accuracy": sum(accuracies) / sum(examples)}

In [None]:
def server_fn(context: Context) -> ServerAppComponents:
    """Construct components that set the ServerApp behaviour.

    You can use settings in `context.run_config` to parameterize the
    construction of all elements (e.g the strategy or the number of rounds)
    wrapped in the returned ServerAppComponents object.
    """

    # Create FedAvg strategy
    strategy = FedAvg(
        fraction_fit=1.0,
        fraction_evaluate=0.5,
        min_fit_clients=10,
        min_evaluate_clients=5,
        min_available_clients=10,
        evaluate_metrics_aggregation_fn=weighted_average,  # <-- pass the metric aggregation function
    )

    # Configure the server for 5 rounds of training
    config = ServerConfig(num_rounds=5)

    return ServerAppComponents(strategy=strategy, config=config)


# Create a new server instance with the updated FedAvg strategy
server = ServerApp(server_fn=server_fn)



# Run the simulation with the client_configs
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_CLIENTS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=5, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Requesting initial parameters from one random client
[36m(ClientAppActor pid=83682)[0m free(): double free detected in tcache 2
[36m(ClientAppActor pid=83682)[0m *** SIGABRT received at time=1754577165 on cpu 15 ***
[36m(ClientAppActor pid=83682)[0m PC: @     0x79325a89eb2c  (unknown)  pthread_kill
[36m(ClientAppActor pid=83682)[0m     @     0x79325a845330   81972576  (unknown)
[36m(ClientAppActor pid=83682)[0m     @     0x79325a84527e         32  raise
[36m(ClientAppActor pid=83682)[0m     @     0x79325a8288ff        192  abort
[36m(ClientAppActor pid=83682)[0m     @     0x79325a8297b6        288  (unknown)
[36m(ClientAppActor pid=83682)[0m     @     0x79325a8a8ff5         16  (unknown)
[36m(ClientAppActor pid=83682)[0m     @     0x79325a8ab55f         80  (unknown)
[36m(ClientAppActor pid=83682)[0m     @     

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff6bce59fa6c8f0707772cc5b101000000 Worker ID: 0433a45b68838a5d294850dcc1e125136ffc78c257eda703db08b793 Node ID: cbe4cf7afefa30bc958ef3f7f8055907a3dee3559c2030dd5dc7609e Worker IP address: 172.30.170.62 Worker port: 44485 Worker PID: 83682 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


RuntimeError: Exception in ServerApp thread