# Build a strategy from scratch

In [1]:
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from datasets.utils.logging import disable_progress_bar

disable_progress_bar()

import flwr
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents
from flwr.server.strategy import Strategy
from flwr.simulation import run_simulation
from flwr_datasets import FederatedDataset

DEVICE = torch.device("cpu")  # Try "cuda" to train on GPU
print(f"Training on {DEVICE}")
print(f"Flower {flwr.__version__} / PyTorch {torch.__version__}")

Training on cpu
Flower 1.20.0 / PyTorch 2.7.1+cu126


## Data loading

In [2]:
def load_datasets(partition_id, num_partitions: int):
    fds = FederatedDataset(dataset="cifar10", partitioners={"train": num_partitions})
    partition = fds.load_partition(partition_id)
    # Divide data on each node: 80% train, 20% test
    partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
    pytorch_transforms = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    def apply_transforms(batch):
        # Instead of passing transforms to CIFAR10(..., transform=transform)
        # we will use this function to dataset.with_transform(apply_transforms)
        # The transforms object is exactly the same
        batch["img"] = [pytorch_transforms(img) for img in batch["img"]]
        return batch

    partition_train_test = partition_train_test.with_transform(apply_transforms)
    trainloader = DataLoader(partition_train_test["train"], batch_size=32, shuffle=True)
    valloader = DataLoader(partition_train_test["test"], batch_size=32)
    testset = fds.load_split("test").with_transform(apply_transforms)
    testloader = DataLoader(testset, batch_size=32)
    return trainloader, valloader, testloader

## Model training/evaluation

In [3]:
class Net(nn.Module):
    def __init__(self) -> None:
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def get_parameters(net) -> List[np.ndarray]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]


def set_parameters(net, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
    net.load_state_dict(state_dict, strict=True)


def train(net, trainloader, epochs: int):
    """Train the network on the training set."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        for batch in trainloader:
            images, labels = batch["img"], batch["label"]
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = net(images)
            loss = criterion(net(images), labels)
            loss.backward()
            optimizer.step()
            # Metrics
            epoch_loss += loss
            total += labels.size(0)
            correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total
        print(f"Epoch {epoch+1}: train loss {epoch_loss}, accuracy {epoch_acc}")


def test(net, testloader):
    """Evaluate the network on the entire test set."""
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    net.eval()
    with torch.no_grad():
        for batch in testloader:
            images, labels = batch["img"], batch["label"]
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = net(images)
            loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader.dataset)
    accuracy = correct / total
    return loss, accuracy

## Flower Client

In [4]:
class FlowerClient(NumPyClient):
    def __init__(self, partition_id, net, trainloader, valloader):
        self.partition_id = partition_id
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self, config):
        print(f"[Client {self.partition_id}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        print(f"[Client {self.partition_id}] fit, config: {config}")
        set_parameters(self.net, parameters)
        train(self.net, self.trainloader, epochs=1)
        return get_parameters(self.net), len(self.trainloader), {}

    def evaluate(self, parameters, config):
        print(f"[Client {self.partition_id}] evaluate, config: {config}")
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        return float(loss), len(self.valloader), {"accuracy": float(accuracy)}


def client_fn(context: Context) -> Client:
    net = Net().to(DEVICE)
    partition_id = context.node_config["partition-id"]
    num_partitions = context.node_config["num-partitions"]
    trainloader, valloader, _ = load_datasets(partition_id, num_partitions)
    return FlowerClient(partition_id, net, trainloader, valloader).to_client()


# Create the ClientApp
client = ClientApp(client_fn=client_fn)

### Testing

In [None]:
NUM_PARTITIONS = 10

# Create a list of ClientConfig objects, one for each client
client_configs = [{"partition_id": i} for i in range(NUM_PARTITIONS)]

# Specify the resources each of your clients need
# By default, each client will be allocated 1x CPU and 0x GPUs
backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 0.0},
                  "client_configs": client_configs  # Pass the client configurations
                 }
print(DEVICE)

# When running on GPU, assign an entire GPU for each client
if DEVICE == "cuda":
    backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 1.0},
                      "client_configs": client_configs  # Pass the client configurations
                     }
    print(DEVICE)
    # Refer to our Flower framework documentation for more details about Flower simulations
    # and how to set up the `backend_config`

In [6]:
# Create an instance of the model and get the parameters
params = get_parameters(Net())
from flwr.server.strategy import FedAvg
from flwr.common import ndarrays_to_parameters
def server_fn(context: Context) -> ServerAppComponents:
    # Create FedAvg strategy
    strategy = FedAvg(
        fraction_fit=0.3,
        fraction_evaluate=0.3,
        min_fit_clients=3,
        min_evaluate_clients=3,
        min_available_clients=NUM_PARTITIONS,
        initial_parameters=ndarrays_to_parameters(
            params
        ),  # Pass initial model parameters
    )

    # Configure the server for 3 rounds of training
    config = ServerConfig(num_rounds=3)
    return ServerAppComponents(strategy=strategy, config=config)

# Create ServerApp
server = ServerApp(server_fn=server_fn)

In [7]:
# Run simulation
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_PARTITIONS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=3, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      Evaluation returned no results (`None`)
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)


[36m(ClientAppActor pid=84151)[0m [Client 5] fit, config: {}


[36m(ClientAppActor pid=84151)[0m free(): double free detected in tcache 2
[36m(ClientAppActor pid=84151)[0m *** SIGABRT received at time=1755085790 on cpu 9 ***
[36m(ClientAppActor pid=84151)[0m PC: @     0x74142829eb2c  (unknown)  pthread_kill
[36m(ClientAppActor pid=84151)[0m     @     0x741428245330  897224192  (unknown)
[36m(ClientAppActor pid=84151)[0m     @     0x74142824527e         32  raise
[36m(ClientAppActor pid=84151)[0m     @     0x7414282288ff        192  abort
[36m(ClientAppActor pid=84151)[0m     @     0x7414282297b6        288  (unknown)
[36m(ClientAppActor pid=84151)[0m     @     0x7414282a8ff5         16  (unknown)
[36m(ClientAppActor pid=84151)[0m     @     0x7414282ab55f         80  (unknown)
[36m(ClientAppActor pid=84151)[0m     @     0x7414282addae         64  cfree
[36m(ClientAppActor pid=84151)[0m     @     0x7410fc915e45         48  (unknown)
[36m(ClientAppActor pid=84151)[0m     @     0x7410fc929889         48  (unknown)
[36m(ClientA

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff1862bc1288b65e2d36c998b801000000 Worker ID: 67f97b51ac6656b79edbc4610caecfa37378bf42765e52ed3545a418 Node ID: f02362524687d1d6771dcf5cefab620e1379578fc7a7a7b49d22184e Worker IP address: 172.30.170.62 Worker port: 45743 Worker PID: 84151 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[36m(ClientAppActor pid=84149)[0m [Client 1] fit, config: {}[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master

[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: edcf4613ab8d86609d330c7e01000000
	pid: 84150
	namespace: 4fe358c0-5938-4bc7-9c32-0a4c03d9cdd2
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffedcf4613ab8d86609d330c7e01000000 Worker ID: db03ceb05997869273596c0d6a5515ba1a1e7861f2f78e58959f7e84 Node ID: f02362524687d1d6771dcf5cefab620e1379578fc7a7a7b49d22184e Worker IP address: 172.30.170.62 Worker port: 33597 Worker PID: 84150 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: ccee1d849c99b7aa6b9da04501000000
	pid: 84149
	namespace: 4fe358c0-5938-4bc7-9c32-0a4c03d9cdd2
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffccee1d849c99b7aa6b9da04501000000 Worker ID: 88d509e48852087211b60fce240c5c8c5e0e51b3ba7abd921ee2fcd6 Node ID: f02362524687d1d6771dcf5cefab620e1379578fc7a7a7b49d22184e Worker IP address: 172.30.170.62 Worker port: 46699 Worker PID: 84149 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 1862bc1288b65e2d36c998b801000000
	pid: 84151
	namespace: 4fe358c0-5938-4bc7-9c32-0a4c03d9cdd2
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	acto

### Build a Strategy from scratch

In [8]:
from typing import Union

from flwr.common import (
    EvaluateIns,
    EvaluateRes,
    FitIns,
    FitRes,
    Parameters,
    Scalar,
    ndarrays_to_parameters,
    parameters_to_ndarrays,
)
from flwr.server.client_manager import ClientManager
from flwr.server.client_proxy import ClientProxy
from flwr.server.strategy.aggregate import aggregate, weighted_loss_avg


class FedCustom(Strategy):
    def __init__(
        self,
        fraction_fit: float = 1.0,
        fraction_evaluate: float = 1.0,
        min_fit_clients: int = 2,
        min_evaluate_clients: int = 2,
        min_available_clients: int = 2,
    ) -> None:
        super().__init__()
        self.fraction_fit = fraction_fit
        self.fraction_evaluate = fraction_evaluate
        self.min_fit_clients = min_fit_clients
        self.min_evaluate_clients = min_evaluate_clients
        self.min_available_clients = min_available_clients

    def __repr__(self) -> str:
        return "FedCustom"

    def initialize_parameters(
        self, client_manager: ClientManager
    ) -> Optional[Parameters]:
        """Initialize global model parameters."""
        net = Net()
        ndarrays = get_parameters(net)
        return ndarrays_to_parameters(ndarrays)

    def configure_fit(
        self, server_round: int, parameters: Parameters, client_manager: ClientManager
    ) -> List[Tuple[ClientProxy, FitIns]]:
        """Configure the next round of training."""

        # Sample clients
        sample_size, min_num_clients = self.num_fit_clients(
            client_manager.num_available()
        )
        clients = client_manager.sample(
            num_clients=sample_size, min_num_clients=min_num_clients
        )

        # Create custom configs
        n_clients = len(clients)
        half_clients = n_clients // 2
        standard_config = {"lr": 0.001}
        higher_lr_config = {"lr": 0.003}
        fit_configurations = []
        for idx, client in enumerate(clients):
            if idx < half_clients:
                fit_configurations.append((client, FitIns(parameters, standard_config)))
            else:
                fit_configurations.append(
                    (client, FitIns(parameters, higher_lr_config))
                )
        return fit_configurations

    def aggregate_fit(
        self,
        server_round: int,
        results: List[Tuple[ClientProxy, FitRes]],
        failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]],
    ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:
        """Aggregate fit results using weighted average."""

        weights_results = [
            (parameters_to_ndarrays(fit_res.parameters), fit_res.num_examples)
            for _, fit_res in results
        ]
        parameters_aggregated = ndarrays_to_parameters(aggregate(weights_results))
        metrics_aggregated = {}
        return parameters_aggregated, metrics_aggregated

    def configure_evaluate(
        self, server_round: int, parameters: Parameters, client_manager: ClientManager
    ) -> List[Tuple[ClientProxy, EvaluateIns]]:
        """Configure the next round of evaluation."""
        if self.fraction_evaluate == 0.0:
            return []
        config = {}
        evaluate_ins = EvaluateIns(parameters, config)

        # Sample clients
        sample_size, min_num_clients = self.num_evaluation_clients(
            client_manager.num_available()
        )
        clients = client_manager.sample(
            num_clients=sample_size, min_num_clients=min_num_clients
        )

        # Return client/config pairs
        return [(client, evaluate_ins) for client in clients]

    def aggregate_evaluate(
        self,
        server_round: int,
        results: List[Tuple[ClientProxy, EvaluateRes]],
        failures: List[Union[Tuple[ClientProxy, EvaluateRes], BaseException]],
    ) -> Tuple[Optional[float], Dict[str, Scalar]]:
        """Aggregate evaluation losses using weighted average."""

        if not results:
            return None, {}

        loss_aggregated = weighted_loss_avg(
            [
                (evaluate_res.num_examples, evaluate_res.loss)
                for _, evaluate_res in results
            ]
        )
        metrics_aggregated = {}
        return loss_aggregated, metrics_aggregated

    def evaluate(
        self, server_round: int, parameters: Parameters
    ) -> Optional[Tuple[float, Dict[str, Scalar]]]:
        """Evaluate global model parameters using an evaluation function."""

        # Let's assume we won't perform the global model evaluation on the server side.
        return None

    def num_fit_clients(self, num_available_clients: int) -> Tuple[int, int]:
        """Return sample size and required number of clients."""
        num_clients = int(num_available_clients * self.fraction_fit)
        return max(num_clients, self.min_fit_clients), self.min_available_clients

    def num_evaluation_clients(self, num_available_clients: int) -> Tuple[int, int]:
        """Use a fraction of available clients for evaluation."""
        num_clients = int(num_available_clients * self.fraction_evaluate)
        return max(num_clients, self.min_evaluate_clients), self.min_available_clients

In [9]:
def server_fn(context: Context) -> ServerAppComponents:
    # Configure the server for just 3 rounds of training
    config = ServerConfig(num_rounds=3)
    return ServerAppComponents(
        config=config,
        strategy=FedCustom(),  # <-- pass the new strategy here
    )


# Run simulation
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_PARTITIONS,
    backend_config=backend_config,
)

[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=3, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      Evaluation returned no results (`None`)
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]
[92mINFO [0m:      configure_fit: strategy sampled 3 clients (out of 10)
[36m(ClientAppActor pid=86280)[0m free(): double free detected in tcache 2
[36m(ClientAppActor pid=86280)[0m *** SIGABRT received at time=1755085836 on cpu 14 ***
[36m(ClientAppActor pid=86280)[0m PC: @     0x7c838229eb2c  (unknown)  pthread_kill
[36m(ClientAppActor pid=86280)[0m     @     0x7c8382245330  (unknown)  (unknown)
[36m(ClientAppActor pid=86280)[0m     @     0x7c838224527e         32  raise
[36m(ClientAppActor pid=86280)[0m     @     0x7c83822288ff        192  abort
[36m(ClientAppActor pid=86280)[0m 

[36m(ClientAppActor pid=86280)[0m [Client 8] fit, config: {}


[36m(ClientAppActor pid=86277)[0m 
[36m(ClientAppActor pid=86277)[0m 
[36m(ClientAppActor pid=86278)[0m 
[36m(ClientAppActor pid=86278)[0m 
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 1c5b6e87c1170d4b13e8dae901000000
	pid: 86280
	namespace: 8de6765c-d80b-4af6-b87b-545c02fc6fc0
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/se

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff1c5b6e87c1170d4b13e8dae901000000 Worker ID: acfba6303ccaf193ece8ef755f7f7dc4795397d042eafb0b887eca86 Node ID: 84d9ee6386e9e0c77abbae6f06b0bbbaa74d617ab9d390619c98ae80 Worker IP address: 172.30.170.62 Worker port: 36609 Worker PID: 86280 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[36m(ClientAppActor pid=86278)[0m [Client 7] fit, config: {}[32m [repeated 2x across cluster][0m


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 95df4bba0d54fb31fcbce98901000000
	pid: 86277
	namespace: 8de6765c-d80b-4af6-b87b-545c02fc6fc0
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff95df4bba0d54fb31fcbce98901000000 Worker ID: 463cf56ac79b5cdabd192354281a13c08c2168fe6eb0a3613dad479b Node ID: 84d9ee6386e9e0c77abbae6f06b0bbbaa74d617ab9d390619c98ae80 Worker IP address: 172.30.170.62 Worker port: 37715 Worker PID: 86277 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 13e9320af6f205b265cc117a01000000
	pid: 86278
	namespace: 8de6765c-d80b-4af6-b87b-545c02fc6fc0
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     Traceback (most recent call last):
  File "/home/syed/miniconda3/envs/flwr/lib/python3.10/site-packages/flwr/server/superlink/fleet/vce/vce_api.py", line 112, in worker
    out_mssg, updated_context = backend.process_message(message, context)
  File "/home/sy

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff13e9320af6f205b265cc117a01000000 Worker ID: 775ae76f0ed390e81816fc7086aefce1ecb4b4093e95fd912047c631 Node ID: 84d9ee6386e9e0c77abbae6f06b0bbbaa74d617ab9d390619c98ae80 Worker IP address: 172.30.170.62 Worker port: 35353 Worker PID: 86278 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[92mINFO [0m:      aggregate_fit: received 0 results and 3 failures
[92mINFO [0m:      configure_evaluate: strategy sampled 3 clients (out of 10)
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     The actor died unexpectedly before finishing this task.
	class_name: ClientAppActor
	actor_id: 13e9320af6f205b265cc117a01000000
	pid: 86278
	namespace: 8de6765c-d80b-4af6-b87b-545c02fc6fc0
	ip: 172.30.170.62
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[91mERROR [0m:     An exception was raised when processing a message by RayBackend
[91mERROR [0m:     An exception was raised 