In [1]:
!pip install -q lightning flwr wandb hydra-core

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.0/819.0 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.0/540.0 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.4/242.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.0/236.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   

In [2]:
from torch.utils.data import Dataset
from pathlib import Path
import nibabel as nib
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

In [3]:
from kaggle_secrets import UserSecretsClient
secret_label = "wandb api key"
WANDB_APIKEY = UserSecretsClient().get_secret(secret_label)

ROOT_PATH = '/kaggle/input/mri-dataset/datasetzip/not_skull_stripped'


# Dataset

In [4]:
class MRIDataset(Dataset) :

    def __init__(self, root_dir: str, label_path: str = None, transform = None, label_df: pd.DataFrame = None, is_3d: bool = False):
        self.root_dir = Path(root_dir)
        self.transform = transform
        self.is_3d = is_3d
        if label_df is None:
          self.labels_df = pd.read_csv(label_path)
          
        else :
          self.labels_df = label_df

        self.labels_df['subject_id'] = self.labels_df['subject_id'].astype(str)
        self.labels_df = self.labels_df[self.labels_df['subject_dx'] == 'control']

        all_nii_files = list(self.root_dir.rglob("*.nii"))
        fail_paths = ["sub-BrainAge005600/anat/sub-BrainAge005600_T1w.nii/sub-BrainAge005600_T1w.nii"]
        self.file_paths = [fp for fp in all_nii_files if fp.is_file() and fp.name not in fail_paths ]

        valid_subjects = set(self.labels_df['subject_id'].values)

        self.file_paths = [fp for fp in self.file_paths if any(vs in str(fp) for vs in valid_subjects)]
        self.file_paths.sort()



    def __len__(self):
        return len(self.file_paths)


    def preprocessing_datapoint(self, img_data):

        mid_x = img_data.shape[0] // 2
        mid_y = img_data.shape[1] // 2
        mid_z = img_data.shape[2] // 2

        axial_slice = img_data[:, :, mid_z]
        coronal_slice = img_data[:, mid_y, :]
        sagittal_slice = img_data[mid_x, :, :]


        combined_data = np.stack([axial_slice, coronal_slice, sagittal_slice], axis=0)
        combined_data = torch.from_numpy(combined_data).float()

        if self.transform : combined_data = self.transform(combined_data)

        return combined_data




    def __getitem__(self, idx):
        img_path = self.file_paths[idx]
        file_path_str = str(img_path)

        subject_id = None
        valid_subjects_set = set(self.labels_df['subject_id'].values)


        for sid in valid_subjects_set:
            if sid in file_path_str:
                subject_id = sid
                break

        if subject_id is None:
            raise ValueError(f"Không tìm thấy subject_id cho file: {img_path}")

        metadata = self.labels_df.loc[self.labels_df['subject_id'] == subject_id].iloc[0].to_dict()

        img_data = nib.load(img_path).get_fdata()

        img_data = torch.from_numpy(img_data).float()

        label = 0
        if metadata['subject_sex'] == 'm' : label = 1

        if not self.is_3d:
            img_data = self.preprocessing_datapoint(img_data)

        return img_data,  label



def visualize_sample(dataset, idx):
    mri_data, label = dataset[idx]
    title = f"Label: {label}\n"
    plt.close('all')
    fig = plt.figure(figsize = (18, 6))

    if isinstance(mri_data, torch.Tensor):
        data = mri_data.squeeze().numpy()
    else:
        data = mri_data


    ax1 = fig.add_subplot(1, 3, 1)
    plt.imshow(data[0, :, :].T, cmap='gray', origin='lower')

    ax2 = fig.add_subplot(1, 3, 2)
    ax2.imshow(data[1, :, :].T, cmap='gray', origin='lower')

    ax3 = fig.add_subplot(1, 3, 3)
    ax3.imshow(data[2, :, :].T, cmap='gray', origin='lower')

    plt.suptitle(title)
    plt.tight_layout()
    plt.show()


In [5]:
dataset = MRIDataset(root_dir= '/kaggle/input/mini-brain3d-dataset/not_skull_stripped' , label_path = '/kaggle/input/mri-label/label.csv', is_3d = True)

In [6]:
dataset[0][0].shape 

torch.Size([130, 130, 130])

## Data splitting

In [7]:
from torch.utils.data import random_split 

def preprocessing_labels(df: pd.DataFrame, root_dir: str = ROOT_PATH):
    
    subject_list = []
    for root, dirs, files in os.walk(root_dir):
      for dir_name in dirs:
        if dir_name.startswith("sub-BrainAge"):
            subject_list.append(dir_name)


    return df[df['subject_id'].isin(subject_list)]


def prepare_data(data: pd.DataFrame):

  df = data.copy()
  df['age_group'] = pd.qcut(df['subject_age'], q = min(5, len(df)), labels = False)
  df['key'] = df.apply(lambda row : f"{row['age_group']}_{row['subject_sex']}", axis = 1)
  return df


def sampling_data(data, size, random_state ):

  samples = data.groupby('key', group_keys = False)


  samples = samples.apply(lambda x: x.sample(
      n = min(int(size / len(data['key'].unique())), len(x)),
      replace = len(x) < int(size / len(data['key'].unique())),
      random_state =  random_state
  ), include_groups=False)


  if len(samples) < size:
    additional_samples = data.drop(samples.index).sample(
        n = min(size - len(samples), len(data) - len(samples)),
        replace = True,
        random_state = random_state
    )

    samples = pd.concat([samples, additional_samples])
  return samples


def create_train_test(sample_labels: list, val_ratio: float = 0.2, root_dir: str = ROOT_PATH, is_3d: bool = False):

  client_datasets = []
  for label_df in sample_labels:
    dataset = MRIDataset(root_dir=root_dir, label_df = label_df, is_3d = is_3d)
    
    train_dataset, val_dataset = random_split(dataset, [1 - val_ratio, val_ratio])
    client_datasets.append((train_dataset, val_dataset))
  return client_datasets


def distributed_data_to_clients(data: pd.DataFrame, num_clients: int, overlap_ratio: float):

  df = prepare_data(data)

  n_samples = len(df)
  samples_per_client = int(n_samples / (num_clients * (1 - overlap_ratio) + overlap_ratio))

  client_datasets = []
  selected_samples = {}

  # Tạo các client datasets với sự phân bố cân bằng
  for client_idx in range(num_clients):

      if client_idx == 0:
          client_data = df.sample(n=samples_per_client, random_state=42+client_idx)
      else:
          # overlap size
          overlap_size = int(samples_per_client * overlap_ratio)
          non_overlap_size = samples_per_client - overlap_size

          # building overlap
          all_previous_samples = pd.DataFrame()
          for prev_client_idx in range(client_idx):
              all_previous_samples = pd.concat([all_previous_samples, selected_samples[prev_client_idx]])

          # sampling
          if len(all_previous_samples) > 0:
              overlap_samples = sampling_data(all_previous_samples, overlap_size, client_idx * 100 + 42)
          else:
              overlap_samples = pd.DataFrame(columns=df.columns)

          # Lấy mẫu mới (không overlap)
          remaining_indices = df.index.difference(all_previous_samples.index)
          if len(remaining_indices) > 0:
              remaining_df = df.loc[remaining_indices]
              non_overlap_samples = sampling_data(remaining_df, non_overlap_size, client_idx * 100 + 42)
          else:

              non_overlap_samples = df.sample(n=non_overlap_size, replace=True, random_state=42+client_idx*300)


          client_data = pd.concat([overlap_samples, non_overlap_samples])


      selected_samples[client_idx] = client_data
      client_datasets.append(client_data.drop(['age_group', 'key'], axis=1))

  return client_datasets

In [8]:


def iid_client_split(dataset, num_client = 3,  val_ratio = 0.2):

    client_datasets = []
    sample_per_client = len(dataset) // num_client


    for i in range(num_client):
        start_idx = i * sample_per_client
        end_idx = (i + 1) * sample_per_client if i < num_client - 1 else len(dataset)
        indecies = list(range(start_idx, end_idx))

        client_dataset = torch.utils.data.Subset(dataset, indecies)
        train_dataset, val_dataset = random_split(client_dataset, [1 - val_ratio, val_ratio])

        client_datasets.append((train_dataset, val_dataset))
    return client_datasets





def same_distribution_client_split(dataset, num_client, val_ratio = 0.2, overlap_ratio = 0.2, root_dir = ROOT_PATH, is_3d = False):
    """
    Split the dataset into clients with the same distribution of labels.
    """
    labels_df = dataset.labels_df
    labels_df = preprocessing_labels(labels_df, root_dir = root_dir)    
    labels_df = prepare_data(labels_df)

    client_datasets = distributed_data_to_clients(labels_df, num_clients=num_client, overlap_ratio=overlap_ratio)

    client_datasets = create_train_test(client_datasets, val_ratio=val_ratio, root_dir=root_dir, is_3d = is_3d)

    return client_datasets


#  Model

## 3D DenseNet

In [9]:
import torch.nn as nn 
import torch 
import torch.nn.functional as F
from collections import OrderedDict
from typing import List, Tuple
from torchsummary import summary


class _DenseLayer(nn.Sequential):
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super().__init__()
        self.add_module('norm1', nn.BatchNorm3d(num_input_features))
        self.add_module('relu1', nn.ReLU(inplace=True))
        self.add_module(
            'conv1',
            nn.Conv3d(num_input_features,
                      bn_size * growth_rate,
                      kernel_size=1,
                      stride=1,
                      bias=False))
        self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate))
        self.add_module('relu2', nn.ReLU(inplace=True))
        self.add_module(
            'conv2',
            nn.Conv3d(bn_size * growth_rate,
                      growth_rate,
                      kernel_size=3,
                      stride=1,
                      padding=1,
                      bias=False))
        self.drop_rate = drop_rate

    def forward(self, x):
        new_features = super().forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features,
                                     p=self.drop_rate,
                                     training=self.training)
        return torch.cat([x, new_features], 1)

class _DenseBlock(nn.Sequential):
    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super().__init__()
        for i in range(num_layers):
            layer = _DenseLayer(num_input_features + i * growth_rate,
                                growth_rate, bn_size, drop_rate)
            self.add_module('denselayer{}'.format(i + 1), layer)

class _Transition(nn.Sequential):
    def __init__(self, num_input_features, num_output_features):
        super().__init__()
        self.add_module('norm', nn.BatchNorm3d(num_input_features))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module(
            'conv',
            nn.Conv3d(num_input_features,
                      num_output_features,
                      kernel_size=1,
                      stride=1,
                      bias=False))
        self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2))

class DenseNet(nn.Module):
    def __init__(self,
                 n_input_channels=1,
                 conv1_t_size=7,
                 conv1_t_stride=1,
                 no_max_pool=False,
                 growth_rate=16,
                 block_config=(4, 8, 16, 12),
                 num_init_features=32,
                 bn_size=4,
                 drop_rate=0,
                 num_classes=1):
        super().__init__()

        # First convolution
        self.features = [('conv1',
                          nn.Conv3d(n_input_channels,
                                    num_init_features,
                                    kernel_size=(conv1_t_size, 7, 7),
                                    stride=(conv1_t_stride, 2, 2),
                                    padding=(conv1_t_size // 2, 3, 3),
                                    bias=False)),
                         ('norm1', nn.BatchNorm3d(num_init_features)),
                         ('relu1', nn.ReLU(inplace=True))]
        if not no_max_pool:
            self.features.append(
                ('pool1', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)))
        self.features = nn.Sequential(OrderedDict(self.features))

        # Each denseblock
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = _DenseBlock(num_layers=num_layers,
                                num_input_features=num_features,
                                bn_size=bn_size,
                                growth_rate=growth_rate,
                                drop_rate=drop_rate)
            self.features.add_module('denseblock{}'.format(i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:
                trans = _Transition(num_input_features=num_features,
                                    num_output_features=num_features // 2)
                self.features.add_module('transition{}'.format(i + 1), trans)
                num_features = num_features // 2

        self.features.add_module('norm5', nn.BatchNorm3d(num_features))
        self.classifier = nn.Linear(num_features, num_classes)

        # Khởi tạo trọng số
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm3d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
        out = F.adaptive_avg_pool3d(out, output_size=(1, 1, 1)).view(features.size(0), -1)
        logits = self.classifier(out)
        return logits
    

In [10]:
import lightning as pl
import torch.nn as nn 
from torchmetrics import Accuracy, F1Score, Precision, Recall, MeanMetric
import torch.optim as optim 
import torch 



class DenseNetModule(pl.LightningModule):
    def __init__(self, net, learning_rate=1e-3, weight_decay = 1e-2, batch_size = 32):
        super().__init__()
        self.model = net
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.batch_size = batch_size
        # how confidence model is in it prediction
        # tức model có thể rất tự tin trong quyết định nhưng thực tế lại sai
        # BCE = y*log(y_pred) + (1 - y)*log(1 - y_pred)
        self.criterion = nn.BCEWithLogitsLoss()
        
        self.train_accuracy = Accuracy(task="binary", num_classes=1)
        self.val_accuracy = Accuracy(task="binary", num_classes=1)
        self.test_accuracy = Accuracy(task="binary", num_classes=1)


        self.val_precision = Precision(task="binary", num_classes=1)
        self.test_precision = Precision(task="binary", num_classes=1)

        self.val_recall = Recall(task="binary", num_classes=1)
        self.test_recall = Recall(task="binary", num_classes=1)

        self.val_f1 = F1Score(task="binary", num_classes=1)
        self.test_f1 = F1Score(task="binary", num_classes=1)

    

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        # x.shape  = (batch_size, in_channel, height, width, depth), y.shape = (batch_size)
        logits = self(x.unsqueeze(1))
        
        loss = self.criterion(logits, y.float().unsqueeze(1))
        
        acc = self.train_accuracy((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))

        
        self.log('train/loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train/acc', acc, on_step=False, on_epoch=True, prog_bar=True)
        
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x.unsqueeze(1))
        
        loss = self.criterion(logits, y.float().unsqueeze(1))
        acc = self.val_accuracy((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        f1 = self.val_f1((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        precision = self.val_precision((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        recall = self.val_recall((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        
        self.log('val/loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val/acc', acc, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val/f1', f1, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val/precision', precision, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val/recall', recall, on_step=False, on_epoch=True, prog_bar=True)

        return loss



    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x.unsqueeze(1))
        
        loss = self.criterion(logits, y.float().unsqueeze(1))
        acc = self.test_accuracy((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        f1 = self.test_f1((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        precision = self.test_precision((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))
        recall = self.test_recall((torch.sigmoid(logits) > 0.5).float(), y.unsqueeze(1))

        



        self.log('test/loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('test/acc', acc, on_step=False, on_epoch=True, prog_bar=True)
        self.log('test/f1', f1, on_step=False, on_epoch=True, prog_bar=True)
        self.log('test/precision', precision, on_step=False, on_epoch=True, prog_bar=True)
        self.log('test/recall', recall, on_step=False, on_epoch=True, prog_bar=True)
        return loss




    def configure_optimizers(self):
        optimizer =  torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay = self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.95)

        return {
           "optimizer": optimizer,
           "lr_scheduler": {
               "scheduler": scheduler,
               "monitor": "val_loss",
           },
        }


## 2D model 

In [11]:
import torch
import torch.nn as nn
import torchvision.models as models

class BrainMRINet(nn.Module):
    def __init__(self, num_classes: int = 2, pretrained: bool = True):
        super(BrainMRINet, self).__init__()
        # Load DenseNet-121
        densenet = models.densenet121(weights='IMAGENET1K_V1')

        # Replace the classifier (classifier is a single Linear layer in DenseNet)
        num_features = densenet.classifier.in_features
        densenet.classifier = nn.Sequential(
            nn.Linear(num_features, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

        self.model = densenet

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

In [12]:
class BrainMRILightningModule(pl.LightningModule): 
    
    def __init__(self, net: nn.Module,  learning_rate=1e-3, weight_decay = 1e-2, batch_size = 32 ):
        super().__init__()

        self.save_hyperparameters(ignore=['net'])

        self.model = net 
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.batch_size = batch_size


        self.train_acc = Accuracy(task= "multiclass",  num_classes=net.model.classifier[-1].out_features)
        self.val_acc = Accuracy(task= "multiclass", num_classes=net.model.classifier[-1].out_features)
        
        self.test_acc = Accuracy(task= "multiclass",  num_classes=net.model.classifier[-1].out_features)
        
        # F1 score 
        self.val_f1 = F1Score(task="multiclass", num_classes=net.model.classifier[-1].out_features)
        self.test_f1 = F1Score(task="multiclass", num_classes=net.model.classifier[-1].out_features)
        # Precision
        self.val_precision = Precision(task="multiclass", num_classes=net.model.classifier[-1].out_features)   
        self.test_precision = Precision(task="multiclass", num_classes=net.model.classifier[-1].out_features)
        # Recall
        self.val_recall = Recall(task="multiclass", num_classes=net.model.classifier[-1].out_features)
        self.test_recall = Recall(task="multiclass", num_classes=net.model.classifier[-1].out_features)

        self.criterion = nn.CrossEntropyLoss()


    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        optimizer =  optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
        return {
            "optimizer": optimizer,
            "gradient_clip_val": 1.0,  # Adjust value as needed
        }
    

    def training_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x)
        
        loss = nn.CrossEntropyLoss()(outputs, y)
        preds = torch.argmax(outputs, dim=1)
        
        # Update metrics
        acc = self.train_acc(preds, y)
        

        self.log("train/loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train/acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        
        return loss
    

    def validation_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x)
        
        loss = self.criterion(outputs, y)
        preds = torch.argmax(outputs, dim=1)
        
        # Update metrics
       
        acc =  self.val_acc(preds, y)
        f1 =  self.val_f1(preds, y)
        precision =  self.val_precision(preds, y)
        recall =  self.val_recall(preds, y)
        
        # Log metrics
        self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val/acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val/f1", f1, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val/precision", precision, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val/recall", recall, on_step=False, on_epoch=True, prog_bar=True)

        return loss
    

    def test_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x)
        
        loss = self.criterion(outputs, y)
        preds = torch.argmax(outputs, dim=1)
        
        # Update metrics
        acc = self.test_acc(preds, y)
        f1 = self.test_f1(preds, y)
        precision = self.test_precision(preds, y)
        recall = self.test_recall(preds, y)
        
        # Log metrics
        self.log("test/loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test/acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test/f1", f1, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test/precision", precision, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test/recall", recall, on_step=False, on_epoch=True, prog_bar=True)

        return loss

# Client

In [13]:
import torch 
from collections import OrderedDict
from flwr.client import NumPyClient
from flwr.common import  Context
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn as nn 
import logging 
import lightning as pl
import warnings 
from lightning.pytorch.loggers.wandb import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint

2025-06-03 09:05:56.154917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748941556.365704      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748941556.431894      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
def get_parameters(net) -> List[np.ndarray]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]


def set_parameters(net, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.from_numpy(np.copy(v)) for k, v in params_dict})
    net.load_state_dict(state_dict, strict=True)

In [15]:
class FlowerLightningClient(NumPyClient):


    def __init__(self, model: pl.LightningModule, train_dataloader, val_dataloader, epochs, batch_size, device, client_id): 

        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.epochs = epochs
        self.device = device 
        self.client_id = client_id
        self.batch_size = batch_size
    
    def get_parameters(self, config):

        return [val.cpu().numpy() for _, val in self.model.state_dict().items()]

    def set_parameters(self, parameters):
        if not parameters:
            return

        params_dict = zip(self.model.state_dict().keys(), parameters)
        state_dict = OrderedDict()
        for k, v in params_dict:
            state_dict[k] = torch.tensor(v)
            

        if state_dict:
            self.model.load_state_dict(state_dict, strict=False)

        
    def fit(self, parameters, config):

        self.set_parameters(parameters)
        
        checkpoint_callback = ModelCheckpoint(
            dirpath=f"./checkpoints/client_{self.client_id}",
            filename=f"round_{config.get('round_num', 0)}" + "-{epoch:02d}",
            save_top_k=1,
            monitor="val/loss",
            mode="min"
        )

        trainer = pl.Trainer(
            max_epochs=self.epochs,
            accelerator="auto",
            devices=1,
            callbacks=[checkpoint_callback],
            enable_progress_bar=False, 
            log_every_n_steps=1
        )

        trainer.fit(self.model, train_dataloaders=self.train_dataloader, val_dataloaders=self.val_dataloader)

        callback_metrics = trainer.callback_metrics

        train_loss = callback_metrics.get("train/loss", 0)
        train_accuracy = callback_metrics.get("train/acc", 0)
        val_loss = callback_metrics.get("val/loss", 0)
        val_accuracy = callback_metrics.get("val/acc", 0)
        val_precision = callback_metrics.get("val/precision", 0)
        val_recall = callback_metrics.get("val/recall", 0)
        val_f1 = callback_metrics.get("val/f1", 0)



        metrics = {
            "train_loss": train_loss.item() if isinstance(train_loss, torch.Tensor) else float(train_loss),
            "train_accuracy": train_accuracy.item() if isinstance(train_accuracy, torch.Tensor) else float(train_accuracy),
            "val_loss": val_loss.item() if isinstance(val_loss, torch.Tensor) else float(val_loss),
            "val_accuracy": val_accuracy.item() if isinstance(val_accuracy, torch.Tensor) else float(val_accuracy),
            "val_precision": val_precision.item() if isinstance(val_precision, torch.Tensor) else float(val_precision),
            "val_recall": val_recall.item() if isinstance(val_recall, torch.Tensor) else float(val_recall),
            "val_f1": val_f1.item() if isinstance(val_f1, torch.Tensor) else float(val_f1)
        }



        return self.get_parameters(config={}), len(self.train_dataloader.dataset), metrics


    def evaluate(self, parameters, config):

        self.set_parameters(parameters)

       
        trainer = pl.Trainer(
            accelerator="auto",
            devices=1 ,
            enable_progress_bar=False
        )

        results = trainer.test(self.model, dataloaders=self.val_dataloader)
        
        callback_metrics = trainer.callback_metrics

        test_loss = callback_metrics.get("test/loss", 0)
        test_accuracy = callback_metrics.get("test/acc", 0)
        test_f1 = callback_metrics.get("test/f1", 0)
        test_precision = callback_metrics.get("test/precision", 0)
        test_recall = callback_metrics.get("test/recall", 0)
        
        # Additional metrics
        metrics = {
            "test_loss": test_loss.item() if isinstance(test_loss, torch.Tensor) else float(test_loss),
            "test_accuracy": test_accuracy.item() if isinstance(test_accuracy, torch.Tensor) else float(test_accuracy),
            "test_f1": test_f1.item() if isinstance(test_f1, torch.Tensor) else float(test_f1),
            "test_precision": test_precision.item() if isinstance(test_precision, torch.Tensor) else float(test_precision),
            "test_recall": test_recall.item() if isinstance(test_recall, torch.Tensor) else float(test_recall)
        }

       
        return float(test_loss), len(self.val_dataloader.dataset), metrics


In [16]:
def create_lightning_client_fn(device, epochs, client_datasets, batch_size, num_workers, pl_model):

    def client_fn(context: Context):
        
        
        client_id = context.node_config['partition-id']
        train_dataset, val_dataset = client_datasets[client_id]

        train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers = num_workers)
        val_dataloader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False, num_workers = num_workers)
        return FlowerLightningClient(pl_model, train_dataloader, val_dataloader, epochs, batch_size, device, client_id).to_client()

    return client_fn

In [17]:
import random
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
import flwr as fl
from flwr.common import (
    EvaluateRes,
    FitIns,
    FitRes,
    Parameters,
    EvaluateIns,
)
from flwr.server.client_proxy import ClientProxy
from flwr.server.client_manager import ClientManager
from flwr.server.strategy import FedAvg
from flwr.common.parameter import parameters_to_ndarrays
from flwr.common import ndarrays_to_parameters
import numpy as np 
import torch
from collections import OrderedDict
import wandb


In [18]:
class FedAR(FedAvg):
    def __init__(
        self, 
        net, 
        dropout_rate_training: float = 0.3, 
        dropout_rate_eval: float = 0.3, 
        fixed_clients: Optional[List[int]] = None, 
        dropout_pattern_train: str = "random", 
        dropout_pattern_eval: str = "random",
        eta: float = 0.1,       # Server learning rate (η)
        rho: float = 0.1,        # Staleness exponent (ρ)
        psi_max: float = 2.0,    # Max weight (ψ_max)
        t0: float = 1.0,     
        b: float = 3.0,      # (γ)
        initial_parameters: Optional[Parameters] = None,
        **kwargs):
    
        if "fit_metrics_aggregation_fn" not in kwargs:
            kwargs["fit_metrics_aggregation_fn"] = self.weighted_average
        if "evaluate_metrics_aggregation_fn" not in kwargs:
            kwargs["evaluate_metrics_aggregation_fn"] = self.weighted_average

        super().__init__(**kwargs)
        self.dropout_rate_training = dropout_rate_training
        self.dropout_rate_eval = dropout_rate_eval
        self.fixed_clients = fixed_clients or []
        self.dropout_pattern_train = dropout_pattern_train
        self.dropout_pattern_eval = dropout_pattern_eval
        self.current_round = 0
        self.dropped_clients_history_training: Dict[int, List[int]] = {}
        self.dropped_clients_history_evaluation: Dict[int, List[int]] = {}

        # For tracking metrics
        self.fit_metrics_history: List[Dict[str, float]] = []
        self.eval_metrics_history: List[Dict[str, float]] = []

        self.net = net
        self.eta = eta
        self.rho = rho
        self.psi_max = psi_max
        self.t0 = t0
        self.b = b

        # Per-client state
        self.G: dict[str, list[np.ndarray]] = {}  # Surrogate updates
        self.tau: dict[str, int] = {}             # Inactivity counters
        self.psi: dict[str, float] = {}           # Staleness weights
        self.E: set[str] = set()                  # Ever-active clients
        self.initialized = False                  # Track if G is initialized
        self.current_params = initial_parameters  # Initialize here

    def initialize_parameters(self, client_manager: ClientManager) -> Optional[Parameters]:
        current_params = get_parameters(self.net)
        return ndarrays_to_parameters(current_params)

    def initialize_state(self, parameters: Parameters):
        """Initialize surrogates to zero for all clients in E."""
        if self.initialized: 
            return
        w_global = parameters_to_ndarrays(parameters)
        for cid in self.E:
            if cid not in self.G:  # Initialize new clients
                self.G[cid] = [np.zeros_like(arr, dtype=np.float32) for arr in w_global]
                self.tau[cid] = 0  # Reset inactivity counter
                self.psi[cid] = 1.0  # Default weight for active client
        self.initialized = True
        
    def weighted_average(self, metrics: List[Tuple[int, Dict]]) -> Dict:
        """Aggregate metrics using weighted average based on number of samples."""
        if not metrics:
            return {}

        total_examples = sum([num_examples for num_examples, _ in metrics])
        weighted_metrics = {}

        for metric_key in metrics[0][1].keys():
            weighted_sum = sum(
                metric_dict[metric_key] * num_examples
                for num_examples, metric_dict in metrics
                if metric_key in metric_dict
            )
            weighted_metrics[metric_key] = weighted_sum / total_examples if total_examples > 0 else 0

        return weighted_metrics


    def configure_fit( self, server_round: int, parameters: Parameters, client_manager: ClientManager) -> List[Tuple[ClientProxy, FitIns]]:
        """Configure the next round of training with client dropout."""
        self.current_round = server_round
        for client in client_manager.all().values():
            cid = client.cid
            if cid not in self.E:
                self.E.add(cid)
                
        sample_size, min_num_clients = self.num_fit_clients(
            client_manager.num_available()
        )

        chosen_clients = client_manager.sample(
            num_clients=sample_size,
            min_num_clients=min_num_clients,
        )

        fit_instructions: List[Tuple[ClientProxy, FitIns]] = []

        self.current_params = parameters

        

        self.initialize_state(parameters)

        for client in chosen_clients:
            fit_instructions.append((client, FitIns(self.current_params, {})))

        available_clients = self._apply_dropout(
            fit_instructions,
            dropout_rate=self.dropout_rate_training,
            dropout_pattern=self.dropout_pattern_train,
        )

        client_ids = [int(client.cid) for client, _ in fit_instructions]
        available_client_ids = [int(client.cid) for client, _ in available_clients]
        dropped_clients = [cid for cid in client_ids if cid not in available_client_ids]

        self.dropped_clients_history_training[server_round] = dropped_clients

        wandb.log({
            "train_dropout_history": len(dropped_clients)
        })

        print(f"Round {server_round}: {len(dropped_clients)} clients dropped out of {len(client_ids)} during training")
        print(f"Dropped client IDs: {dropped_clients}")

        return available_clients
    
        
    def configure_evaluate( self, server_round: int, parameters: Parameters, client_manager: ClientManager) -> List[Tuple[ClientProxy, EvaluateIns]]:
        self.current_round = server_round

        client_evaluate_instructions = super().configure_evaluate(
            server_round, parameters, client_manager
        )

        if not client_evaluate_instructions: return []

        available_clients = self._apply_dropout(client_evaluate_instructions, dropout_rate=self.dropout_rate_eval, dropout_pattern=self.dropout_pattern_eval)

        client_ids = [int(client.cid) for client, _ in client_evaluate_instructions]
        available_client_ids = [int(client.cid) for client, _ in available_clients]
        dropped_clients = [cid for cid in client_ids if cid not in available_client_ids]

        self.dropped_clients_history_evaluation[server_round] = dropped_clients
        wandb.log({
            "eval_dropout_history" : len(dropped_clients) 
        })

        print(f"Round {server_round}: {len(dropped_clients)} clients dropped out of {len(client_ids)} during evaluation")
        print(f"Dropped client IDs: {dropped_clients}")

        return available_clients

    def _apply_dropout(self, client_instructions: List[Tuple[ClientProxy, Union[FitIns, EvaluateIns ]]], dropout_pattern: str, dropout_rate: 0.3) -> List[Tuple[ClientProxy, FitIns]]:
        """Apply dropout to clients based on the specified pattern."""
        if len(client_instructions) == 0:
            return []

        # Get all client IDs
        all_clients = [(client, ins) for client, ins in client_instructions]
        all_client_ids = [int(client.cid) for client, _ in all_clients]

        # Determine which clients will drop out
        dropout_mask = [False] * len(all_clients)

        if dropout_pattern == "random":
           
            for i, cid in enumerate(all_client_ids):
                
                if cid in self.fixed_clients:
                    continue
            
                if random.random() < dropout_rate:
                    dropout_mask[i] = True

        elif dropout_pattern == "alternate":
         
            if self.current_round % 2 == 1:  
                for i, cid in enumerate(all_client_ids):
                    if cid not in self.fixed_clients:
                        dropout_mask[i] = True

        elif dropout_pattern == "fixed":
      
            n_dropout = int(len(all_clients) * dropout_rate)
            for i in range(n_dropout):
                if all_client_ids[i] not in self.fixed_clients:
                    dropout_mask[i] = True

        
        available_clients = [
            (client, ins) for i, (client, ins) in enumerate(all_clients)
            if not dropout_mask[i]
        ]

        return available_clients

    def g(self, server_round: int):
        return self.t0 + server_round/self.b 

    def aggregate_fit(self, server_round: int, results: List[Tuple[ClientProxy, FitRes]], failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]]):

        if not results:
            return None, {}

        #valid_results = [(client_proxy, res) for client_proxy, res in results if res.status.code == Code.OK]
        #if not valid_results:
            #return None, {}

        g_t = self.g(server_round)
        w_global = parameters_to_ndarrays(self.current_params)
        updated_cids = set()
        for client, fit_res in results:
            cid = client.cid
            w_client = parameters_to_ndarrays(fit_res.parameters)
            self.G[cid] = [
                (w_g - w_c) / self.eta 
                for w_g, w_c in zip(w_global, w_client)
            ]
            self.tau[cid] = 0     # Reset inactivity counter (Eq. 5)
            self.psi[cid] = 1.0   # Active clients: ψ=1 (Eq. 6)
            updated_cids.add(cid)
        print(len(updated_cids))
            
        for cid in self.E:
            if cid not in updated_cids:
                self.tau[cid] += 1
                
        for cid in self.E:
            tau_val = self.tau[cid]
            if tau_val >= g_t:  # Exclude stale clients
                self.psi[cid] = 0.0
            else:
                # ψ = min( (τ+1)^ρ, ψ_max )
                self.psi[cid] = min((tau_val + 1) ** self.rho, 2)
                
        contributing_cids = [client.cid for client, _ in results]
        N_t = len(contributing_cids)
        if N_t == 0:
            return None, {}

        weighted_update = None
        for cid in contributing_cids:
            psi_i = self.psi[cid]
            update_i = [psi_i * g_arr for g_arr in self.G[cid]]
            if weighted_update is None:
                weighted_update = update_i
            else:
                weighted_update = [
                    w_u + u_i for w_u, u_i in zip(weighted_update, update_i)
                ]
        avg_update = [arr / N_t for arr in weighted_update]

        new_weights = [
            w_global[i] - self.eta * avg_update[i]
            for i in range(len(w_global))
        ]
        aggregated = ndarrays_to_parameters(new_weights)
        self.current_params = aggregated

        if aggregated is not None:
            aggregated_ndarrays: list[np.ndarray] = fl.common.parameters_to_ndarrays(
                aggregated
            )

            params_dict = zip(self.net.state_dict().keys(), aggregated_ndarrays)
            state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict})
            self.net.load_state_dict(state_dict, strict=True)

            torch.save(self.net.state_dict(), f"model_round_{server_round}.pth")
        aggregated_metrics = {}
        if results:
            metrics = [(res.num_examples, res.metrics) for _, res in results]
            aggregated_metrics = self.weighted_average(metrics)
            self.fit_metrics_history.append(aggregated_metrics)

            wandb.log({
                "train_server_round": server_round, 
                "train_accuracy": aggregated_metrics.get("train_accuracy", 0.0), 
                "train_loss" : aggregated_metrics.get("train_loss", 0.0)
            })


            print(f"Round {server_round} training metrics: {aggregated_metrics}")

        return aggregated, aggregated_metrics

    def aggregate_evaluate( self, server_round: int, results: List[Tuple[ClientProxy, EvaluateRes]],  failures: List[Union[Tuple[ClientProxy, EvaluateRes], BaseException]]):
        
        aggregated = super().aggregate_evaluate(server_round, results, failures)

        if results:
            metrics = [(res.num_examples, res.metrics) for _, res in results]
            aggregated_metrics = self.weighted_average(metrics)
            self.eval_metrics_history.append(aggregated_metrics)

            print(f"Round {server_round} evaluation metrics: {aggregated_metrics}")

            wandb.log({
                "server_round_eval" : server_round,
                "test_accuracy": aggregated_metrics.get("test_accuracy", 0.0), 
                "test_loss" : aggregated_metrics.get("test_loss", 0.0), 
                 "test_f1": aggregated_metrics.get("test_f1", 0.0), 
                "test_precision" : aggregated_metrics.get("test_precision", 0.0), 
                 "test_recall": aggregated_metrics.get("test_recall", 0.0), 
            })

        return aggregated

    def get_dropout_history(self) -> Dict[int, List[int]]:
        return self.dropped_clients_history_training, self.dropped_clients_history_evaluation

    def get_metrics_history(self) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]: 
        return self.fit_metrics_history, self.eval_metrics_history

In [19]:
from flwr.client import ClientApp
from flwr.server import ServerApp
from flwr.simulation import run_simulation
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, List, Optional
import os
from flwr.common import Context
import torch 
import lightning as pl
from typing import Union

In [20]:
def run_dropout_experiment(
    client_fn_creator,
    pl_model : Union[pl.LightningModule, torch.nn.Module], 
    num_clients: int,
    num_rounds: int = 10,
    dropout_rate_training: float = 0.3,
    dropout_rate_eval: float = 0.3,
    dropout_pattern_train: str = "random",
    dropout_pattern_eval: str = "random",
    fixed_clients: Optional[List[int]] = None,
    experiment_name: str = "dropout_experiment",
    save_dir: str = "model_weights",
    num_gpus : int = 0, 
    resource_config : Optional[Dict[str, float]] = None,

):
    
      # Configure client app
    print(f"\nStarting experiment: {experiment_name}")
    print(f"Dropout rate training: {dropout_rate_training}, Pattern: {dropout_pattern_train}")
    print(f"Dropout rate evaluation: {dropout_rate_eval}, Pattern: {dropout_pattern_eval   }")
    print(f"Number of GPUs: {num_gpus}")
    print(f"Number of clients: {num_clients}")
    print(f"Number of rounds: {num_rounds}")
    print(f"Fixed clients: {fixed_clients or []}")

    # Create strategy with dropout
    strategy = FedAR(
        net=pl_model.model if isinstance(pl_model, pl.LightningModule) else pl_model,
        dropout_rate_training=dropout_rate_training,
        dropout_rate_eval=dropout_rate_eval,
        dropout_pattern_train=dropout_pattern_train,
        dropout_pattern_eval=dropout_pattern_eval,
        fixed_clients=fixed_clients or [],
        fraction_fit=1.0,
        fraction_evaluate=1.0,
        min_fit_clients=1,
        min_evaluate_clients=1,
        min_available_clients=1,

    )

    # Configure server with strategy
    def server_fn(server_context: Context):
        from flwr.server import ServerAppComponents, ServerConfig
        config = ServerConfig(num_rounds=num_rounds)
        return ServerAppComponents(strategy=strategy, config=config)


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    epochs = resource_config.get("epochs", 1) if resource_config else 1
    client_datasets = resource_config.get("client_datasets", {}) if resource_config else {}

    
    batch_size = resource_config.get("batch_size", 32) if resource_config else 32
    learning_rate = resource_config.get("learning_rate", 0.001) if resource_config else 0.001
    num_workers = resource_config.get("num_workers", 1) if resource_config else 1
    client_fn = client_fn_creator(device=device, epochs=epochs, client_datasets=client_datasets
                                , batch_size=batch_size, pl_model=pl_model, num_workers=num_workers)
    
    # Create client and server apps
    client_app = ClientApp(client_fn=client_fn)
    server_app = ServerApp(server_fn=server_fn)

    # Configure backend
    backend_config = {
        "client_resources": {
            "num_cpus": 1,
            "num_gpus": num_gpus,
        }
    }
    history = strategy.get_dropout_history()
    # Run simulation
    try:
        run_simulation(
            client_app=client_app,
            server_app=server_app,
            num_supernodes=num_clients,
            backend_config=backend_config,
        )

        # Get metrics directly from strategy
        fit_metrics, eval_metrics = strategy.get_metrics_history()

        # Format metrics for plotting
        rounds = list(range(1, len(eval_metrics) + 1))

        train_accuracy_values = [metrics.get("train_accuracy", 0.0) for metrics in fit_metrics]
        train_loss_values = [metrics.get("train_loss", 0.0) for metrics in fit_metrics]
        

        test_accuracy_values = [metrics.get("test_accuracy", 0.0) for metrics in eval_metrics]
        test_loss_values = [metrics.get("test_loss", 0.0) for metrics in eval_metrics]
        test_f1_values = [metrics.get("test_f1", 0.0) for metrics in eval_metrics]
        test_precision_values = [metrics.get("test_precision", 0.0) for metrics in eval_metrics]
        test_recall_values = [metrics.get("test_recall", 0.0) for metrics in eval_metrics]    


        # cleanup_wandb_loggers()

        results = {
            "rounds": rounds,
            "train_accuracy": train_accuracy_values,
            "train_loss": train_loss_values,

            "test_accuracy": test_accuracy_values,
            "test_loss": test_loss_values,
            "test_f1": test_f1_values,
            "test_precision": test_precision_values,
            "test_recall": test_recall_values
        }

        return results, history
    
    except Exception as e:
        print(f"Error in dropout experiment: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e)}

# Main

In [21]:
from typing import List, Dict, Tuple, Optional, Union
import torch.nn as nn 
import hydra 
from omegaconf import DictConfig, OmegaConf
import logging 
import wandb 
import os 
import warnings
import lightning as pl 

## Config 

In [22]:
config = {
    "base_path": "/kaggle",
    "device": "cuda",  # from train.device
    "run_name": "dropout_0.5_2D_FedAR",  # resolved using experiment values
    "seed": 42,
    "num_clients": 10,
    "num_rounds": 10,
    "gpus": 1,
    
    "train": {
        "batch_size": 16,
        "learning_rate": 0.1,
        "epochs": 1,
        "device": "cuda",
        "num_workers": 2,
        "weight_decay": 0.0001,
        "scheduler": {
            "use": False,
            "type": "cosine",
            "warmup_epochs": 1,
            "min_lr": 0.01,
        }
    },

    "experiment": {
        "pattern_train": "random",
        "pattern_eval": "random",
        "dropout_rate_training": 0.0,
        "dropout_rate_eval": 0.0,
        "fixed_clients": [0, 1, 2, 3],
        "name": "dropout_0.0_2D_FedAR",
    },

    "data": {
        "root_path": "/kaggle/input/mri-dataset/datasetzip/not_skull_stripped",
        "label_path": "/kaggle/input/mri-label/label.csv",
        "val_ratio": 0.2,
        "overlap_ratio": 0.2,
        "distribution": "same",
    },
    "is_3d": False
}


In [23]:
def run_experiment_with_lightning(cfg_dict: dict) -> None:
    cfg: DictConfig = OmegaConf.create(cfg_dict)  # convert to DictConfig to retain dot-access
    logger =  logger = logging.getLogger(__name__)
    logger.info(f"Running experiment with config: {cfg.experiment.name}")
    logger.info(f"Config: {OmegaConf.to_yaml(cfg)}")

    wandb.login(
        key=WANDB_APIKEY 
    )

    wandb.init(
        project="federated-mri-server",
        name=f"{cfg.experiment.name}",
        config=OmegaConf.to_container(cfg, resolve=True),
        group="server"
    )

    device = cfg.device
    epochs = cfg.train.epochs

    logger.info("Loading model")

    net : nn.Module() = BrainMRINet()
    pl_model : pl.LightningModule = BrainMRILightningModule(net = net )


    
    # net: nn.Module = DenseNet()
    # pl_model: pl.LightningModule = DenseNetModule(net = net)

    logger.info("Loading dataset")
    is_3d = True if isinstance(pl_model, DenseNetModule) else False
    print(f"Is 3D: {is_3d}")

    full_dataset = MRIDataset(
        root_dir=cfg.data.root_path,
        label_path=cfg.data.label_path,
        is_3d=is_3d
    )

    logger.info(f"Dataset loaded successfully with len is {len(full_dataset)}")
    logger.info(f"Splitting dataset into {cfg.num_clients} clients")

    if cfg.data.distribution == "iid":
        client_datasets = iid_client_split(
            full_dataset,
            num_client=cfg.num_clients,
            val_ratio=cfg.data.val_ratio
        )
    elif cfg.data.distribution == "same":
        client_datasets = same_distribution_client_split(
            full_dataset,
            num_client=cfg.num_clients,
            val_ratio=cfg.data.val_ratio,
            overlap_ratio=cfg.data.overlap_ratio,
            root_dir=cfg.data.root_path,
            is_3d=is_3d
        )
    else:
        raise ValueError(f"Unknown distribution type: {cfg.data.distribution}")

    logger.info(f"Client datasets created successfully with {len(client_datasets)} clients")

    resources = {
        "client_datasets": client_datasets,
        "device": device,
        "epochs": epochs,
        "batch_size": cfg.train.batch_size,
        "learning_rate": cfg.train.learning_rate,
        "num_workers": cfg.train.num_workers
    }

    logger.info("Running experiments with PyTorch Lightning")

    results, history = run_dropout_experiment(
        client_fn_creator=create_lightning_client_fn,
        pl_model=pl_model,
        num_clients=cfg.num_clients,
        num_rounds=cfg.num_rounds,
        dropout_rate_training=cfg.experiment.dropout_rate_training,
        dropout_rate_eval=cfg.experiment.dropout_rate_eval,
        dropout_pattern_train=cfg.experiment.pattern_train,
        dropout_pattern_eval=cfg.experiment.pattern_eval,
        experiment_name=cfg.experiment.name,
        num_gpus=cfg.gpus,
        resource_config=resources
    )

    logger.info("Run successfully + wandb tracking")
    print(f"Result is {results}")

    # for idx in range(len(results["rounds"])):
    #     wandb.log({
    #         "round": idx,
    #         "train_accuracy": results["train_accuracy"][-1] if idx > len(results["train_accuracy"]) - 1 else results["train_accuracy"][idx],
    #         "train_loss": results["train_loss"][-1] if idx > len(results["train_loss"]) - 1 else results["train_loss"][idx],
    #         "test_accuracy": results["test_accuracy"][-1] if idx > len(results["test_accuracy"]) - 1 else results["test_accuracy"][idx],
    #         "test_loss": results["test_loss"][-1] if idx > len(results["test_loss"]) - 1 else results["test_loss"][idx],
    #         "test_f1": results["test_f1"][-1] if idx > len(results["test_f1"]) - 1 else results["test_f1"][idx],
    #         "test_precision": results["test_precision"][-1] if idx > len(results["test_precision"]) - 1 else results["test_precision"][idx],
    #         "test_recall": results["test_recall"][-1] if idx > len(results["test_recall"]) - 1 else results["test_recall"][idx],
    #     })

    # Log dropout history
    # for round_idx, dropped in history[0].items():
    #     wandb.log({
    #         "dropped_clients_train_count": len(dropped),
    #         "dropped_clients": wandb.Table(
    #             columns=["client_id"],
    #             data=[[client_id] for client_id in dropped]
    #         )
    #     })

    # for round_idx, dropped in history[1].items():
    #     wandb.log({
    #         "dropped_clients_eval_count": len(dropped),
    #         "dropped_clients": wandb.Table(
    #             columns=["client_id"],
    #             data=[[client_id] for client_id in dropped]
    #         )
    #     })

    wandb.finish()

    logger.info("Experiments completed successfully")
    logger.info(f"Client Dropout History: {history}")

    return results, history

In [24]:
run_experiment_with_lightning(config)

  return LooseVersion(v) >= LooseVersion(check)
  import imp  # pylint: disable=deprecated-module
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msang2222004[0m ([33msang2222004-uet-vnu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  self.scope.user = {"email": email}  # noqa


Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100%|██████████| 30.8M/30.8M [00:01<00:00, 22.2MB/s]


Is 3D: False


[92mINFO [0m:      Starting Flower ServerApp, config: num_rounds=10, no round_timeout
[92mINFO [0m:      
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      Evaluation returned no results (`None`)
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 1]



Starting experiment: dropout_0.0_2D_FedAR
Dropout rate training: 0.0, Pattern: random
Dropout rate evaluation: 0.0, Pattern: random
Number of GPUs: 1
Number of clients: 10
Number of rounds: 10
Fixed clients: []


[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 1: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(pid=460)[0m 2025-06-03 09:15:39.948298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=460)[0m E0000 00:00:1748942139.971412     460 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=460)[0m E0000 00:00:1748942139.978330     460 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(pid=459)[0m 2025-06-03 09:15:39.948188: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: 

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 1 training metrics: {'train_loss': 0.6954506263689461, 'train_accuracy': 0.5397329902871227, 'val_loss': 0.6638506806156953, 'val_accuracy': 0.5950948914169988, 'val_precision': 0.5950948914169988, 'val_recall': 0.5950948914169988, 'val_f1': 0.5950948914169988}
Round 1: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
[36m(ClientAppActor pid=459)[0m │         test/acc          │           0.625           │
[36m(ClientAppActor pid=459)[0m │          test/f1          │           0.625           │
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.6664143800735474     │
[36m(ClientAppActor pid=459)[0m │      test/precision       │           0.625           │
[36m(ClientAppActor pid=459)[0m │        test/recall        │           0.625           │
[36m(ClientAppActor pid=459)[0m └───────────────────────────┴───────────────────────────┘


[36m(ClientAppActor pid=459)[0m INFO: `Trainer.fit` stopped: `max_epochs=1` reached.
[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 4x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.5166666507720947     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.5166666507720947     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │     0.684148907661438     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.5166666507720947     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.5166666507720947     

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 4x across cluster][0m
[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 2]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.7425742745399475     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.7425742745399475     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6307017803192139     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.7425742745399475     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.7425742745399475     

[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_0 exists and is not empty.
[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=460)[0m INFO: 
[36m(ClientAppActor pid=460)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=460)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=460)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=460)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 2 training metrics: {'train_loss': 0.6649422445118389, 'train_accuracy': 0.5920745988952126, 'val_loss': 0.6482905142133987, 'val_accuracy': 0.6266108573645094, 'val_precision': 0.6266108573645094, 'val_recall': 0.6266108573645094, 'val_f1': 0.6266108573645094}
Round 2: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
[36m(ClientAppActor pid=459)[0m │         test/acc          │    0.4749999940395355     │
[36m(ClientAppActor pid=459)[0m │          test/f1          │    0.4749999940395355     │
[36m(ClientAppActor pid=459)[0m │         test/loss         │      0.6939697265625      │
[36m(ClientAppActor pid=459)[0m │      test/precision       │    0.4749999940395355     │
[36m(ClientAppActor pid=459)[0m │        test/recall        │    0.4749999940395355     │
[36m(ClientAppActor pid=459)[0m └───────────────────────────┴───────────────────────────┘
[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader

[36m(ClientAppActor pid=459)[0m INFO: `Trainer.fit` stopped: `max_epochs=1` reached.


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.6083333492279053     │
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.6083333492279053     │
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6560152173042297     │
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.6083333492279053     │
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.6083333492279053     │
[36m(ClientAppActor pid=460)[0m └───────────────────────────┴───────────────────────────┘


[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 4x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.574999988079071     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.574999988079071     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6715724468231201     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.574999988079071     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.574999988079071     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 4x across cluster][0m
[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 3]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 2 evaluation metrics: {'test_loss': 0.6602253396086274, 'test_accuracy': 0.5959079376368673, 'test_f1': 0.5959079376368673, 'test_precision': 0.5959079376368673, 'test_recall': 0.5959079376368673}
Round 3: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_0 exists and is not empty.
[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=460)[0m INFO: 
[36m(ClientAppActor pid=460)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=460)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=460)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=460)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 3 training metrics: {'train_loss': 0.6493892251093728, 'train_accuracy': 0.618139439542103, 'val_loss': 0.6351017388885217, 'val_accuracy': 0.6488110363597106, 'val_precision': 0.6488110363597106, 'val_recall': 0.6488110363597106, 'val_f1': 0.6488110363597106}
Round 3: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/acc          │    0.46666666865348816    │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │          test/f1          │    0.46666666865348816    │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.6884763836860657     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │      test/precision       │    0.46666666865348816    │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │        test/recall        │    0.46666666865348816    

[36m(ClientAppActor pid=460)[0m INFO: `Trainer.fit` stopped: `max_epochs=1` reached.
[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 3x across cluster][0m


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/acc          │    0.5333333611488342     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │          test/f1          │    0.5333333611488342     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.6674689054489136     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │      test/precision       │    0.5333333611488342     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │        test/recall        │    0.5333333611488342     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 5x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.5416666865348816     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.5416666865348816     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6695674061775208     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.5416666865348816     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.5416666865348816     

[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 4]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 3 evaluation metrics: {'test_loss': 0.6506723929752373, 'test_accuracy': 0.6018755375983261, 'test_f1': 0.6018755375983261, 'test_precision': 0.6018755375983261, 'test_recall': 0.6018755375983261}
Round 4: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_2 exists and is not empty.
[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=460)[0m INFO: 
[36m(ClientAppActor pid=460)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=460)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=460)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=460)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 4 training metrics: {'train_loss': 0.6334784268120331, 'train_accuracy': 0.6425090095935724, 'val_loss': 0.6148267088362817, 'val_accuracy': 0.6820459358518874, 'val_precision': 0.6820459358518874, 'val_recall': 0.6820459358518874, 'val_f1': 0.6820459358518874}
Round 4: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │            0.5            │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │            0.5            │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6780012845993042     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │            0.5            │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │            0.5            

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 3x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.6000000238418579     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.6000000238418579     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6475712656974792     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.6000000238418579     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.6000000238418579     

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 5x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.6666666865348816     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.6666666865348816     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6302539706230164     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.6666666865348816     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.6666666865348816     

[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 5]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 4 evaluation metrics: {'test_loss': 0.6435601292567078, 'test_accuracy': 0.6138107516983078, 'test_f1': 0.6138107516983078, 'test_precision': 0.6138107516983078, 'test_recall': 0.6138107516983078}
Round 5: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_2 exists and is not empty.
[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=460)[0m INFO: 
[36m(ClientAppActor pid=460)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=460)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=460)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=460)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 5 training metrics: {'train_loss': 0.6138293668864566, 'train_accuracy': 0.6662428354681434, 'val_loss': 0.6012655865037105, 'val_accuracy': 0.7075811166787557, 'val_precision': 0.7075811166787557, 'val_recall': 0.7075811166787557, 'val_f1': 0.7075811166787557}
Round 5: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/acc          │    0.5416666865348816     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │          test/f1          │    0.5416666865348816     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/loss         │     0.658025860786438     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │      test/precision       │    0.5416666865348816     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m │        test/recall        │    0.5416666865348816     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 4x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │           0.625           │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │           0.625           │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.6418447494506836     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │           0.625           │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │           0.625           

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 3x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.801980197429657     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.801980197429657     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5678902268409729     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.801980197429657     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.801980197429657     

[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 6]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 5 evaluation metrics: {'test_loss': 0.625996288486962, 'test_accuracy': 0.6632566127834011, 'test_f1': 0.6632566127834011, 'test_precision': 0.6632566127834011, 'test_recall': 0.6632566127834011}
Round 6: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_1 exists and is not empty.
[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=460)[0m INFO: 
[36m(ClientAppActor pid=460)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=460)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=460)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=460)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 6 training metrics: {'train_loss': 0.5914670087367101, 'train_accuracy': 0.695698235140536, 'val_loss': 0.5797211717538698, 'val_accuracy': 0.7289100263577999, 'val_precision': 0.7289100263577999, 'val_recall': 0.7289100263577999, 'val_f1': 0.7289100263577999}
Round 6: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.7083333134651184     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.7083333134651184     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5864861607551575     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.7083333134651184     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.7083333134651184     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 2x across cluster][0m


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/acc          │     0.752136766910553     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │          test/f1          │     0.752136766910553     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.5846002101898193     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │      test/precision       │     0.752136766910553     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │        test/recall        │     0.752136766910553     

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 6x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 6x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 6x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 6x across cluster][0m


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/acc          │    0.7166666388511658     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │          test/f1          │    0.7166666388511658     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.6095553636550903     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │      test/precision       │    0.7166666388511658     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m │        test/recall        │    0.7166666388511658     

[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 7]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 6 evaluation metrics: {'test_loss': 0.5983696473224084, 'test_accuracy': 0.7075873747506105, 'test_f1': 0.7075873747506105, 'test_precision': 0.7075873747506105, 'test_recall': 0.7075873747506105}
Round 7: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_0 exists and is not empty.
[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=459)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found u

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 7 training metrics: {'train_loss': 0.57215770026288, 'train_accuracy': 0.7143462560873564, 'val_loss': 0.5595616697759844, 'val_accuracy': 0.7510805226125109, 'val_precision': 0.7510805226125109, 'val_recall': 0.7510805226125109, 'val_f1': 0.7510805226125109}
Round 7: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.782608687877655     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.782608687877655     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5598809719085693     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.782608687877655     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.782608687877655     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 4x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.699999988079071     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.699999988079071     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5937467813491821     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.699999988079071     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.699999988079071     

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 3x across cluster][0m
[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 8]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.7692307829856873     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.7692307829856873     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5607014298439026     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.7692307829856873     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.7692307829856873     

[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_0 exists and is not empty.
[36m(ClientAppActor pid=460)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=460)[0m INFO: 
[36m(ClientAppActor pid=460)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=460)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=460)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=460)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=460)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 8 training metrics: {'train_loss': 0.5540827075968761, 'train_accuracy': 0.7387158244968244, 'val_loss': 0.536619291325246, 'val_accuracy': 0.7783450545408787, 'val_precision': 0.7783450545408787, 'val_recall': 0.7783450545408787, 'val_f1': 0.7783450545408787}
Round 8: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
[36m(ClientAppActor pid=459)[0m │         test/acc          │    0.7916666865348816     │
[36m(ClientAppActor pid=459)[0m │          test/f1          │    0.7916666865348816     │
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.5412177443504333     │
[36m(ClientAppActor pid=459)[0m │      test/precision       │    0.7916666865348816     │
[36m(ClientAppActor pid=459)[0m │        test/recall        │    0.7916666865348816     │
[36m(ClientAppActor pid=459)[0m └───────────────────────────┴───────────────────────────┘
[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 2x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.791304349899292     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.791304349899292     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5387614965438843     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.791304349899292     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.791304349899292     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 5x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.7916666865348816     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.7916666865348816     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5510151982307434     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.7916666865348816     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.7916666865348816     

[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 9]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


Round 8 evaluation metrics: {'test_loss': 0.5484245953047672, 'test_accuracy': 0.7715260072945532, 'test_f1': 0.7715260072945532, 'test_precision': 0.7715260072945532, 'test_recall': 0.7715260072945532}
Round 9: 0 clients dropped out of 10 during training
Dropped client IDs: []


[36m(ClientAppActor pid=459)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_1 exists and is not empty.
[36m(ClientAppActor pid=459)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=459)[0m INFO: 
[36m(ClientAppActor pid=459)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=459)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=459)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=459)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=459)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=459)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 9 training metrics: {'train_loss': 0.5296107274315371, 'train_accuracy': 0.7660521288296004, 'val_loss': 0.5168221042832724, 'val_accuracy': 0.7911317986713473, 'val_precision': 0.7911317986713473, 'val_recall': 0.7911317986713473, 'val_f1': 0.7911317986713473}
Round 9: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.7333333492279053     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.7333333492279053     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5602340698242188     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.7333333492279053     │[32m [repeated 2x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.7333333492279053     

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 4x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.800000011920929     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.800000011920929     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5209615230560303     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.800000011920929     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.800000011920929     

[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 3x across cluster][0m
[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 10]
[92mINFO [0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/acc          │     0.800000011920929     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │          test/f1          │     0.800000011920929     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │         test/loss         │    0.5350551605224609     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │      test/precision       │     0.800000011920929     │[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m │        test/recall        │     0.800000011920929     

[36m(ClientAppActor pid=459)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working/checkpoints/client_0 exists and is not empty.
[36m(ClientAppActor pid=459)[0m /usr/local/lib/python3.11/dist-packages/lightning/pytorch/core/optimizer.py:378: Found unsupported keys in the optimizer configuration: {'gradient_clip_val'}
[36m(ClientAppActor pid=459)[0m INFO: 
[36m(ClientAppActor pid=459)[0m    | Name           | Type                | Params | Mode 
[36m(ClientAppActor pid=459)[0m ----------------------------------------------------------------
[36m(ClientAppActor pid=459)[0m 0  | model          | BrainMRINet         | 7.2 M  | train
[36m(ClientAppActor pid=459)[0m 1  | train_acc      | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=459)[0m 2  | val_acc        | MulticlassAccuracy  | 0      | train
[36m(ClientAppActor pid=459)[0m 3  | test_acc       | MulticlassAccuracy  | 0      |

10


[92mINFO [0m:      configure_evaluate: strategy sampled 10 clients (out of 10)


Round 10 training metrics: {'train_loss': 0.5095909397514676, 'train_accuracy': 0.7808857814415352, 'val_loss': 0.4922794470230789, 'val_accuracy': 0.8192693996944779, 'val_precision': 0.8192693996944779, 'val_recall': 0.8192693996944779, 'val_f1': 0.8192693996944779}
Round 10: 0 clients dropped out of 10 during evaluation
Dropped client IDs: []


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: `Trainer.fit` stopped: `max_epochs=1` reached.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
[36m(ClientAppActor pid=460)[0m │         test/acc          │     0.824999988079071     │
[36m(ClientAppActor pid=460)[0m │          test/f1          │     0.824999988079071     │
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.49893683195114136    │
[36m(ClientAppActor pid=460)[0m │      test/precision       │     0.824999988079071     │
[36m(ClientAppActor pid=460)[0m │        test/recall        │     0.824999988079071     │
[36m(ClientAppActor pid=460)[0m └───────────────────────────┴───────────────────────────┘
[36m(ClientAppActor pid=459)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
[36m(ClientAppActor pid=459)[0m ┃        Test metric        ┃       DataLoader

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 3x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.8260869383811951     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.8260869383811951     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.49357813596725464    │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.8260869383811951     │[32m [repeated 3x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.8260869383811951     

[36m(ClientAppActor pid=459)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: GPU available: True (cuda), used: True[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: TPU available: False, using: 0 TPU cores[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: HPU available: False, using: 0 HPUs[32m [repeated 5x across cluster][0m
[36m(ClientAppActor pid=459)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1][32m [repeated 5x across cluster][0m


[36m(ClientAppActor pid=460)[0m ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┃        Test metric        ┃       DataLoader 0        ┃[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/acc          │    0.8166666626930237     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │          test/f1          │    0.8166666626930237     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │         test/loss         │    0.5131483674049377     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │      test/precision       │    0.8166666626930237     │[32m [repeated 4x across cluster][0m
[36m(ClientAppActor pid=460)[0m │        test/recall        │    0.8166666626930237     

[92mINFO [0m:      aggregate_evaluate: received 10 results and 0 failures
[92mINFO [0m:      
[92mINFO [0m:      [SUMMARY]
[92mINFO [0m:      Run finished 10 round(s) in 1310.77s
[92mINFO [0m:      	History (loss, distributed):
[92mINFO [0m:      		round 1: 0.6735665339846338
[92mINFO [0m:      		round 2: 0.6602253396086274
[92mINFO [0m:      		round 3: 0.6506723929752373
[92mINFO [0m:      		round 4: 0.6435601292567078
[92mINFO [0m:      		round 5: 0.625996288486962
[92mINFO [0m:      		round 6: 0.5983696473224084
[92mINFO [0m:      		round 7: 0.5722826218259507
[92mINFO [0m:      		round 8: 0.5484245953047672
[92mINFO [0m:      		round 9: 0.532280720840228
[92mINFO [0m:      		round 10: 0.5023106954490031
[92mINFO [0m:      	History (metrics, distributed, fit):
[92mINFO [0m:      	{'train_accuracy': [(1, 0.5397329902871227),
[92mINFO [0m:      	                    (2, 0.5920745988952126),
[92mINFO [0m:      	                    (3, 0.618139439542

Round 10 evaluation metrics: {'test_loss': 0.5023106954490031, 'test_accuracy': 0.8226768916626822, 'test_f1': 0.8226768916626822, 'test_precision': 0.8226768916626822, 'test_recall': 0.8226768916626822}


[36m(ClientAppActor pid=460)[0m INFO: Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(ClientAppActor pid=460)[0m INFO: GPU available: True (cuda), used: True
[36m(ClientAppActor pid=460)[0m INFO: TPU available: False, using: 0 TPU cores
[36m(ClientAppActor pid=460)[0m INFO: HPU available: False, using: 0 HPUs
[36m(ClientAppActor pid=460)[0m INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Result is {'rounds': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'train_accuracy': [0.5397329902871227, 0.5920745988952126, 0.618139439542103, 0.6425090095935724, 0.6662428354681434, 0.695698235140536, 0.7143462560873564, 0.7387158244968244, 0.7660521288296004, 0.7808857814415352], 'train_loss': [0.6954506263689461, 0.6649422445118389, 0.6493892251093728, 0.6334784268120331, 0.6138293668864566, 0.5914670087367101, 0.57215770026288, 0.5540827075968761, 0.5296107274315371, 0.5095909397514676], 'test_accuracy': [0.5745950595802053, 0.5959079376368673, 0.6018755375983261, 0.6138107516983078, 0.6632566127834011, 0.7075873747506105, 0.7280477415468585, 0.7715260072945532, 0.7860187686922605, 0.8226768916626822], 'test_loss': [0.6735665339846338, 0.6602253396086274, 0.6506723929752373, 0.6435601292567078, 0.625996288486962, 0.5983696473224084, 0.5722826218259507, 0.5484245953047672, 0.532280720840228, 0.5023106954490031], 'test_f1': [0.5745950595802053, 0.5959079376368673, 0.6018755375983261, 0.61381075

0,1
eval_dropout_history,▁▁▁▁▁▁▁▁▁▁
server_round_eval,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁▂▂▂▄▅▅▇▇█
test_f1,▁▂▂▂▄▅▅▇▇█
test_loss,█▇▇▇▆▅▄▃▂▁
test_precision,▁▂▂▂▄▅▅▇▇█
test_recall,▁▂▂▂▄▅▅▇▇█
train_accuracy,▁▃▃▄▅▆▆▇██
train_dropout_history,▁▁▁▁▁▁▁▁▁▁
train_loss,█▇▆▆▅▄▃▃▂▁

0,1
eval_dropout_history,0.0
server_round_eval,10.0
test_accuracy,0.82268
test_f1,0.82268
test_loss,0.50231
test_precision,0.82268
test_recall,0.82268
train_accuracy,0.78089
train_dropout_history,0.0
train_loss,0.50959


({'rounds': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
  'train_accuracy': [0.5397329902871227,
   0.5920745988952126,
   0.618139439542103,
   0.6425090095935724,
   0.6662428354681434,
   0.695698235140536,
   0.7143462560873564,
   0.7387158244968244,
   0.7660521288296004,
   0.7808857814415352],
  'train_loss': [0.6954506263689461,
   0.6649422445118389,
   0.6493892251093728,
   0.6334784268120331,
   0.6138293668864566,
   0.5914670087367101,
   0.57215770026288,
   0.5540827075968761,
   0.5296107274315371,
   0.5095909397514676],
  'test_accuracy': [0.5745950595802053,
   0.5959079376368673,
   0.6018755375983261,
   0.6138107516983078,
   0.6632566127834011,
   0.7075873747506105,
   0.7280477415468585,
   0.7715260072945532,
   0.7860187686922605,
   0.8226768916626822],
  'test_loss': [0.6735665339846338,
   0.6602253396086274,
   0.6506723929752373,
   0.6435601292567078,
   0.625996288486962,
   0.5983696473224084,
   0.5722826218259507,
   0.5484245953047672,
   0.532280720840228,