# Federated Learning Tutorial v7: Comprehensive Comparison

This notebook provides a complete comparison of different federated learning optimization strategies:

## Experiments:
1. **Baseline: Full FL (FP32)** - Standard federated learning with full precision
2. **Optimization 1: Full FL (FP16/AMP)** - Mixed precision training for speed
3. **Optimization 2: LoRA (FP32)** - Low-rank adaptation for communication efficiency
4. **Optimization 3: LoRA (FP16/AMP)** - Combined LoRA + Mixed Precision

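## Metrics Tracked:
- **Accuracy**: Final test accuracy (viewed in TensorBoard, Section 8)
- **Training Time**: Wall-clock time per experiment (recorded in the summary table)
- **Communication Cost**: Qualitative estimate (High/Low) of parameter transfer size
- **Convergence**: Loss curves across rounds (viewed in TensorBoard, Section 8)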

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, datasets
from model import ConvNet2, LoRAConvNet2
import numpy as np
import time
import pandas as pd
import copy

## 1. Pre-training Initialization

We pre-train the backbone for a few epochs on a small random subset of site1's training images so that all experiments start from the same stable set of weights.

In [None]:
def get_pretrain_loader(data_path, batch_size=16, num_samples=100):
    """Build a shuffled DataLoader over a random subset of site1's training images.

    Args:
        data_path: Root directory of the dataset; images are read from
            ``<data_path>/site1/train`` with ``datasets.ImageFolder``.
        batch_size: Number of images per batch.
        num_samples: Upper bound on the number of images drawn without
            replacement. It is capped at the dataset size so that a small
            folder yields a smaller subset instead of a ``ValueError`` from
            ``np.random.choice``.

    Returns:
        A ``DataLoader`` over the sampled ``Subset`` with ``shuffle=True``.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.2, 0.2, 0.2]),
    ])

    site1_train_path = os.path.join(data_path, "site1", "train")
    full_dataset = datasets.ImageFolder(root=site1_train_path, transform=transform)

    # Sampling without replacement cannot draw more items than exist.
    sample_count = min(num_samples, len(full_dataset))
    indices = np.random.choice(len(full_dataset), sample_count, replace=False)
    # Hand Subset plain Python ints rather than numpy scalars.
    subset = Subset(full_dataset, indices.tolist())

    return DataLoader(subset, batch_size=batch_size, shuffle=True)

def pretrain_model(model, loader, epochs=5, device="cpu"):
    """Train ``model`` on ``loader`` with Adam and cross-entropy loss.

    Args:
        model: ``nn.Module`` to train; it is moved to ``device`` and its
            parameters are updated in place.
        loader: Iterable yielding ``(images, labels)`` batches.
        epochs: Number of passes over ``loader``.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object, left in training mode on ``device``.
    """
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Batches are counted while iterating instead of calling len(loader):
        # an empty loader then reports nan rather than raising
        # ZeroDivisionError, and loaders without __len__ are accepted too.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Pre-train Epoch {epoch+1} Loss: {mean_loss:.4f}")

    return model

# Seed numpy and torch before any random work so the sampled pre-training
# subset (np.random.choice), the initial weights and the batch order are
# repeatable across Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dataset root, resolved against the notebook's working directory.
data_path = os.path.abspath("chest_xray")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("---- Starting Pre-training Initialization ----")
base_model = ConvNet2()
pretrain_loader = get_pretrain_loader(data_path)
# pretrain_model trains in place and returns the same object, so
# initialized_model and base_model refer to one model; the experiment cells
# take their own copies of it.
initialized_model = pretrain_model(base_model, pretrain_loader, device=device)
print("---- Pre-training Done ----")

## 2. Configuration

In [None]:
from nvflare.app_opt.pt.recipes.fedavg import FedAvgRecipe
from nvflare.recipe import SimEnv, add_experiment_tracking

# Settings shared by every experiment cell below.
n_clients = 3    # passed to SimEnv and used as min_clients of each recipe
num_rounds = 5   # passed as num_rounds to each FedAvgRecipe
batch_size = 16  # forwarded to client_xray.py through --batch_size

# Create simulation environment once
env = SimEnv(num_clients=n_clients)

# Storage for results
# One dict is appended per experiment; the summary cell turns the list into a DataFrame.
results = []

## 3. Experiment 1: Full FL (FP32) - Baseline

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 1: Full Federated Learning (FP32)")
print("="*60)

# Work on a private copy so the shared pre-trained model is left untouched
# for the other experiments.
model_full_fp32 = copy.deepcopy(initialized_model)
job_name = "chest-xray-full-fp32"

# NOTE(review): --use_amp is passed as the literal string "False". If
# client_xray.py parses it with argparse type=bool, any non-empty string is
# truthy and AMP would stay enabled — confirm the client's argument parsing.
recipe = FedAvgRecipe(
    name=job_name,
    min_clients=n_clients,
    num_rounds=num_rounds,
    model=model_full_fp32,
    train_script="client_xray.py",
    train_args=f"--batch_size {batch_size} --epochs 1 --data_path {data_path} --model_type full --use_amp False",
)

add_experiment_tracking(recipe, tracking_type="tensorboard")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long run.
# NOTE(review): assumes recipe.execute() blocks until the job is done — confirm.
start_time = time.perf_counter()
run = recipe.execute(env)
duration = time.perf_counter() - start_time

print(f"\nStatus: {run.get_status()}")
print(f"Duration: {duration:.2f} seconds")
print(f"Result: {run.get_result()}")

# Drop any row left by an earlier run of this cell so that re-running it
# replaces the entry instead of duplicating it in the summary table.
experiment_label = "Full FL (FP32)"
results[:] = [row for row in results if row["Experiment"] != experiment_label]
# "Params Updated" and "Communication Cost" are fixed labels, not measurements.
results.append({
    "Experiment": experiment_label,
    "Method": "Full Parameter Tuning",
    "Precision": "FP32",
    "Duration (sec)": round(duration, 2),
    "Params Updated": "100%",
    "Communication Cost": "High"
})

## 4. Experiment 2: Full FL (FP16/AMP) - Mixed Precision

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 2: Full Federated Learning (FP16/AMP)")
print("="*60)

# Work on a private copy so the shared pre-trained model is left untouched
# for the other experiments.
model_full_fp16 = copy.deepcopy(initialized_model)
job_name = "chest-xray-full-fp16"

# Same recipe as the FP32 baseline except for --use_amp True.
# NOTE(review): mixed precision itself is implemented in client_xray.py, which
# is not visible here — confirm how the flag is parsed and applied.
recipe = FedAvgRecipe(
    name=job_name,
    min_clients=n_clients,
    num_rounds=num_rounds,
    model=model_full_fp16,
    train_script="client_xray.py",
    train_args=f"--batch_size {batch_size} --epochs 1 --data_path {data_path} --model_type full --use_amp True",
)

add_experiment_tracking(recipe, tracking_type="tensorboard")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long run.
# NOTE(review): assumes recipe.execute() blocks until the job is done — confirm.
start_time = time.perf_counter()
run = recipe.execute(env)
duration = time.perf_counter() - start_time

print(f"\nStatus: {run.get_status()}")
print(f"Duration: {duration:.2f} seconds")
print(f"Result: {run.get_result()}")

# Drop any row left by an earlier run of this cell so that re-running it
# replaces the entry instead of duplicating it in the summary table.
experiment_label = "Full FL (FP16/AMP)"
results[:] = [row for row in results if row["Experiment"] != experiment_label]
# "Params Updated" and "Communication Cost" are fixed labels, not measurements.
results.append({
    "Experiment": experiment_label,
    "Method": "Full Parameter Tuning",
    "Precision": "FP16 (Mixed)",
    "Duration (sec)": round(duration, 2),
    "Params Updated": "100%",
    "Communication Cost": "High"
})

## 5. Experiment 3: LoRA (FP32) - Low-Rank Adaptation

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 3: LoRA Federated Learning (FP32)")
print("="*60)

# Build the LoRA model around a private copy of the pre-trained model.
# NOTE(review): LoRAConvNet2 lives in model.py (not visible here); presumably
# rank=8 is the adapter rank — confirm.
model_lora_fp32 = LoRAConvNet2(rank=8, base_model=copy.deepcopy(initialized_model))
job_name = "chest-xray-lora-fp32"

# NOTE(review): --use_amp is passed as the literal string "False". If
# client_xray.py parses it with argparse type=bool, any non-empty string is
# truthy and AMP would stay enabled — confirm the client's argument parsing.
recipe = FedAvgRecipe(
    name=job_name,
    min_clients=n_clients,
    num_rounds=num_rounds,
    model=model_lora_fp32,
    train_script="client_xray.py",
    train_args=f"--batch_size {batch_size} --epochs 1 --data_path {data_path} --model_type lora --use_amp False",
)

add_experiment_tracking(recipe, tracking_type="tensorboard")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long run.
# NOTE(review): assumes recipe.execute() blocks until the job is done — confirm.
start_time = time.perf_counter()
run = recipe.execute(env)
duration = time.perf_counter() - start_time

print(f"\nStatus: {run.get_status()}")
print(f"Duration: {duration:.2f} seconds")
print(f"Result: {run.get_result()}")

# Drop any row left by an earlier run of this cell so that re-running it
# replaces the entry instead of duplicating it in the summary table.
experiment_label = "LoRA (FP32)"
results[:] = [row for row in results if row["Experiment"] != experiment_label]
# "Params Updated" and "Communication Cost" are fixed labels, not measurements.
results.append({
    "Experiment": experiment_label,
    "Method": "Low-Rank Adaptation",
    "Precision": "FP32",
    "Duration (sec)": round(duration, 2),
    "Params Updated": "~1%",
    "Communication Cost": "Low"
})

## 6. Experiment 4: LoRA (FP16/AMP) - Combined Optimization

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 4: LoRA Federated Learning (FP16/AMP)")
print("="*60)

# Build the LoRA model around a private copy of the pre-trained model.
# NOTE(review): LoRAConvNet2 lives in model.py (not visible here); presumably
# rank=8 is the adapter rank — confirm.
model_lora_fp16 = LoRAConvNet2(rank=8, base_model=copy.deepcopy(initialized_model))
job_name = "chest-xray-lora-fp16"

# Same recipe as the LoRA FP32 run except for --use_amp True.
# NOTE(review): mixed precision itself is implemented in client_xray.py, which
# is not visible here — confirm how the flag is parsed and applied.
recipe = FedAvgRecipe(
    name=job_name,
    min_clients=n_clients,
    num_rounds=num_rounds,
    model=model_lora_fp16,
    train_script="client_xray.py",
    train_args=f"--batch_size {batch_size} --epochs 1 --data_path {data_path} --model_type lora --use_amp True",
)

add_experiment_tracking(recipe, tracking_type="tensorboard")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long run.
# NOTE(review): assumes recipe.execute() blocks until the job is done — confirm.
start_time = time.perf_counter()
run = recipe.execute(env)
duration = time.perf_counter() - start_time

print(f"\nStatus: {run.get_status()}")
print(f"Duration: {duration:.2f} seconds")
print(f"Result: {run.get_result()}")

# Drop any row left by an earlier run of this cell so that re-running it
# replaces the entry instead of duplicating it in the summary table.
experiment_label = "LoRA (FP16/AMP)"
results[:] = [row for row in results if row["Experiment"] != experiment_label]
# "Params Updated" and "Communication Cost" are fixed labels, not measurements.
results.append({
    "Experiment": experiment_label,
    "Method": "Low-Rank Adaptation",
    "Precision": "FP16 (Mixed)",
    "Duration (sec)": round(duration, 2),
    "Params Updated": "~1%",
    "Communication Cost": "Low"
})

## 7. Results Summary

In [None]:
print("\n" + "="*60)
print("FINAL COMPARISON SUMMARY")
print("="*60 + "\n")

# One row per recorded experiment, in the order they were appended.
df = pd.DataFrame(results)
display(df)

# Calculate speedups relative to the Full FL (FP32) run.
baseline_label = "Full FL (FP32)"
print("\n" + "="*60)
print("SPEEDUP ANALYSIS (vs Full FL FP32 Baseline)")
print("="*60)
# The baseline row only exists once Experiment 1 has been run; an empty
# `results` list yields a frame without an "Experiment" column at all.
if "Experiment" in df.columns and (df["Experiment"] == baseline_label).any():
    baseline_time = df.loc[df["Experiment"] == baseline_label, "Duration (sec)"].iloc[0]
    for _, row in df.iterrows():
        speedup = baseline_time / row['Duration (sec)']
        print(f"{row['Experiment']:25s}: {speedup:.2f}x speedup")
else:
    # Report the missing baseline instead of failing with IndexError/KeyError.
    print(f"No '{baseline_label}' run recorded - run Experiment 1 to compute speedups.")

## 8. Visualize Results with TensorBoard

All experiments are logged to TensorBoard for detailed comparison of loss curves and accuracy.

In [None]:
# Load the TensorBoard notebook extension, then launch it inline.
# NOTE(review): --bind_all makes TensorBoard listen on all network interfaces;
# drop it if the notebook host is reachable from an untrusted network.
# NOTE(review): assumes the simulation writes its event files under
# /tmp/nvflare/simulation — confirm against the SimEnv workspace location.
%load_ext tensorboard
%tensorboard --bind_all --logdir /tmp/nvflare/simulation

## Key Takeaways

### Expected Results:
1. **FP16/AMP** should provide ~2-3x speedup over FP32 on GPU
2. **LoRA** reduces communication overhead by ~99% when only the adapter weights (~1% of parameters) are exchanged
3. **LoRA + FP16** combines both benefits for maximum efficiency
4. **Accuracy** should remain comparable across all methods

### Trade-offs:
- **Full FL (FP32)**: Highest accuracy potential, slowest, highest communication cost
- **Full FL (FP16)**: Faster training, same communication cost, minimal accuracy loss
- **LoRA (FP32)**: Reduced communication, slightly slower than full FP16
- **LoRA (FP16)**: Best overall efficiency, minimal accuracy trade-off