In [1]:
# ── A. Imports & Reproducibility ────────────────────────────────────────────────
import os, copy
import csv                                                  # For result logging :contentReference[oaicite:0]{index=0}
import random                                               # For seeding :contentReference[oaicite:1]{index=1}
import numpy as np                                          # For numeric ops :contentReference[oaicite:2]{index=2}
import torch                                               # Core PyTorch :contentReference[oaicite:3]{index=3}
import torch.nn as nn                                       # Neural-net modules :contentReference[oaicite:4]{index=4}
import torch.nn.functional as F                             # Functional API :contentReference[oaicite:5]{index=5}
import torch.optim as optim                                 # Optimizers :contentReference[oaicite:6]{index=6}
from torch.optim.lr_scheduler import CosineAnnealingLR      # Scheduler :contentReference[oaicite:7]{index=7}
from torch.utils.data import DataLoader, random_split       # Data loaders & splits :contentReference[oaicite:8]{index=8}
import torchvision                                          # Datasets & transforms :contentReference[oaicite:9]{index=9}
import torchvision.transforms as T                          # Transforms :contentReference[oaicite:10]{index=10}
from torch.utils.tensorboard import SummaryWriter           # TensorBoard logging :contentReference[oaicite:11]{index=11}
import matplotlib.pyplot as plt                             # Plotting :contentReference[oaicite:12]{index=12}

In [2]:
# Seed everything for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


# ── B. Device ───────────────────────────────────────────────────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")                             # Confirm GPU vs CPU :contentReference[oaicite:13]{index=13}



# ── C. Data Preparation ─────────────────────────────────────────────────────────
# Transforms
transform_train = T.Compose([
    T.RandomCrop(32, padding=4), T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761)),
])
transform_test = T.Compose([
    T.ToTensor(),
    T.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761)),
])


Using device: cuda


In [3]:
import glob, torch, os


def latest_ckpt(dirpath, pattern="last_ckpt__round_*.pth"):
    paths = glob.glob(os.path.join(dirpath, pattern))
    if not paths:
        return None
    paths.sort(key=lambda p: int(p.rsplit("_", 1)[1].split(".")[0]))
    return paths[-1]


def load_checkpoint(model, optimizer, ckpt_dir, resume=True,name=""):
    if not resume:
        print("[Checkpoint] Starting training from scratch.")
        return 1
    if name:
        pattern='last_ckpt_'+name+'_round_*.pth'
        ckpt_path = latest_ckpt(ckpt_dir,pattern)
    else:
        ckpt_path = latest_ckpt(ckpt_dir)
    if ckpt_path is None:
        print("[Checkpoint] No checkpoint found; training from scratch.")
        return 1
    # Load checkpoint tensors onto CPU to preserve RNG state tensor
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt['model_state'])
    optimizer.load_state_dict(ckpt['optimizer_state'])
    # Restore CPU RNG state
    rng_state = ckpt['rng_state']
    if rng_state.device.type != 'cpu':
        rng_state = rng_state.cpu()
    torch.set_rng_state(rng_state)
    print(f"[Checkpoint] Resumed from round {ckpt['round']} (loaded {os.path.basename(ckpt_path)})")
    return ckpt['round'] + 1


def save_checkpoint(model, optimizer, round_num, ckpt_dir,personalized_par_string="", is_best=False):
    print(f"[Checkpoint] Saving round {round_num}...")
    state = {
        'round': round_num,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'rng_state': torch.get_rng_state(),
    }
    fname = f"{'best' if is_best else 'last'}_ckpt_{personalized_par_string}_round_{round_num}.pth"
    half_name=f"last_ckpt_{personalized_par_string}_round_"
    if is_best:
        torch.save(model.state_dict(), os.path.join(ckpt_dir,fname))
    else:
            torch.save(state, os.path.join(ckpt_dir, fname))

    print(f"[Checkpoint] Done saving to {fname}")

    for existing in os.listdir(ckpt_dir):
         existing_path = os.path.join(ckpt_dir, existing)
         if (
             existing.endswith('.pth')
             and existing != fname
             and 'best' not in existing
             and half_name in existing
         ):
             os.remove(existing_path)
             print(f"  Deleted: {existing}")






# def load_checkpoint(model, optimizer, ckpt_dir, resume=True):
#     if not resume:
#         print("[Checkpoint] Starting training from scratch.")
#         return 1
#     ckpt_path = latest_ckpt(ckpt_dir,'last_ckpt_round_lambda_\d_*.pth')  #path to ignore first number that is parameter of lamda
#     if ckpt_path is None:
#         print("[Checkpoint] No checkpoint found; training from scratch.")
#         return 1
#     # Load checkpoint tensors onto CPU to preserve RNG state tensor
#     ckpt = torch.load(ckpt_path, map_location='cpu')
#     model.load_state_dict(ckpt['model_state'])
#     optimizer.load_state_dict(ckpt['optimizer_state'])
#     # Restore CPU RNG state
#     rng_state = ckpt['rng_state']
#     if rng_state.device.type != 'cpu':
#         rng_state = rng_state.cpu()
#     torch.set_rng_state(rng_state)
#     print(f"[Checkpoint] Resumed from round {ckpt['round']} (loaded {os.path.basename(ckpt_path)})")
#     return ckpt['round'] + 1





In [4]:
# ── C. Data Preparation ─────────────────────────────────────────────────────────
# Transforms (as before)…

# Download full CIFAR‑100 training set
full_train = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform_train
)

# 1) Centralized validation split
val_size   = 5000
train_size = len(full_train) - val_size
train_dataset, val_dataset = random_split(
    full_train,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(seed)
)

train_loader_all = DataLoader(
    train_dataset, batch_size=256, shuffle=False, num_workers=2
)


# ── C.1 Build validation loader ───────────────────────────────
BS_VAL = 256
val_loader = DataLoader(
    val_dataset,
    batch_size=BS_VAL,
    shuffle=False,
    num_workers=2
)
 ###chang ehere 

# 2) IID sharding of the remaining train_dataset into K=100 clients
K = 100
base = train_size // K
sizes = [base] * (K - 1) + [train_size - base * (K - 1)]
shards = random_split(
    train_dataset,
    sizes,
    generator=torch.Generator().manual_seed(seed)
    )


# 3) Global test set (unchanged)
test_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform_test
)

bs_test = 256
test_loader = DataLoader(
    test_dataset, batch_size=bs_test, shuffle=False, num_workers=2
)

# 4) (Later) you can build per-client loaders:
# client_loaders = [
#     DataLoader(shards[i], batch_size=bs, shuffle=True, num_workers=2)
#     for i in range(K)
# ]

"la zanzara concordo"

100%|██████████| 169M/169M [00:37<00:00, 4.47MB/s] 


'la zanzara concordo'

In [7]:
# ── D. Model Definition ─────────────────────────────────────────────────────────
class LELeNetCIFAR(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=5, padding=2)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(2)
        self.fc1   = nn.Linear(64*8*8, 384)
        self.fc2   = nn.Linear(384, 192)
        self.fc3   = nn.Linear(192, 100)
    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x)); x = F.relu(self.fc2(x))
        return self.fc3(x)

In [8]:
# ── E. Utilities: Train/Eval & Checkpointing ────────────────────────────────────
def train_one_epoch(model, optimizer, criterion, loader):
    model.train()
    running_loss = correct = total = 0
    for imgs, lbls in loader:
        imgs, lbls = imgs.to(device), lbls.to(device)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, lbls)
        loss.backward(); optimizer.step()
        running_loss += loss.item()*imgs.size(0)
        correct += out.argmax(1).eq(lbls).sum().item()
        total += lbls.size(0)
    return running_loss/total, correct/total

def eval_model(model, criterion, loader):
    model.eval()
    running_loss = correct = total = 0
    with torch.no_grad():
        for imgs, lbls in loader:
            imgs, lbls = imgs.to(device), lbls.to(device)
            out = model(imgs); loss = criterion(out, lbls)
            running_loss += loss.item()*imgs.size(0)
            correct += out.argmax(1).eq(lbls).sum().item()
            total += lbls.size(0)
    return running_loss/total, correct/total


def sample_clients_dirichlet(K, m, gamma, rng):
    """
    Sample m clients out of K with probabilities drawn from a Dirichlet(gamma) distribution.
    Returns:
      selected: list of client indices
      p:      numpy array of length K with the sampling probabilities
    """
    if gamma == 'uniform':
        p = np.ones(K) / K
    else:
        alpha = np.ones(K) * gamma
        p = rng.dirichlet(alpha)
    selected = rng.choice(K, size=m, replace=False, p=p)
    return selected.tolist(), p

#unused below
def sample_clients_dirichlet2(K, m, gamma, rng=None):  #check which one to use
    """
    Sample m clients out of K with probabilities drawn from a Dirichlet(gamma) distribution.
    Returns:
      selected: list of client indices
      p:      numpy array of length K with the sampling probabilities
    """
    rng = np.random.default_rng(seed)
    if gamma == 'uniform':
        p = np.ones(K) / K
        selected = rng.choice(K, size=m, replace=False)
    else:
        alpha = np.ones(K) * gamma
        p = rng.dirichlet(alpha)
        p = p / p.sum()                          # ensure sum=1
        selected = rng.choice(K, size=m, replace=False, p=p)
    return selected.tolist(), p


In [9]:
import os
import csv
import copy
import pandas as pd
from datetime import datetime
import json
def clean_results_history(results_file_name,new_file_name):
    input_file = results_file_name
    output_file=new_file_name


    # Read and clean lines
    with open(input_file, "r") as f:
        lines = [line.strip() for line in f if line.strip()]

    filtered = []
    last_seen_index = float('inf')  # Start with a very large number
    header = lines[0]
    data_lines = lines[1:]

    # Iterate in reverse
    for line in reversed(data_lines):
        current_index = int(line.split(',')[0])
        if current_index < last_seen_index:
            filtered.append(line)
            last_seen_index = current_index
        else:
            # Skip this line, as its index is higher than the next one
            continue

    # Reverse again to restore original order (except removed lines)
    filtered.reverse()

    # Write to output
    with open(output_file, "w") as f:
        f.write(header+"\n")

        f.write('\n'.join(filtered))

    print(f"Filtered results written to {output_file}")



def get_results(csv_path):
    df = pd.read_csv(csv_path)

    results = {}

    for split in ['train', 'val', 'test']:
        acc_col = f"{split}_acc"
        loss_col = f"{split}_loss"

        max_acc = df[acc_col].max()
        max_idx = df[acc_col].idxmax()

        max_round = df.loc[max_idx, 'round']
        loss_at_max = df.loc[max_idx, loss_col]

        results[split] = {
            'max_acc': max_acc,
            'round': int(max_round),
            'loss_at_max': loss_at_max
        }

    return results





def write_final_results(name, params, csv_path='results_log.csv', results_csv_path='global_results.csv'):
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    results = get_results(csv_path)

    row = {
        'timestamp': timestamp,
        'model_name': name,
        'parameters': json.dumps(params),
        'train_max_acc': results['train']['max_acc'],
        'train_round': results['train']['round'],
        'train_loss': results['train']['loss_at_max'],
        'val_max_acc': results['val']['max_acc'],
        'val_round': results['val']['round'],
        'val_loss': results['val']['loss_at_max'],
        'test_max_acc': results['test']['max_acc'],
        'test_round': results['test']['round'],
        'test_loss': results['test']['loss_at_max'],
    }

    file_exists = os.path.exists(results_csv_path)

    with open(results_csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=row.keys())
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)

rng = np.random.default_rng(seed)

def run_federated_training(
    global_model,
    optimizer,
    criterion,
    client_loaders,
    train_loader_all,
    val_loader,
    test_loader,
    shards,
    params,
    CKPT_DIR="./fl_checkpoints",
    name="fed"
):
    # Prepare CSV logging path per gamma
    csv_path=f'./fedavg_results.csv'

    gamma     = params['GAMMA']
    K         = params['K']
    C         = params['C']
    J         = params['J']
    ROUNDS    = params['ROUNDS']
    BS        = params['batch_size']
    LR        = params['lr']
    WD        = params['weight_decay']
    momentum  = params['momentum']
    


    csv_path_res = f'./'+name+'_results.csv'
    if not os.path.exists(csv_path):
        with open(csv_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['round', 'val_loss', 'val_acc', 'test_loss', 'test_acc','train_loss', 'train_acc'])



    start_round = load_checkpoint(global_model, optimizer, CKPT_DIR, resume=True,name=name)
    best_accuracy = 0

    print(f"[Training] Starting gamma = {gamma} from round {start_round} to {ROUNDS}")

    best_model_state = None
    best_round=1

    for rnd in range(start_round, ROUNDS + 1):
        # 1) Sample clients
        m = max(1, int(C * K)) # → 10 clients per round when K=100, C=0.1

        selected, _ = sample_clients_dirichlet(K, m, gamma, rng )

        local_states, sizes = [], []
        for i in selected:
            # 2a) Copy global model
            client_model = copy.deepcopy(global_model)
            client_model.train()   ####to delete???
            client_opt = optim.SGD(client_model.parameters(), lr=LR, momentum=0.9, weight_decay=WD)
            # 2b) Local training for J epochs
            for _ in range(J):
                train_one_epoch(client_model, client_opt, criterion, client_loaders[i])
            local_states.append(client_model.state_dict())
            sizes.append(len(shards[i]))
        # 3) Weighted aggregation
        total_size = sum(sizes)
        new_state = {
            k: sum((sizes[j] / total_size) * local_states[j][k] for j in range(len(sizes)))
            for k in global_model.state_dict()
        }
        global_model.load_state_dict(new_state)
         # 4) Global evaluation
        train_loss, train_acc = eval_model(global_model, criterion, train_loader_all)
        val_loss, val_acc = eval_model(global_model, criterion, val_loader)
        test_loss, test_acc = eval_model(global_model, criterion, test_loader)



        with open(csv_path, 'a', newline='') as f:
            csv.writer(f).writerow([
                rnd,
                f"{val_loss:.4f}", f"{val_acc:.4f}",
                f"{test_loss:.4f}", f"{test_acc:.4f}",
                 f"{train_loss:.4f}",  f"{train_acc:.4f}"
            ])



        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_model_state = copy.deepcopy(global_model.state_dict())
            best_round=rnd


        # if val_acc > best_accuracy:
        #     best_accuracy = val_acc
        #     save_checkpoint(global_model, optimizer, rnd, CKPT_DIR, is_best=True)

        if rnd % 3 == 0 or rnd == 1:
             print(f"Checkpointing round {rnd} (gamma={gamma})")
             save_checkpoint(global_model, optimizer, rnd, CKPT_DIR,personalized_par_string=name, is_best=False)
             print(f"Round {rnd}/{ROUNDS} | Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}")


    if best_model_state is not None:
        global_model.load_state_dict(best_model_state)  # Load best weights back into the model
        save_checkpoint(
            model=global_model,
            optimizer=optimizer,
            round_num=best_round,
            ckpt_dir=CKPT_DIR,
            personalized_par_string=name,
            is_best=True
        )

    clean_results_history(csv_path,csv_path_res)


    write_final_results(name,params,csv_path_res)







    # if best_model_state is not None:
    #     model_path = os.path.join(CKPT_DIR, f"best_model_gamma_{gamma}.pth")
    #     torch.save(best_model_state, model_path)
    #     print(f"[Saved] Best model for gamma={gamma} with Val Acc = {best_accuracy:.4f} → {model_path}")


In [10]:

from collections import defaultdict
from torch.utils.data import Subset

def create_labelwise_shards(dataset, K, Nc, seed=42):
    # 1) Group indices by label
    label2idx = defaultdict(list)
    for idx, (_, lbl) in enumerate(dataset):
        label2idx[lbl].append(idx)

    # 2) Shuffle each label’s pool
    rng = random.Random(seed)
    for lbl in label2idx:
        rng.shuffle(label2idx[lbl])

    # 3) Prepare an iterator per label
    pointers = {lbl: 0 for lbl in label2idx}

    # 4) Build shards
    samples_per_client = len(dataset) // K
    shards_idx = []
    labels_cycle = list(label2idx.keys())

    for client_id in range(K):
        client_idxs = []
        # Rotate start point so clients don’t always pick the same first label
        rng.shuffle(labels_cycle)
        for lbl in labels_cycle:
            if len(client_idxs) >= samples_per_client:
                break
            # How many to take from this label
            needed = samples_per_client - len(client_idxs)
            available = len(label2idx[lbl]) - pointers[lbl]
            take = min(needed, available)
            if take > 0:
                start = pointers[lbl]
                end   = start + take
                client_idxs.extend(label2idx[lbl][start:end])
                pointers[lbl] += take
        # If we still haven’t reached samples_per_client (rare), fill randomly
        if len(client_idxs) < samples_per_client:
            all_remaining = [i for lbl in label2idx
                                 for i in label2idx[lbl][pointers[lbl]:]]
            client_idxs.extend(rng.sample(all_remaining,
                                          samples_per_client - len(client_idxs)))
        shards_idx.append(client_idxs)

    return [Subset(dataset, idxs) for idxs in shards_idx]


In [None]:

# ── A. Mount Google Drive ─────────────────────────────────────────────────────
import os

try:
    from google.colab import drive
    # If import succeeds, we are likely in Colab
    IN_COLAB = True
except ImportError:
    # If import fails, we are likely not in Colab
    IN_COLAB = False

if IN_COLAB:
    drive.mount('/content/drive')
    CKPT_DIR = '/content/drive/MyDrive/fl_checkpoints'
else:
    CKPT_DIR = './fl_checkpoints'
os.makedirs(CKPT_DIR, exist_ok=True)




# Set this to True to resume from the last checkpoint; False to start from scratch
RESUME = True



# ── A. FedAvg Hyperparameters ───────────────────────────────────────────────────
# Fixed FL parameters
K      = 100      # total clients
C      = 0.1      # fraction of clients sampled per round
J      = 4        # local epochs per client
ROUNDS = 200  #2000     # total communication rounds

# Optimizer hyperparameters (constant, no schedule)
LR     = 0.01     # fixed learning rate
WD     = 1e-4     # weight decay
BS     = 128      # per-client batch size
momentum=0.9

#gamma value for dirichlet client selection


budget   = J * ROUNDS




# Before your FedAvg loop: Instantiate the global model, loss, and client loaders once

# Instantiate model, optimizer, loss, and client loaders
global_model   = LELeNetCIFAR().to(device)
optimizer      = optim.SGD(global_model.parameters(), lr=LR, momentum=momentum, weight_decay=WD)
criterion      = nn.CrossEntropyLoss()
client_loaders = [
    DataLoader(shards[i], batch_size=BS, shuffle=True, num_workers=2)
    for i in range(K)
]  ##should i out in the for loop below ?







gammas = [0.01]   #[0.01, 0.1, 0.5, 1, 10, 100]



base = train_size // K
sizes = [base]*(K-1) + [train_size-base*(K-1)]
iid_shards = random_split(train_dataset, sizes,
                        generator=torch.Generator().manual_seed(seed))

Nc_list   = [1, 5, 10, 50]
shardings = {'iid': iid_shards}

for Nc in Nc_list:
    shardings[f'non_iid_{Nc}'] = create_labelwise_shards(
            train_dataset, K, Nc, seed
    )


#gamma value for dirichlet client selection
gamma='uniform'   #gamma uniform since i'm not testing that 
J_list   = [4, 8, 16]
for shard_key, shards in shardings.items():
    client_loaders = [
        DataLoader(shards[i], batch_size=BS, shuffle=True, num_workers=2)
        for i in range(K)
    ]
    for J in J_list: #the other J that is outside , it's used onlu to calc the budget

    

        ROUNDS = budget // J  ##rounds scaled 
        # Reinitialize model and optimizer per gamma
        model = LELeNetCIFAR().to(device)
        opt = optim.SGD(model.parameters(), lr=LR, momentum=momentum, weight_decay=WD)
        loss_fn = nn.CrossEntropyLoss()
        params= {   ##probably needed to differenciate parameter between client and gloabal model
        'lr':           LR,
        'weight_decay': WD,
        'batch_size':   BS,
        'K':            K,
        'C':            C,
        'J':            J,
        'ROUNDS':       ROUNDS,
        'GAMMA':gamma,
        "momentum": momentum
        }

        ##new_part


        ##new_part

            
        run_federated_training(
            name="hetetogeneous_shard_{}_J_{}".format(shard_key,J),
            global_model=model,
            optimizer=opt,
            criterion=loss_fn,
            client_loaders=client_loaders,
            val_loader=val_loader,
            test_loader=test_loader,
            train_loader_all=train_loader_all,
            shards=shards,
            CKPT_DIR=CKPT_DIR,params=params
        )


[Checkpoint] No checkpoint found; training from scratch.
[Training] Starting gamma = uniform from round 1 to 200
Checkpointing round 1 (gamma=uniform)
[Checkpoint] Saving round 1...
[Checkpoint] Done saving to last_ckpt_hetetogeneous_shard_iid_J_4_round_1.pth
Round 1/200 | Val Acc: 0.0082, Test Acc: 0.0092
Checkpointing round 3 (gamma=uniform)
[Checkpoint] Saving round 3...
[Checkpoint] Done saving to last_ckpt_hetetogeneous_shard_iid_J_4_round_3.pth
  Deleted: last_ckpt_hetetogeneous_shard_iid_J_4_round_1.pth
Round 3/200 | Val Acc: 0.0124, Test Acc: 0.0101
Checkpointing round 6 (gamma=uniform)
[Checkpoint] Saving round 6...
[Checkpoint] Done saving to last_ckpt_hetetogeneous_shard_iid_J_4_round_6.pth
  Deleted: last_ckpt_hetetogeneous_shard_iid_J_4_round_3.pth
Round 6/200 | Val Acc: 0.0230, Test Acc: 0.0211


In [None]:
import pandas as pd

df = pd.read_csv('global_results.csv')
print(df.to_string(index=False))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# List of gamma values and corresponding file names
gammas = [0.01,0.1, 0.5, 1, 10]
base_path = './fedavg_results_gamma_{}.csv'

# Set up the plot
plt.figure(figsize=(12, 6))

# Load and plot each CSV
for gamma in gammas:
    file_path = base_path.format(gamma)
    df = pd.read_csv(file_path)

    plt.plot(df['round'], df['test_acc'], label=f'γ = {gamma}')
    print(f"Gamma {gamma}: Max Test Acc = {df['test_acc'].max():.4f}")


# Configure plot
plt.xlabel('Federated Round')
plt.ylabel('Global Test Accuracy')
plt.title('FedAvg CIFAR-100: Test Accuracy vs Rounds for Different γ')
plt.grid(True)
plt.legend(title="Gamma")
plt.tight_layout()
plt.show()
