In [1]:
# ── A. Imports & Reproducibility ────────────────────────────────────────────────
import os
import csv                                                  # For result logging :contentReference[oaicite:0]{index=0}
import random                                               # For seeding :contentReference[oaicite:1]{index=1}
import numpy as np                                          # For numeric ops :contentReference[oaicite:2]{index=2}
import torch                                               # Core PyTorch :contentReference[oaicite:3]{index=3}
import torch.nn as nn                                       # Neural-net modules :contentReference[oaicite:4]{index=4}
import torch.nn.functional as F                             # Functional API :contentReference[oaicite:5]{index=5}
import torch.optim as optim                                 # Optimizers :contentReference[oaicite:6]{index=6}
from torch.optim.lr_scheduler import CosineAnnealingLR      # Scheduler :contentReference[oaicite:7]{index=7}
from torch.utils.data import DataLoader, random_split       # Data loaders & splits :contentReference[oaicite:8]{index=8}
import torchvision                                          # Datasets & transforms :contentReference[oaicite:9]{index=9}
import torchvision.transforms as T                          # Transforms :contentReference[oaicite:10]{index=10}
from torch.utils.tensorboard import SummaryWriter           # TensorBoard logging :contentReference[oaicite:11]{index=11}
import matplotlib.pyplot as plt                             # Plotting :contentReference[oaicite:12]{index=12}

In [2]:
# Seed everything for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


In [3]:
# ── B. Device ───────────────────────────────────────────────────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")                             # Confirm GPU vs CPU :contentReference[oaicite:13]{index=13}


Using device: cuda


In [4]:
# ── C. Data Preparation ─────────────────────────────────────────────────────────
# Transforms
transform_train = T.Compose([
    T.RandomCrop(32, padding=4), T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761)),
])
transform_test = T.Compose([
    T.ToTensor(),
    T.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761)),
])


In [5]:
# Download & train/val/test split
dataset_full = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform_train)
val_size = 5000
train_size = len(dataset_full) - val_size
train_dataset, val_dataset = random_split(
    dataset_full, [train_size, val_size],
    generator=torch.Generator().manual_seed(seed))
test_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform_test)

100%|██████████| 169M/169M [00:49<00:00, 3.42MB/s] 


In [6]:
# ── D. Model Definition ─────────────────────────────────────────────────────────
class LELeNetCIFAR(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=5, padding=2)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(2)
        self.fc1   = nn.Linear(64*8*8, 384)
        self.fc2   = nn.Linear(384, 192)
        self.fc3   = nn.Linear(192, 100)
    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x)); x = F.relu(self.fc2(x))
        return self.fc3(x)

In [7]:
# ── E. Utilities: Train/Eval & Checkpointing ────────────────────────────────────
def train_one_epoch(model, optimizer, criterion, loader):
    model.train()
    running_loss = correct = total = 0
    for imgs, lbls in loader:
        imgs, lbls = imgs.to(device), lbls.to(device)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, lbls)
        loss.backward(); optimizer.step()
        running_loss += loss.item()*imgs.size(0)
        correct += out.argmax(1).eq(lbls).sum().item()
        total += lbls.size(0)
    return running_loss/total, correct/total

def eval_model(model, criterion, loader):
    model.eval()
    running_loss = correct = total = 0
    with torch.no_grad():
        for imgs, lbls in loader:
            imgs, lbls = imgs.to(device), lbls.to(device)
            out = model(imgs); loss = criterion(out, lbls)
            running_loss += loss.item()*imgs.size(0)
            correct += out.argmax(1).eq(lbls).sum().item()
            total += lbls.size(0)
    return running_loss/total, correct/total

In [8]:
""" # Checkpoint saves model + optimizer + scheduler + RNG
ckpt_dir = './checkpoints'
os.makedirs(ckpt_dir, exist_ok=True)
def save_checkpoint(model, optimizer, scheduler, epoch, is_best=False):
    fname = f"{'best' if is_best else 'last'}_ckpt_epoch_{epoch}.pth"
    torch.save({
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optim_state': optimizer.state_dict(),
        'sched_state': scheduler.state_dict(),
        'rng_state': torch.get_rng_state(),
    }, os.path.join(ckpt_dir, fname)) """

' # Checkpoint saves model + optimizer + scheduler + RNG\nckpt_dir = \'./checkpoints\'\nos.makedirs(ckpt_dir, exist_ok=True)\ndef save_checkpoint(model, optimizer, scheduler, epoch, is_best=False):\n    fname = f"{\'best\' if is_best else \'last\'}_ckpt_epoch_{epoch}.pth"\n    torch.save({\n        \'epoch\': epoch,\n        \'model_state\': model.state_dict(),\n        \'optim_state\': optimizer.state_dict(),\n        \'sched_state\': scheduler.state_dict(),\n        \'rng_state\': torch.get_rng_state(),\n    }, os.path.join(ckpt_dir, fname)) '

In [9]:
import glob, torch, os


def latest_ckpt(dirpath, pattern="last_ckpt__round_*.pth"):
    paths = glob.glob(os.path.join(dirpath, pattern))
    if not paths:
        return None
    paths.sort(key=lambda p: int(p.rsplit("_", 1)[1].split(".")[0]))
    return paths[-1]


def load_checkpoint(model, optimizer, ckpt_dir,scheduler, resume=True,name=""):
    if not resume:
        print("[Checkpoint] Starting training from scratch.")
        return 1
    if name:
        pattern='last_ckpt_'+name+'_round_*.pth'
        ckpt_path = latest_ckpt(ckpt_dir,pattern)
    else:
        ckpt_path = latest_ckpt(ckpt_dir)
    if ckpt_path is None:
        print("[Checkpoint] No checkpoint found; training from scratch.")
        return 1
    # Load checkpoint tensors onto CPU to preserve RNG state tensor
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt['model_state'])
    optimizer.load_state_dict(ckpt['optimizer_state'])
    scheduler.load_state_dict(ckpt['sched_state'])
    # Restore CPU RNG state
    rng_state = ckpt['rng_state']
    if rng_state.device.type != 'cpu':
        rng_state = rng_state.cpu()
    torch.set_rng_state(rng_state)
    print(f"[Checkpoint] Resumed from round {ckpt['round']} (loaded {os.path.basename(ckpt_path)})")
    return ckpt['round'] + 1


def save_checkpoint(model, optimizer,scheduler, round_num, ckpt_dir,personalized_par_string="", is_best=False):
    print(f"[Checkpoint] Saving round {round_num}...")
    state = {
        'round': round_num,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
          'sched_state': scheduler.state_dict(),
        'rng_state': torch.get_rng_state(),
    }
    fname = f"{'best' if is_best else 'last'}_ckpt_{personalized_par_string}_round_{round_num}.pth"
    half_name=f"last_ckpt_{personalized_par_string}_round_"
    if is_best:
        torch.save(model.state_dict(), os.path.join(ckpt_dir,fname))
    else:
            torch.save(state, os.path.join(ckpt_dir, fname))
            for existing in os.listdir(ckpt_dir):
                existing_path = os.path.join(ckpt_dir, existing)
                if (
                    existing.endswith('.pth')
                    and existing != fname
                    and 'best' not in existing
                    and half_name in existing
                ):
                    os.remove(existing_path)
                    print(f"  Deleted: {existing}")
    print(f"[Checkpoint] Done saving to {fname}")


In [32]:
###always copy this for logging ::::::::::::::::::::::::::::::::::::::::::::::::::
import os, csv,json
from datetime import datetime
import pandas as pd

    
def log_results(name, rnd,maxround, val_loss, val_acc, test_loss, test_acc, train_loss, train_acc,local_train_mean,local_train_std, csv_path='results_log.csv',csv_path_final='global_results.csv',params={}):
        file_exists = os.path.exists(csv_path)
        if not file_exists:
            with open(csv_path, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['name','round', 'val_loss', 'val_acc', 'test_loss', 'test_acc','train_loss', 'train_acc','local_train_mean','local_train_std'])
        with open(csv_path, 'a', newline='') as f:
            csv.writer(f).writerow([
                name,
                rnd,
                f"{val_loss:.4f}", f"{val_acc:.4f}",
                f"{test_loss:.4f}", f"{test_acc:.4f}",
                 f"{train_loss:.4f}",  f"{train_acc:.4f}",
                 f"{local_train_mean:.4f}",  f"{local_train_std:.4f}",

            ])
        

        if rnd==maxround:
                csv_path_res='results_only_'+name+'.csv'
                clean_results_history(csv_path,csv_path_res,name)
                write_final_results(name,params,csv_path_res,csv_path_final)

def clean_results_history(results_file_name,new_file_name,name):   #da fare eliminare righe che vengono prima della successiva 
    input_file = results_file_name
    output_file=new_file_name


    # Read and clean lines
    with open(input_file, "r") as f:
        lines = [line.strip() for line in f if line.strip()]

    filtered = []
    last_seen_index = float('inf')  # Start with a very large number
    header = lines[0]
    data_lines = lines[1:]

    # Iterate in reverse
    for line in reversed(data_lines):
        current_index = int(line.split(',')[1])
        if current_index < last_seen_index and (line.split(',')[0]==name):
            filtered.append(line)
            last_seen_index = current_index
        else:
            # Skip this line, as its index is higher than the next one
            pass

    # Reverse again to restore original order (except removed lines)
    filtered.reverse()

    file_exists = os.path.exists(output_file)
    # Write to output


    if os.path.exists(new_file_name):
        with open(new_file_name, "r") as f:
            history_lines = [line.strip() for line in f if line.strip()]
        history_header = history_lines[0]
        history_data = history_lines[1:]
    else:
        history_header = header
        history_data = []

    # --- Remove from history any round that will be updated ---
    new_rounds_set = {int(line.split(',')[1]) for line in filtered}
    updated_history_data = [
        line for line in history_data
        if int(line.split(',')[1]) not in new_rounds_set
    ]

    # --- Merge history and new data ---
    merged_data = updated_history_data + filtered

    # --- Write back ---
    with open(new_file_name, "w") as f:
        f.write(history_header + "\n")
        f.write("\n".join(merged_data))
        f.write("\n")

    print(f"Filtered and merged results written to {new_file_name}")




def get_results(csv_path):
    df = pd.read_csv(csv_path)

    results = {}

    for split in ['train', 'val', 'test']:
        acc_col = f"{split}_acc"
        loss_col = f"{split}_loss"

        max_acc = df[acc_col].max()
        max_idx = df[acc_col].idxmax()

        max_round = df.loc[max_idx, 'round']
        loss_at_max = df.loc[max_idx, loss_col]

        results[split] = {
            'max_acc': max_acc,
            'round': int(max_round),
            'loss_at_max': loss_at_max
        }

    return results





def write_final_results(name, params, csv_path='results_log.csv', results_csv_path='global_results.csv'):
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    results = get_results(csv_path)

    row = {
        'timestamp': timestamp,
        'model_name': name,
        'parameters': json.dumps(params),
        'train_max_acc': results['train']['max_acc'],
        'train_round': results['train']['round'],
        'train_loss': results['train']['loss_at_max'],
        'val_max_acc': results['val']['max_acc'],
        'val_round': results['val']['round'],
        'val_loss': results['val']['loss_at_max'],
        'test_max_acc': results['test']['max_acc'],
        'test_round': results['test']['round'],
        'test_loss': results['test']['loss_at_max'],
    }

    file_exists = os.path.exists(results_csv_path)

    with open(results_csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=row.keys())
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)

In [29]:
import os

try:
    from google.colab import drive
    # If import succeeds, we are likely in Colab
    IN_COLAB = True
except ImportError:
    # If import fails, we are likely not in Colab
    IN_COLAB = False

if IN_COLAB:
    drive.mount('/content/drive')
    CKPT_DIR = '/content/drive/MyDrive/fl_checkpoints'
else:
    CKPT_DIR = './fl_checkpoints'
os.makedirs(CKPT_DIR, exist_ok=True)


In [35]:
# ── A. Hyperparameter Grid ─────────────────────────────────────────────────────
param_grid = [
    {'lr': 0.1, 'weight_decay': 1e-4, 'batch_size': 128, 'epochs': 100},
    # … add more combinations as needed …
]

# Only testing new schedulers here
scheduler_classes = [
    torch.optim.lr_scheduler.OneCycleLR,
    torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
]

for sched_class in scheduler_classes:
    sched_name = f"cifar2_scheduler_{sched_class.__name__}"

    for cfg in param_grid:
        lr, wd, bs, epochs = cfg['lr'], cfg['weight_decay'], cfg['batch_size'], cfg['epochs']
        print(f"\n▶ Running {sched_name} with config: lr={lr}, wd={wd}, bs={bs}, epochs={epochs}")

        # Fresh model/optimizer/etc. for each config
        model     = LELeNetCIFAR().to(device)
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=wd)
        criterion = nn.CrossEntropyLoss()

        # Re-create DataLoaders per batch size
        train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=2)
        val_loader   = DataLoader(val_dataset,   batch_size=bs, shuffle=False, num_workers=2)
        test_loader  = DataLoader(test_dataset,  batch_size=bs, shuffle=False, num_workers=2)

        # Instantiate scheduler AFTER train_loader is created
        if sched_class == torch.optim.lr_scheduler.OneCycleLR:
            scheduler = sched_class(
                optimizer,
                max_lr=lr,
                steps_per_epoch=len(train_loader),
                epochs=epochs
            )
        elif sched_class == torch.optim.lr_scheduler.CosineAnnealingWarmRestarts:
            scheduler = sched_class(optimizer, T_0=10, T_mult=2)
        else:
            scheduler = sched_class(optimizer)  # fallback

        start_round = load_checkpoint(model, optimizer, CKPT_DIR, scheduler, resume=True, name=sched_name)

        writer = SummaryWriter(log_dir=f'./logs/{sched_name}_lr{lr}_wd{wd}_bs{bs}_ep{epochs}')

        best_val_acc = 0.0
        for epoch in range(start_round, epochs + 1):
            # Training loop (modified for OneCycleLR per-batch stepping)
            model.train()
            train_loss, correct, total = 0.0, 0, 0
            for batch_idx, (inputs, targets) in enumerate(train_loader):
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # LR scheduling per-batch if OneCycleLR
                if isinstance(scheduler, torch.optim.lr_scheduler.OneCycleLR):
                    scheduler.step()

                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

            train_acc = correct / total
            train_loss /= len(train_loader)

            # Validation + test
            val_loss,   val_acc   = eval_model(model, criterion, val_loader)
            test_loss, test_acc   = eval_model(model, criterion, test_loader)

            log_results(sched_name, epoch, epochs, val_loss, val_acc, test_loss, test_acc,
                        train_loss, train_acc, -1, -1, params=cfg)

            # Epoch-based scheduler stepping
            if isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingWarmRestarts):
                scheduler.step(epoch)

            # TensorBoard logging
            writer.add_scalars('Loss', {'train': train_loss, 'val': val_loss}, epoch)
            writer.add_scalars('Acc',  {'train': train_acc,  'val': val_acc}, epoch)

            # Save checkpoints
            if epoch % 10 == 0 or epoch == epochs:
                save_checkpoint(model, optimizer, scheduler, epoch, is_best=False,
                                ckpt_dir=CKPT_DIR, personalized_par_string=sched_name)
            if val_acc > best_val_acc:
                best_val_acc = val_acc

            print(f"  Epoch {epoch}/{epochs}  train_acc={train_acc:.4f}  val_acc={val_acc:.4f}")

        # Final Test Evaluation
        test_loss, test_acc = eval_model(model, criterion, test_loader)
        print(f"Config {cfg} with {sched_name} → best_val_acc={best_val_acc:.4f}, test_acc={test_acc:.4f}")



▶ Running cifar2_scheduler_OneCycleLR with config: lr=0.1, wd=0.0001, bs=128, epochs=100
[Checkpoint] No checkpoint found; training from scratch.
  Epoch 1/100  train_acc=0.0478  val_acc=0.0840
  Epoch 2/100  train_acc=0.1112  val_acc=0.1392
  Epoch 3/100  train_acc=0.1646  val_acc=0.1866
  Epoch 4/100  train_acc=0.2087  val_acc=0.2274
  Epoch 5/100  train_acc=0.2447  val_acc=0.2548
  Epoch 6/100  train_acc=0.2729  val_acc=0.2766
  Epoch 7/100  train_acc=0.2953  val_acc=0.3024
  Epoch 8/100  train_acc=0.3117  val_acc=0.3112
  Epoch 9/100  train_acc=0.3218  val_acc=0.3220
[Checkpoint] Saving round 10...
[Checkpoint] Done saving to last_ckpt_cifar2_scheduler_OneCycleLR_round_10.pth
  Epoch 10/100  train_acc=0.3246  val_acc=0.3164
  Epoch 11/100  train_acc=0.3366  val_acc=0.3310
  Epoch 12/100  train_acc=0.3317  val_acc=0.2970
  Epoch 13/100  train_acc=0.3389  val_acc=0.3166
  Epoch 14/100  train_acc=0.3417  val_acc=0.3102
  Epoch 15/100  train_acc=0.3368  val_acc=0.3142
  Epoch 16/100  

In [15]:
# ── Configuration Summary Cell ──────────────────────────────────────────────────
import torch, torchvision, sys, platform, time, os
import numpy as np
import random
from torch.utils.tensorboard import SummaryWriter

def summarize_run(cfg, train_loader, val_loader, test_loader, writer=None):
    """
    Print and log a full summary of the current run configuration and environment.

    Args:
        cfg (dict): Hyperparameter dict with 'lr', 'weight_decay', 'batch_size', 'epochs', etc.
        train_loader, val_loader, test_loader: DataLoaders for computing dataset sizes.
        writer (SummaryWriter, optional): if provided, logs summary to TensorBoard under 'RunInfo'.
    """
    # 1. Timestamp
    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # 2. Environment
    env_info = {
        'python_version': sys.version.split()[0],
        'pytorch_version': torch.__version__,
        'torchvision_version': torchvision.__version__,
        'cuda_available': torch.cuda.is_available(),
        'cuda_device': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only',
        'device_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
        'platform': platform.platform(),
        'cwd': os.getcwd(),
    }

    # 3. Data sizes
    data_info = {
        'train_samples': len(train_loader.dataset),
        'val_samples': len(val_loader.dataset),
        'test_samples': len(test_loader.dataset),
        'batch_size': cfg.get('batch_size'),
        'num_batches_train': len(train_loader),
        'num_batches_val': len(val_loader),
        'num_batches_test': len(test_loader),
    }

    # 4. Seed & Hyperparams
    seed_info = {
        'seed': cfg.get('seed', 'not set'),
    }
    hyperparams = {k: v for k, v in cfg.items() if k not in seed_info}

    # 5. Print Summary
    print(f"{'='*20} RUN SUMMARY ({ts}) {'='*20}\n")
    print("➜ Environment:")
    for k, v in env_info.items():
        print(f"    • {k}: {v}")
    print("\n➜ Data:")
    for k, v in data_info.items():
        print(f"    • {k}: {v}")
    print("\n➜ Seed:")
    for k, v in seed_info.items():
        print(f"    • {k}: {v}")
    print("\n➜ Hyperparameters:")
    for k, v in hyperparams.items():
        print(f"    • {k}: {v}")
    print(f"\n{'='*60}\n")

    # 6. Optional TensorBoard Logging
    if writer is not None:
        for k, v in {**env_info, **data_info, **seed_info, **hyperparams}.items():
            # Non-numeric values will be logged as text under a scalar tag
            try:
                writer.add_text('RunInfo/' + k, str(v), 0)
            except Exception:
                pass

# ── Example Usage ────────────────────────────────────────────────────────────────
# After defining `cfg`, DataLoaders, and `writer` in your Run Cell, just call:
summarize_run(cfg, train_loader, val_loader, test_loader, writer)


NameError: name 'cfg' is not defined

In [None]:
import pandas as pd
df = pd.read_csv('./results_grid.csv')
display(df.sort_values('test_acc', ascending=False))


In [None]:
# ── Final Analysis & Plotting ────────────────────────────────────────────────────
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load results
csv_path = './results_grid.csv'
df = pd.read_csv(csv_path)

# 2. Display top 5 configs by test accuracy
top5 = df.sort_values('test_acc', ascending=False).head(5)
print("Top 5 hyperparameter configurations:")
display(top5)

# 3. Bar plot of test accuracy for each config
plt.figure(figsize=(10, 6))
plt.bar(
    x=range(len(df)),
    height=df['test_acc'],
    tick_label=[f"lr={lr}\nwd={wd}\nbs={bs}\nep={ep}"
                for lr, wd, bs, ep in zip(df['lr'], df['weight_decay'], df['batch_size'], df['epochs'])]
)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Test Accuracy')
plt.title('Grid Search Results: Test Accuracy by Hyperparameter Configuration')
plt.tight_layout()
plt.show()
