In [None]:
import torchvision
import torch
import torchvision.transforms as transforms
import random
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision import transforms
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
import time
import os
import torch
import copy
import wandb
import json
import pandas as pd

# Seeds for reproducibility
def set_seed(seed: int = 123):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus the current and all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)


set_seed(123)

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
""" PLOT SETTINGS """

# Seaborn-like matplotlib theme, with larger fonts and thicker lines for readable figures.
plt.style.use("seaborn-v0_8")

PLOT_RC_OVERRIDES = {
    # --- text sizes ---
    "font.size": 18,         # base font size
    "axes.titlesize": 24,    # axis titles
    "axes.labelsize": 22,    # axis labels
    "xtick.labelsize": 18,   # X axis tick numbers
    "ytick.labelsize": 18,   # Y axis tick numbers
    "legend.fontsize": 20,   # legend text
    # --- line style ---
    "lines.linewidth": 3.0,  # line thickness
}
plt.rcParams.update(PLOT_RC_OVERRIDES)


In [None]:
""" CLASS TO CREATE THE HEAD with 100 CLASSES """
class DINOWithHead(nn.Module):
    """Feature-extracting backbone followed by a linear classification head.

    Args:
        backbone: module that maps an input batch to feature vectors of
            width ``embed_dim``.
        num_classes: number of output logits.
        p: dropout probability applied to the features before the linear
            layer. ``None`` (default) means no dropout layer is created.
        embed_dim: width of the backbone features. Defaults to 384, the value
            that used to be hard-coded, so existing callers are unaffected.

    The head is an ``nn.Sequential`` so its state-dict keys are ``head.0.*``
    without dropout and ``head.1.*`` with dropout.
    """
    def __init__(self, backbone, num_classes=100, p=None, embed_dim=384):
        super().__init__()
        self.backbone = backbone
        layers = []
        if p is not None:
            layers.append(nn.Dropout(p=p))
        layers.append(nn.Linear(embed_dim, num_classes))
        self.head = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        features = self.backbone(x)
        return self.head(features)

### Dataset Download and Transformations

In [None]:
""" DATASET DOWNLOAD """

ROOT = './data'
BATCH_SIZE = 64
#BATCH_SIZE = 128
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 0 (load in the main process) so DataLoader always gets a valid integer.
NUM_WORKERS = os.cpu_count() or 0

# Raw CIFAR-100 splits with ToTensor only: they are used to compute the normalization statistics.
tot_train_data = torchvision.datasets.CIFAR100(root=ROOT, train=True, download=True, transform=torchvision.transforms.ToTensor())
test_data = torchvision.datasets.CIFAR100(root=ROOT, train=False, download=True, transform=torchvision.transforms.ToTensor())

100%|██████████| 169M/169M [00:03<00:00, 48.0MB/s]


In [None]:
""" SPLIT TOT_TRAININ in VALIDATION and TRAIN """

def split_dataset(tot_train_data, valid_ratio=0.8, generator=None):
    """
    Splits the given dataset randomly into training and validation subsets.

    Args:
        tot_train_data: dataset to split (must support ``len``).
        valid_ratio: fraction of the samples assigned to the TRAINING subset
            (despite the name); the remainder becomes the validation subset.
            Must lie in [0, 1].
        generator: optional ``torch.Generator`` that makes the split
            reproducible independently of the global RNG state. When ``None``
            the global PyTorch RNG is used.

    Returns:
        ``(train_data, val_data)`` as returned by ``random_split``.

    Raises:
        ValueError: if ``valid_ratio`` is outside [0, 1]. Without this check an
            out-of-range ratio yields a negative size and a silently wrong split.
    """
    if not 0.0 <= valid_ratio <= 1.0:
        raise ValueError(f"valid_ratio must be in [0, 1], got {valid_ratio}")

    n_total = len(tot_train_data)
    train_size = int(valid_ratio * n_total)
    val_size = n_total - train_size

    # Only forward the generator when one was supplied, so the default path is unchanged.
    split_kwargs = {} if generator is None else {"generator": generator}
    train_data, val_data = random_split(tot_train_data, [train_size, val_size], **split_kwargs)
    return train_data, val_data

# split_dataset gives the `valid_ratio` fraction to the training subset:
# 80% of the original training set is used for training, the remaining 20% for validation.
train_data, val_data = split_dataset(tot_train_data, valid_ratio=0.8)

In [None]:
""" DATA TRANSFORMATION """

def data_trasform(dataset, data_augmentation=False, image_size=64):   ### train_data or tot_train_data
    """
    Returns train and val/test transforms based on dataset stats.
    Dataset (for computing mean and std) can be either training only or combined train+validation.

    Args:
        dataset: iterable of ``(image, label)`` pairs where ``image`` is a
            3-channel tensor (it is reshaped to ``(3, H*W)``).
        data_augmentation: if True, the training transforms include random
            augmentations; otherwise only resize and normalize.
        image_size: target size passed to ``Resize`` / ``RandomCrop``
            (default 64, the previously hard-coded value).

    Returns:
        ``(train_transforms, val_test_transforms)``; both end with
        ``ToTensor`` followed by ``Normalize`` using the computed statistics.

    Raises:
        ValueError: if ``dataset`` yields no samples.
    """

    # Per-channel MEAN and STD over every pixel of the dataset.
    # Sums and squared sums are accumulated (in float64) so that the result is
    # the true dataset std, not the average of per-image stds.
    channel_sum = torch.zeros(3, dtype=torch.float64)
    channel_sq_sum = torch.zeros(3, dtype=torch.float64)
    n_pixels = 0

    for img, _ in dataset:
        pixels = img.reshape(3, -1).to(torch.float64)  # flatten H*W into the second dimension
        channel_sum += pixels.sum(dim=1)
        channel_sq_sum += (pixels ** 2).sum(dim=1)
        n_pixels += pixels.shape[1]

    if n_pixels == 0:
        raise ValueError("Cannot compute normalization statistics from an empty dataset.")

    mean64 = channel_sum / n_pixels
    # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negative values from round-off.
    var64 = (channel_sq_sum / n_pixels - mean64 ** 2).clamp(min=0.0)
    mean = mean64.float()
    std = var64.sqrt().float()


    if data_augmentation:
        train_transforms = transforms.Compose([
            transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.RandomCrop(image_size, padding=4),
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(degrees=15),
            transforms.RandAugment(num_ops=2, magnitude=9),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])
    else:
        train_transforms = transforms.Compose([
            transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

    ### NO DATA AUGMENTATION for val/test!
    val_test_transforms = transforms.Compose([
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)   # Normalization using the statistics computed above
    ])


    return train_transforms, val_test_transforms

In [None]:
""" DATA TRANSFORMATION and LOADERS """

# Pristine copy of the training images (ToTensor only), used exclusively to compute
# normalization statistics. Statistics must never be computed on a dataset whose
# transform already resizes/normalizes the images.
raw_train_data = torchvision.datasets.CIFAR100(root=ROOT, train=True, download=False,
                                               transform=torchvision.transforms.ToTensor())

### ===== For hyperparameter tuning considering train_data and val_data =====
train_indices, val_indices = list(train_data.indices), list(val_data.indices)
train_transforms, val_test_transforms = data_trasform(torch.utils.data.Subset(raw_train_data, train_indices))

# The two subsets produced by the split share ONE underlying dataset object, so assigning
# `.dataset.transform` on one of them also changes the other. Each split therefore gets
# its own base dataset; the sample indices of the original split are preserved.
train_data = torch.utils.data.Subset(
    torchvision.datasets.CIFAR100(root=ROOT, train=True, download=False, transform=train_transforms),
    train_indices)
val_data = torch.utils.data.Subset(
    torchvision.datasets.CIFAR100(root=ROOT, train=True, download=False, transform=val_test_transforms),
    val_indices)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader   = DataLoader(val_data,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)



### ===== For model testing considering tot_train_data and test_data =====
# Statistics of the full training set, again taken from the raw (ToTensor-only) images.
train_transforms, val_test_transforms = data_trasform(raw_train_data)

tot_train_data = torchvision.datasets.CIFAR100(root=ROOT, train=True, download=False, transform=train_transforms)
test_data = torchvision.datasets.CIFAR100(root=ROOT, train=False, download=False, transform=val_test_transforms)

tot_train_loader = DataLoader(tot_train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
# Evaluation data does not need shuffling; a fixed order keeps evaluation deterministic.
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

### Test and Training Function

In [None]:
""" TRAINING and TESTING """

def evaluate_model(model, data_loader, criterion):
    """
    The evaluate_model function computes the average loss and accuracy of a model on a dataset.

    Args:
        model: module returning class logits of shape ``(batch, num_classes)``.
        data_loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss callable ``criterion(outputs, labels)``. The per-batch
            value is weighted by the batch size, which assumes a mean-reduced
            loss -- TODO confirm for custom criteria.

    Returns:
        ``(avg_loss, avg_acc)`` as Python floats, averaged over the samples
        actually yielded by ``data_loader`` (so ``drop_last`` or a sampler
        that visits only part of the dataset is handled correctly).

    Raises:
        ValueError: if ``data_loader`` yields no samples.

    Side effects:
        Moves ``model`` to CUDA when available and leaves it in eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    total_loss = 0.0
    total_corrects = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            batch_size = inputs.size(0)
            total_loss += loss.item() * batch_size
            total_corrects += (preds == labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError("evaluate_model received a data_loader that yielded no samples.")

    avg_loss = total_loss / total_samples
    avg_acc = total_corrects / total_samples
    return avg_loss, avg_acc




def save_checkpoint(model, optimizer, scheduler, epoch, train_losses, train_accuracies,
                    val_test_losses, val_test_accuracies, best_acc, best_loss, best_model_wts, path):
    """
    The save_checkpoint function saves the model's state, optimizer, scheduler, training/validation metrics,
    and best performance to a specified file path.

    Args:
        model, optimizer: objects whose ``state_dict()`` is stored.
        scheduler: optional scheduler; ``None`` is stored when it is falsy.
        epoch: epoch number written under the ``'epoch'`` key.
        train_losses, train_accuracies, val_test_losses, val_test_accuracies:
            metric histories, stored as given.
        best_acc, best_loss, best_model_wts: best metrics and the matching
            model state dict.
        path: destination file; its parent directory is created if missing.

    The checkpoint is first written to ``<path>.tmp`` and then moved over
    ``path`` with ``os.replace``, so an interrupted save never leaves a
    truncated file where the previous good checkpoint used to be.
    """
    dir_name = os.path.dirname(path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'val_test_losses': val_test_losses,
        'val_test_accuracies': val_test_accuracies,
        'best_acc': best_acc,
        'best_loss': best_loss,
        'best_model_state_dict': best_model_wts
    }
    tmp_path = f"{path}.tmp"
    try:
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; after a failure, clean it up.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)




def init_checkpoint(model, optimizer, scheduler, path=None, device='cpu'):
    """
    Create a fresh checkpoint or restore training state from an existing one.

    With ``path=None`` a default checkpoint holding the current model/optimizer/
    scheduler state and empty histories is written to ``checkpoints/latest.pth``.
    With a ``path``, the file is loaded (tensors mapped to ``device``) and the
    stored states are loaded into ``model``, ``optimizer`` and ``scheduler``.

    Returns:
        Tuple ``(epoch, best_acc, best_loss, train_losses, train_accuracies,
        val_test_losses, val_test_accuracies, path, best_model_wts)``.

    Raises:
        FileNotFoundError: if ``path`` is given but is not an existing file.
    """
    if path is not None:
        # ----- Resume from an existing checkpoint -----
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Checkpoint file {path} does not exist.")

        print(f"Loading checkpoint from {path}")
        state = torch.load(path, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])

        saved_scheduler = state.get('scheduler_state_dict')
        if scheduler and saved_scheduler:
            scheduler.load_state_dict(saved_scheduler)

        # Older checkpoints may lack the best weights: fall back to the weights just loaded.
        if 'best_model_state_dict' in state:
            best_model_wts = state['best_model_state_dict']
        else:
            best_model_wts = copy.deepcopy(model.state_dict())

        histories = [
            state.get(key, [])
            for key in ('train_losses', 'train_accuracies', 'val_test_losses', 'val_test_accuracies')
        ]
        return (state['epoch'],
                state.get('best_acc', 0.0),
                state.get('best_loss', 1e10),
                *histories,
                path,
                best_model_wts)

    # ----- Start from scratch at the default location -----
    os.makedirs("checkpoints", exist_ok=True)
    path = "checkpoints/latest.pth"
    print(f"Initializing new checkpoint at {path}")

    fresh_state = {
        'epoch': 1,
        'best_acc': 0.0,
        'best_loss': 1e10,
        'train_losses': [],
        'train_accuracies': [],
        'val_test_losses': [],
        'val_test_accuracies': [],
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
        'best_model_state_dict': copy.deepcopy(model.state_dict()),
    }
    torch.save(fresh_state, path)

    initial_weights = copy.deepcopy(model.state_dict())
    return 1, 0.0, 1e10, [], [], [], [], path, initial_weights




def train_test_model(model, criterion, optimizer, scheduler, train_loader, val_test_loader,
                          num_epochs=10, checkpoint_path=None, checkpoints = True, verbose = 1):
    """
    Trains a model with logging and evaluation, returning the best model and metrics.
    If a checkpoint path is provided, training will resume from the saved state in that file.

    Args:
        model: module to train; moved to CUDA when available.
        criterion: loss callable. The per-batch value is weighted by the batch
            size, which assumes a mean-reduced loss -- TODO confirm for custom criteria.
        optimizer: optimizer stepped once per batch.
        scheduler: optional LR scheduler stepped once per epoch (falsy = none).
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_test_loader: loader passed to ``evaluate_model`` after every epoch.
        num_epochs: last epoch number (epochs are 1-based).
        checkpoint_path: checkpoint to resume from. If None, a new checkpoint
            path is created and training starts from scratch.
        checkpoints: if False nothing is loaded or saved (used for the
            calibration part) and the per-epoch header is not printed.
        verbose: ``True``/``1`` prints metrics every epoch; ``'mid'`` prints
            them at epoch 1 and every 5th epoch; a falsy value prints only
            the final summary.

    Returns:
        ``(model, train_losses, val_test_losses, train_accuracies,
        val_test_accuracies)`` where ``model`` holds the weights of the epoch
        with the best validation/test accuracy.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """

    since = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # === Checkpoints (Initialize if None, Load if it exists already) ===
    if checkpoints: # Training with checkpoints
        start_epoch, best_acc, best_loss, train_losses, train_accuracies, val_test_losses, val_test_accuracies, checkpoint_path, best_model_wts = \
            init_checkpoint(model, optimizer, scheduler, path=checkpoint_path, device=device)

    else: # No checkpoint
        start_epoch = 1
        best_acc = 0.0
        best_loss = 1e10
        train_losses, train_accuracies, val_test_losses, val_test_accuracies = [], [], [], []
        best_model_wts = copy.deepcopy(model.state_dict())

    # ===== Epoch loop =====
    for epoch in range(start_epoch, num_epochs+1):
        if checkpoints:
            if verbose:
                print(f'\nEpoch {epoch}/{num_epochs}')
                print('-' * 30)

        # ===== Training =====
        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_samples = 0   # samples actually seen (correct even with drop_last or a sampler)
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_size = inputs.size(0)
            train_loss += loss.item() * batch_size
            train_corrects += (preds == labels).sum().item()
            train_samples += batch_size

        if train_samples == 0:
            raise ValueError("train_test_model received a train_loader that yielded no samples.")

        if scheduler:
            scheduler.step()

        epoch_train_loss = train_loss / train_samples
        epoch_train_acc = train_corrects / train_samples

        if len(train_losses) >= epoch:     # Overwrite this epoch's results when resuming an epoch that was never completed
            train_losses[epoch-1] = epoch_train_loss
            train_accuracies[epoch-1] = epoch_train_acc
        else:
            train_losses.append(epoch_train_loss)
            train_accuracies.append(epoch_train_acc)

        if verbose == True:  # Print each round
            print(f'Train Loss: {epoch_train_loss:.4f} Train Acc: {epoch_train_acc:.4f}')


        # ===== Validation/Test =====
        epoch_val_test_loss, epoch_val_test_acc = evaluate_model(model, val_test_loader, criterion)

        if len(val_test_losses) >= epoch:    # Overwriting the current epoch's results if resuming from this position
            val_test_losses[epoch-1] = epoch_val_test_loss
            val_test_accuracies[epoch-1] = epoch_val_test_acc
        else:
            val_test_losses.append(epoch_val_test_loss)
            val_test_accuracies.append(epoch_val_test_acc)
        if verbose == True:
            print(f'Val/Test Loss: {epoch_val_test_loss:.4f}, Val/Test Acc: {epoch_val_test_acc:.4f}')

        if verbose == 'mid':
            if epoch == 1 or epoch % 5 == 0:   # Print occasionally
                print(f"Epoch {epoch}")
                print(f'Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.4f}')
                print(f'Val/Test Loss: {epoch_val_test_loss:.4f}, Val/Test Acc: {epoch_val_test_acc:.4f}')

        # ===== Update and Save the best model =====
        if epoch_val_test_acc > best_acc:
            best_acc = epoch_val_test_acc
            best_loss = epoch_val_test_loss
            best_model_wts = copy.deepcopy(model.state_dict())

        # ===== Save checkpoint =====
        if checkpoints:
            # Store epoch + 1 so that a resumed run starts at the next epoch.
            save_checkpoint(
                model, optimizer, scheduler,
                epoch + 1,
                train_losses, train_accuracies,
                val_test_losses, val_test_accuracies,
                best_acc, best_loss, best_model_wts,
                checkpoint_path
            )

    # Training completed
    time_elapsed = time.time() - since
    print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best Acc: {best_acc:.4f}, Best Loss: {best_loss:.4f}')

    # Load best model weights
    model.load_state_dict(best_model_wts)

    return model, train_losses, val_test_losses, train_accuracies, val_test_accuracies

# (2) CENTRALIZED MODEL with SPARSE FINE-TUNING

### Functions for Masking & Fine Tuning

The Fisher Information Matrix F is a matrix that quantifies the sensitivity of the model to its parameters. Parameters that are more important have higher values in F, while less important parameters have lower values. <br>
More precisely, it measures how the model's log-likelihood $\log p_\theta(y|x)$ changes when the parameters $\theta$ are modified. The Negative Log-Likelihood (NLL) is defined as: $$\text{loss} = -\log p_\theta(y|x)$$ <br>
PyTorch computes the gradients of the loss with respect to all model parameters.
Instead of computing the full Fisher Information Matrix, only its diagonal is approximated by accumulating the squared gradients over the batches:
$$(\nabla_\theta \log p_\theta(y|x))^2$$



In [None]:
""" FISHER SCORES COMPUTATION, MASKING, FINE-TUNING """


def compute_fisher_scores(model, data_loader, device='cuda', max_batches = None):
    """
    Compute diagonal Fisher Information scores.

    For every batch, a label is sampled from the model's own predictive
    distribution, the mean negative log-likelihood of those labels is
    back-propagated, and the squared gradients are accumulated. Because the
    loss is averaged over the batch, this is a batch-level approximation of
    the per-sample diagonal Fisher.

    Args:
        model: module returning class logits; switched to eval mode.
        data_loader: iterable of ``(inputs, labels)`` batches (labels are ignored).
        device: device the inputs are moved to; the model is assumed to be
            there already.
        max_batches: optional cap on the number of batches processed.

    Returns:
        dict ``{param_name: CPU tensor}`` for every parameter with
        ``requires_grad=True``: the squared gradients averaged over the
        batches actually processed (all zeros if no batch was processed).
    """
    model.eval()
    # Only parameters that require gradients are scored. Iterating this dict (instead of
    # all parameters) avoids a KeyError for frozen parameters that still hold a stale grad.
    scored_params = {name: param for name, param in model.named_parameters() if param.requires_grad}
    fisher_scores = {name: torch.zeros_like(param, device="cpu") for name, param in scored_params.items()}

    batches_seen = 0
    for i, (inputs, _labels) in enumerate(data_loader):
        if max_batches is not None and i >= max_batches:    # Stop if a maximum number of batches is reached
            break
        inputs = inputs.to(device)

        model.zero_grad()   # Reset gradients
        outputs = model(inputs)  # Forward pass through the model

        log_probs = F.log_softmax(outputs, dim=1)   # Log-probabilities for each class
        sampled_y = torch.multinomial(log_probs.exp(), num_samples=1).squeeze(-1)     # Sample one label per input from the predicted distribution
        loss = F.nll_loss(log_probs, sampled_y)    # Negative Log-Likelihood of the sampled labels

        loss.backward()  # Backpropagate to compute gradients
        for name, param in scored_params.items():
            if param.grad is not None:
                fisher_scores[name] += (param.grad.detach().cpu() ** 2)   # Accumulate squared gradients
        batches_seen += 1

    # Normalize by the number of batches that contributed, so the scale does not depend
    # on the dataset size when max_batches truncates the pass.
    if batches_seen > 0:
        for name in fisher_scores:
            fisher_scores[name] /= batches_seen

    return fisher_scores


def calibrate_mask_centralized(model, train_loader, device='cuda', R=5, final_sparsity=0.9,
                               lr=0.01, keep='least', do_train=True, max_batches=50, criterion=None, J=1):
    """
    Progressive pruning mask for a centralized model using Fisher scores.

    Args:
        model: model whose parameters with ``requires_grad=True`` are masked.
        train_loader: loader used for Fisher estimation and mini-training.
        device: device the batches are moved to.
        R: number of calibration rounds.
        final_sparsity: target fraction of parameters to prune (mask == 0).
            The density follows ``(1 - final_sparsity) ** (r / R)`` at round r.
        lr: learning rate of the mini-training SparseSGD optimizer.
        keep: which of the still-active parameters stay active:
            'least' (lowest Fisher score), 'most' (highest) or 'random'.
        do_train: whether to perform mini-training after each round.
        max_batches: batch limit for each Fisher computation.
        criterion: mini-training loss; defaults to ``CrossEntropyLoss``.
        J: number of passes over ``train_loader`` per mini-training.

    Returns:
        final mask, dict ``{param_name: uint8 CPU tensor}`` (1 = trainable).

    Raises:
        ValueError: if ``keep`` is not one of the supported strategies.
    """
    # Validate up front so a typo does not cost a full Fisher pass first.
    if keep not in ('least', 'most', 'random'):
        raise ValueError("keep must be 'least', 'most', or 'random'")

    if criterion is None:
        criterion = torch.nn.CrossEntropyLoss()

    # Names of the trainable parameters; this fixed order is used to flatten/unflatten.
    trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]

    # Initial mask: all ones for the trainable parameters
    mask = {name: torch.ones_like(param, device="cpu", dtype=torch.uint8)
            for name, param in model.named_parameters() if param.requires_grad}

    for r in range(1, R + 1):   # Calibration rounds
        current_density = (1 - final_sparsity) ** (r / R)   # Fraction of parameters to keep (mask==1)
        current_sparsity = 1 - current_density              # Fraction of parameters to freeze (mask==0)
        print(f"[Mask Round {r}] Target Sparsity: {current_sparsity:.4f}")

        fisher_scores = compute_fisher_scores(model, train_loader, device=device, max_batches=max_batches)

        # Flatten scores and the current mask in the same parameter order
        all_scores = torch.cat([fisher_scores[name].reshape(-1) for name in trainable_params])
        active_flat = torch.cat([mask[name].reshape(-1) for name in trainable_params]).bool()
        active_idx = torch.nonzero(active_flat).squeeze(1)   # positions that are still trainable

        num_keep_global = int(all_scores.numel() * current_density)
        num_keep_global = max(1, min(num_keep_global, active_idx.numel()))   # Number of parameters to keep

        # Select ONLY among the currently active parameters. Choosing over all parameters
        # and intersecting with the previous mask afterwards would leave fewer active
        # parameters than the target density whenever the choice hits frozen entries.
        active_scores = all_scores[active_idx]
        if keep == 'least':
            _, local_idx = torch.topk(active_scores, k=num_keep_global, largest=False)
        elif keep == 'most':
            _, local_idx = torch.topk(active_scores, k=num_keep_global, largest=True)
        else:  # 'random'
            local_idx = torch.randperm(active_idx.numel())[:num_keep_global]
        idx = active_idx[local_idx]

        global_keep = torch.zeros_like(all_scores, dtype=torch.bool)     # All False
        global_keep[idx] = True                                          # True if to keep

        # --- Redistribute the flat mask back to each parameter's shape ---
        new_mask = {}
        start = 0
        for name in trainable_params:
            numel = mask[name].numel()
            # idx is a subset of the active positions, so frozen entries stay frozen.
            new_mask[name] = global_keep[start:start+numel].view_as(mask[name]).to(torch.uint8)
            start += numel  # Advance to the next parameter

        mask = new_mask

        # Compute the total number of active parameters in the mask
        total_ones = sum(m.sum().item() for m in mask.values())
        total_params = sum(m.numel() for m in mask.values())
        perc_active = 100 * total_ones / total_params
        print(f"Total active parameters in the mask: {total_ones} / {total_params} ({perc_active:.2f}%)")

        if do_train: # Mini-training, using SparseSGD
            optimizer = SparseSGD(list(model.named_parameters()), lr=lr, mask=mask)
            model.train()

            for _ in range(J):   # J passes over the training data per calibration round
                for inputs, labels in train_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

    return mask


In [None]:
from torch.optim.optimizer import Optimizer    # To inherit  for the custom SparseSGD

class SparseSGD(Optimizer):
    """
    Sparse SGD with momentum and parameter masks (updates corresponding to zeros in the mask are suppressed)
    """

    def __init__(self, named_params, lr=0.01, momentum=0, weight_decay=0, mask=None):
        """
        Args:
            named_params (iterable of (name, parameter)): parameters to optimize,
                e.g. ``model.named_parameters()``. Any iterable is accepted,
                including a generator.
            lr (float): learning rate.
            momentum (float): momentum factor.
            weight_decay (float): L2 penalty coefficient.
            mask (dict {param_name: tensor} or None): a binary mask for the parameters.
        """
        # Materialize once: the pairs are traversed twice below, and a generator
        # would be exhausted after the first pass, silently leaving no masks.
        named_params = list(named_params)
        params = [p for _, p in named_params]
        defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay)
        super().__init__(params, defaults)  # initialize base Optimizer

        # Parameter object -> mask tensor (moved to the parameter's device)
        self.masks = {}
        if mask is not None:
            for name, p in named_params:
                if name in mask:
                    self.masks[p] = mask[name].to(p.device)

    # no_grad: the step updates parameters manually and these updates must not
    # become part of the autograd graph.
    @torch.no_grad()
    def step(self, closure=None):
        """
        Performs a single optimization step.

        Args:
            closure (callable, optional): re-evaluates the model and returns the loss.

        Returns:
            The value returned by ``closure``, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue  # skip parameters without gradients

                d_p = p.grad  # get gradient

                # Apply weight decay if specified
                if group['weight_decay'] != 0:
                    d_p = d_p.add(p, alpha=group['weight_decay'])

                # Momentum buffer (kept even when momentum is 0, in which case it equals the gradient)
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = d_p.clone()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(group['momentum']).add_(d_p)
                d_p = buf

                # Apply the sparse mask last, so masked entries receive no update at all
                # (neither gradient, momentum nor weight decay).
                mask = self.masks.get(p, None)
                if mask is not None:
                    d_p = d_p * mask

                # Update the parameter
                p.add_(d_p, alpha=-group['lr'])

        return loss

## (2.I) Hyperparameter Selection



In [None]:
""" QUICK HEAD TRAINING for the VALIDATION """

set_seed(seed=123)

# --- Hyperparameters of the head-only training ---
N_EP = 10
T_MAX = N_EP            # cosine schedule spans the whole run
LR = 0.001
MOMENTUM = 0.8
WEIGHT_DECAY = 5e-6


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# === MODEL ===
dino_vits16 = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
dino_vits16.eval()  # evaluation mode for the backbone
dino_vits16 = dino_vits16.to(device)

# Freeze every backbone parameter
dino_vits16.requires_grad_(False)

# Final model: frozen backbone + trainable linear head
model_central = DINOWithHead(dino_vits16, num_classes=100).to(device)
model_central.head.requires_grad_(True)
model_central.head.train()   # Training mode for head

# === LOSS, OPTIMIZER, SCHEDULER ===
criterion = nn.CrossEntropyLoss()
# Only the head parameters are handed to the optimizer
optimizer = torch.optim.SGD(model_central.head.parameters(),
                            lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_MAX)

# None: start from scratch; a new checkpoint file is created by the training function
checkpoint_path = None

start_time = time.time()
final_model, train_losses, test_losses, train_accuracies, test_accuracies = train_test_model(
    model=model_central,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,     # training set
    val_test_loader=val_loader,    # validation set
    num_epochs=N_EP,
    checkpoint_path=checkpoint_path,
)
end_time = time.time()
elapsed_time = end_time - start_time

# === SAVING RESULTS ===
# The "test_*" entries hold the metrics measured on the validation loader passed above.
results = {
    "epochs": N_EP,
    "train_losses": train_losses,
    "train_accuracies": train_accuracies,
    "test_losses": test_losses,
    "test_accuracies": test_accuracies,
    "time_sec": round(elapsed_time, 2)
}
print(results)

json_filename = "results_central_VALIDATION.json"

with open(json_filename, 'w') as f:
    json.dump(results, f, indent=2)
torch.save(final_model.state_dict(), "final_model_weights_VALIDATION.pth")


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main


Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/10
------------------------------
Train Loss: 2.1737 Train Acc: 0.5098
Val/Test Loss: 1.2196 Acc: 0.6664

Epoch 2/10
------------------------------
Train Loss: 0.9788 Train Acc: 0.7208
Val/Test Loss: 1.0209 Acc: 0.7109

Epoch 3/10
------------------------------
Train Loss: 0.7878 Train Acc: 0.7690
Val/Test Loss: 0.9631 Acc: 0.7249

Epoch 4/10
------------------------------
Train Loss: 0.6884 Train Acc: 0.7959
Val/Test Loss: 0.9257 Acc: 0.7354

Epoch 5/10
------------------------------
Train Loss: 0.6240 Train Acc: 0.8130
Val/Test Loss: 0.9042 Acc: 0.7425

Epoch 6/10
------------------------------
Train Loss: 0.5797 Train Acc: 0.8277
Val/Test Loss: 0.8966 Acc: 0.7439

Epoch 7/10
------------------------------
Train Loss: 0.5496 Train Acc: 0.8373
Val/Test Loss: 0.8914 Acc: 0.7450

Epoch 8/10
------------------------------
Train Loss: 0.5283 Train Acc: 0.8442
Val/Test Loss: 0.8885 Acc: 0.7462

Epoch 9/10
----------------------

In [None]:
def wandb_train_sparse(config=None, pretrained_weights=None, train_loader=None, val_loader=None, verbose = False):
    """Run one W&B trial: fine-tune the DINO backbone (optionally sparsely) with a frozen head.

    The function opens a W&B run, rebuilds ``DINOWithHead`` around a freshly loaded
    DINO ViT-S/16 backbone, restores ``pretrained_weights`` into it, freezes the
    head and trains every other parameter. When ``config.target_sparsity > 0`` a
    mask is obtained from ``calibrate_mask_centralized`` and handed to ``SparseSGD``;
    otherwise plain ``torch.optim.SGD`` is used.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``. Inside a
            sweep the agent supplies the values. Keys read here: ``lr``, ``momentum``,
            ``weight_decay``, ``epochs``, ``target_sparsity``, ``rounds``, ``keep``,
            ``max_batches`` and, optionally, ``verbose``.
        pretrained_weights: Path to a ``DINOWithHead`` state_dict saved with
            ``torch.save(model.state_dict(), ...)``. Required.
        train_loader: Loader used for mask calibration and for training.
        val_loader: Loader passed to ``train_test_model`` for per-epoch evaluation.
        verbose: Fallback verbosity used when the run config has no ``verbose`` key.

    Returns:
        None. Per-epoch metrics and best-epoch summaries are written to the W&B run.

    Raises:
        ValueError: If ``pretrained_weights`` is None.
    """
    if pretrained_weights is None:
        # Fail early with a clear message instead of an obscure error inside torch.load.
        raise ValueError("pretrained_weights must be the path to a saved DINOWithHead state_dict")

    with wandb.init(config=config, reinit=True):
        config = wandb.config
        # The sweep value wins; the function argument is only a fallback, so calling
        # this function with a config that has no 'verbose' key no longer fails.
        verbose = config.get("verbose", verbose)

        # === MODEL ===
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        dino_vits16 = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
        dino_vits16.train()  # train mode for the backbone

        # Build the full model and restore the weights of the previously trained model.
        model = DINOWithHead(dino_vits16, num_classes=100).to(device)
        # The file holds a plain state_dict, so restrict unpickling to tensors; this
        # also removes the FutureWarning emitted by recent PyTorch versions.
        checkpoint = torch.load(pretrained_weights, map_location=device, weights_only=True)
        model.load_state_dict(checkpoint)

        # Freeze the (already trained) head and train everything else.
        # Candidates that were considered for freezing as well but are left trainable:
        #   norm:        small changes here can distort global statistics
        #   patch_embed: initial representation => risk of degrading features
        for name, param in model.named_parameters():
            param.requires_grad = "head" not in name
        model.head.eval()   # evaluation mode for the head

        # === CALIBRATION + OPTIMIZER ===
        mask = None
        calib_time = 0.0
        if config.target_sparsity > 0:
            print("Calibration...")
            print(f"Applying Fisher mask with sparsity {config.target_sparsity}")
            start_calib = time.time()
            mask = calibrate_mask_centralized(model, train_loader, device, R=config.rounds, final_sparsity=config.target_sparsity,
                                              lr=config.lr, keep=config.keep, do_train=True, max_batches=config.max_batches)
            calib_time = time.time() - start_calib
            print(f"Fisher mask calibration took {calib_time:.2f} seconds")

            optimizer = SparseSGD(
                list(model.named_parameters()),
                lr=config.lr,
                momentum=config.momentum,
                weight_decay=config.weight_decay,
                mask=mask
            )
        else:
            # Dense run: nothing to calibrate, so no calibration time is reported.
            print("target_sparsity is 0: skipping mask calibration, using dense SGD")
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=config.lr,
                                        momentum=config.momentum,
                                        weight_decay=config.weight_decay)

        # === LOSS, SCHEDULER ===
        criterion = nn.CrossEntropyLoss()
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.epochs)

        # === TRAIN ===
        model, train_losses, val_losses, train_accs, val_accs = train_test_model(
            model,
            criterion,
            optimizer,
            scheduler,
            train_loader,
            val_loader,
            num_epochs=config.epochs,
            verbose=verbose
        )

        # === LOG ===
        # Iterate over the history that was actually returned rather than over
        # config.epochs, so a shorter history cannot raise an IndexError.
        for epoch, (train_loss, train_acc) in enumerate(zip(train_losses, train_accs)):
            metrics = {
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "train_accuracy": train_acc,
            }
            if val_losses and epoch < len(val_losses):
                metrics["val_loss"] = val_losses[epoch]
            if val_accs and epoch < len(val_accs):
                metrics["val_accuracy"] = val_accs[epoch]
            wandb.log(metrics)

        # === BEST LOG ===
        # Calibration time does not depend on validation metrics, so always record it.
        wandb.run.summary["calibration_time_sec"] = calib_time

        if val_accs:
            # "Best" epoch = highest validation accuracy; the other summaries are
            # the values observed at that same epoch.
            best_idx = val_accs.index(max(val_accs))

            wandb.run.summary["best_val_accuracy"] = val_accs[best_idx]
            wandb.run.summary["best_val_loss"] = val_losses[best_idx]
            wandb.run.summary["best_train_accuracy"] = train_accs[best_idx]
            wandb.run.summary["best_train_loss"] = train_losses[best_idx]


### Parameter Grid - High sparsity

In [None]:
wandb.login()

# Search space for the HIGH-sparsity sweep (80% / 90% target sparsity).
# 'bayes' fits a probabilistic model on previous trials to propose promising
# hyperparameter combinations; trials are ranked by the logged 'val_loss'.
# Note: 'batch_size' is recorded with the run but wandb_train_sparse does not read
# it -- the loaders are built outside and passed in already batched.
sweep_config = dict(
    method='bayes',
    metric=dict(name='val_loss', goal='minimize'),
    parameters=dict(
        # Log-uniform sampling because the values span orders of magnitude.
        lr=dict(distribution='log_uniform_values', min=0.0001, max=0.01),
        batch_size=dict(value=64),
        momentum=dict(values=[0.8, 0.9]),
        # Log-uniform sampling because the values span orders of magnitude.
        weight_decay=dict(distribution='log_uniform_values', min=1e-6, max=1e-4),
        epochs=dict(value=7),
        target_sparsity=dict(values=[0.8, 0.9]),
        rounds=dict(value=1),
        max_batches=dict(values=[20]),
        verbose=dict(value=True),
        keep=dict(value='least'),
    ),
)

sweep_id = wandb.sweep(sweep_config, project="Project_Sparse_Grid_High")

[34m[1mwandb[0m: Currently logged in as: [33mgabriele_[0m ([33mgabriele-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: xaqu0vzw
Sweep URL: https://wandb.ai/gabriele-politecnico-di-torino/Project_Sparse_Grid_High/sweeps/xaqu0vzw


In [None]:
def run_high_sparsity_trial():
    """Entry point for the sweep agent: one sparse fine-tuning trial.

    Starts from the weights saved by the centralized validation run and uses the
    notebook-level train/validation loaders (looked up when the trial starts).
    """
    return wandb_train_sparse(
        pretrained_weights="final_model_weights_VALIDATION.pth",
        train_loader=train_loader,
        val_loader=val_loader,
    )


# Execute 5 trials of the sweep created in the previous cell.
wandb.agent(sweep_id, function=run_high_sparsity_trial, count=5)

[34m[1mwandb[0m: Agent Starting Run: b4s28pih with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 0.0010032743832587992
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.8
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 6.41967039603487e-05


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.8
[Mask Round 1] Sparsity: 0.8000
Fisher mask calibration took 49.88 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.8129 Train Acc: 0.7568
Val/Test Loss: 0.6938 Acc: 0.7855

Epoch 2/7
------------------------------
Train Loss: 0.4725 Train Acc: 0.8505
Val/Test Loss: 0.6232 Acc: 0.8057

Epoch 3/7
------------------------------
Train Loss: 0.2927 Train Acc: 0.9098
Val/Test Loss: 0.5936 Acc: 0.8192

Epoch 4/7
------------------------------
Train Loss: 0.1687 Train Acc: 0.9551
Val/Test Loss: 0.5607 Acc: 0.8303

Epoch 5/7
------------------------------
Train Loss: 0.0983 Train Acc: 0.9823
Val/Test Loss: 0.5581 Acc: 0.8319

Epoch 6/7
------------------------------
Train Loss: 0.0682 Train Acc: 0.9929
Val/Test Loss: 0.5576 Acc: 0.8329

Epoch 7/7
------------------------------
Train Loss: 0.0576 Train Acc: 0.9955
Val/Test Loss: 0.5590 Acc: 0.8342

Training complete in 6m 2

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▄▅▇███
train_loss,█▅▃▂▁▁▁
val_accuracy,▁▄▆▇███
val_loss,█▄▃▁▁▁▁

0,1
best_train_accuracy,0.99545
best_train_loss,0.05763
best_val_accuracy,0.8342
best_val_loss,0.55902
calibration_time_sec,49.88009
epoch,7.0
train_accuracy,0.99545
train_loss,0.05763
val_accuracy,0.8342
val_loss,0.55902


[34m[1mwandb[0m: Agent Starting Run: 3rvj4m2s with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 0.00016698364717152784
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.8
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 4.205034039496119e-06


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main


Calibration...
Applying Fisher mask with sparsity 0.8
[Mask Round 1] Sparsity: 0.8000


  checkpoint = torch.load(pretrained_weights, map_location=device)


Fisher mask calibration took 50.97 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.7011 Train Acc: 0.7894
Val/Test Loss: 0.6232 Acc: 0.8094

Epoch 2/7
------------------------------
Train Loss: 0.4870 Train Acc: 0.8486
Val/Test Loss: 0.5843 Acc: 0.8213

Epoch 3/7
------------------------------
Train Loss: 0.3798 Train Acc: 0.8833
Val/Test Loss: 0.5771 Acc: 0.8232

Epoch 4/7
------------------------------
Train Loss: 0.3011 Train Acc: 0.9126
Val/Test Loss: 0.5554 Acc: 0.8300

Epoch 5/7
------------------------------
Train Loss: 0.2456 Train Acc: 0.9342
Val/Test Loss: 0.5536 Acc: 0.8348

Epoch 6/7
------------------------------
Train Loss: 0.2118 Train Acc: 0.9485
Val/Test Loss: 0.5500 Acc: 0.8347

Epoch 7/7
------------------------------
Train Loss: 0.1946 Train Acc: 0.9557
Val/Test Loss: 0.5501 Acc: 0.8360


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 26s
Best Acc: 0.8360, Best Loss: 0.5501


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▃▅▆▇██
train_loss,█▅▄▂▂▁▁
val_accuracy,▁▄▅▆███
val_loss,█▄▄▂▁▁▁

0,1
best_train_accuracy,0.9557
best_train_loss,0.19456
best_val_accuracy,0.836
best_val_loss,0.55012
calibration_time_sec,50.97492
epoch,7.0
train_accuracy,0.9557
train_loss,0.19456
val_accuracy,0.836
val_loss,0.55012


[34m[1mwandb[0m: Agent Starting Run: ec42s5g4 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 0.001921924501734435
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.9
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 9.706911642308908e-05


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.9
[Mask Round 1] Sparsity: 0.9000
Fisher mask calibration took 50.82 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.8077 Train Acc: 0.7595
Val/Test Loss: 0.6394 Acc: 0.8038

Epoch 2/7
------------------------------
Train Loss: 0.5252 Train Acc: 0.8363
Val/Test Loss: 0.6344 Acc: 0.8089

Epoch 3/7
------------------------------
Train Loss: 0.3727 Train Acc: 0.8840
Val/Test Loss: 0.5577 Acc: 0.8299

Epoch 4/7
------------------------------
Train Loss: 0.2534 Train Acc: 0.9246
Val/Test Loss: 0.5639 Acc: 0.8281

Epoch 5/7
------------------------------
Train Loss: 0.1789 Train Acc: 0.9526
Val/Test Loss: 0.5562 Acc: 0.8325

Epoch 6/7
------------------------------
Train Loss: 0.1322 Train Acc: 0.9716
Val/Test Loss: 0.5509 Acc: 0.8359

Epoch 7/7
------------------------------
Train Loss: 0.1120 Train Acc: 0.9808
Val/Test Loss: 0.5461 Acc: 0.8375


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 23s
Best Acc: 0.8375, Best Loss: 0.5461


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▃▅▆▇██
train_loss,█▅▄▂▂▁▁
val_accuracy,▁▂▆▆▇██
val_loss,██▂▂▂▁▁

0,1
best_train_accuracy,0.98075
best_train_loss,0.11195
best_val_accuracy,0.8375
best_val_loss,0.54608
calibration_time_sec,50.82332
epoch,7.0
train_accuracy,0.98075
train_loss,0.11195
val_accuracy,0.8375
val_loss,0.54608


[34m[1mwandb[0m: Agent Starting Run: gltmzkm0 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 0.005754899028159509
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.8
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 4.885902844661093e-05


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.8
[Mask Round 1] Sparsity: 0.8000
Fisher mask calibration took 50.60 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 3.9637 Train Acc: 0.0961
Val/Test Loss: 3.6377 Acc: 0.1422

Epoch 2/7
------------------------------
Train Loss: 3.3718 Train Acc: 0.1878
Val/Test Loss: 3.2120 Acc: 0.2174

Epoch 3/7
------------------------------
Train Loss: 2.5541 Train Acc: 0.3444
Val/Test Loss: 1.5729 Acc: 0.5557

Epoch 4/7
------------------------------
Train Loss: 1.1920 Train Acc: 0.6547
Val/Test Loss: 1.3002 Acc: 0.6243

Epoch 5/7
------------------------------
Train Loss: 0.6169 Train Acc: 0.8107
Val/Test Loss: 0.9678 Acc: 0.7148

Epoch 6/7
------------------------------
Train Loss: 0.2610 Train Acc: 0.9171
Val/Test Loss: 0.8648 Acc: 0.7585

Epoch 7/7
------------------------------
Train Loss: 0.0984 Train Acc: 0.9756
Val/Test Loss: 0.7787 Acc: 0.7855


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 22s
Best Acc: 0.7855, Best Loss: 0.7787


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▂▃▅▇██
train_loss,█▇▅▃▂▁▁
val_accuracy,▁▂▅▆▇██
val_loss,█▇▃▂▁▁▁

0,1
best_train_accuracy,0.97555
best_train_loss,0.09838
best_val_accuracy,0.7855
best_val_loss,0.77866
calibration_time_sec,50.5951
epoch,7.0
train_accuracy,0.97555
train_loss,0.09838
val_accuracy,0.7855
val_loss,0.77866


[34m[1mwandb[0m: Agent Starting Run: bhuofpau with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 0.0005232212761533024
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.9
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 7.12135932919634e-05


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.9
[Mask Round 1] Sparsity: 0.9000
Fisher mask calibration took 50.49 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.6991 Train Acc: 0.7919
Val/Test Loss: 0.6185 Acc: 0.8138

Epoch 2/7
------------------------------
Train Loss: 0.5180 Train Acc: 0.8390
Val/Test Loss: 0.5806 Acc: 0.8249

Epoch 3/7
------------------------------
Train Loss: 0.4339 Train Acc: 0.8655
Val/Test Loss: 0.5697 Acc: 0.8260

Epoch 4/7
------------------------------
Train Loss: 0.3709 Train Acc: 0.8876
Val/Test Loss: 0.5548 Acc: 0.8303

Epoch 5/7
------------------------------
Train Loss: 0.3232 Train Acc: 0.9057
Val/Test Loss: 0.5575 Acc: 0.8314

Epoch 6/7
------------------------------
Train Loss: 0.2924 Train Acc: 0.9177
Val/Test Loss: 0.5511 Acc: 0.8330

Epoch 7/7
------------------------------
Train Loss: 0.2756 Train Acc: 0.9250
Val/Test Loss: 0.5511 Acc: 0.8320


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 26s
Best Acc: 0.8330, Best Loss: 0.5511


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▃▅▆▇██
train_loss,█▅▄▃▂▁▁
val_accuracy,▁▅▅▇▇██
val_loss,█▄▃▁▂▁▁

0,1
best_train_accuracy,0.91768
best_train_loss,0.29242
best_val_accuracy,0.833
best_val_loss,0.55105
calibration_time_sec,50.48658
epoch,7.0
train_accuracy,0.925
train_loss,0.27561
val_accuracy,0.832
val_loss,0.55107


### Parameter Grid - Low sparsity

In [None]:
wandb.login()

# Search space for the LOW-sparsity sweep (20% / 30% target sparsity).
# 'bayes' fits a probabilistic model on previous trials to propose promising
# hyperparameter combinations; trials are ranked by the logged 'val_loss'.
# Note: 'batch_size' is recorded with the run but wandb_train_sparse does not read
# it -- the loaders are built outside and passed in already batched.
sweep_config = dict(
    method='bayes',
    metric=dict(name='val_loss', goal='minimize'),
    parameters=dict(
        # Log-uniform sampling because the values span orders of magnitude.
        lr=dict(distribution='log_uniform_values', min=1e-6, max=0.001),
        batch_size=dict(value=64),
        momentum=dict(values=[0.8, 0.9]),
        # Log-uniform sampling because the values span orders of magnitude.
        weight_decay=dict(distribution='log_uniform_values', min=1e-6, max=1e-4),
        epochs=dict(value=7),
        target_sparsity=dict(values=[0.2, 0.3]),
        rounds=dict(value=1),
        max_batches=dict(values=[20]),
        verbose=dict(value=True),
        keep=dict(value='least'),
    ),
)

sweep_id = wandb.sweep(sweep_config, project="Project_Sparse_Grid_Low")

Create sweep with ID: mc0bcsnf
Sweep URL: https://wandb.ai/gabriele-politecnico-di-torino/Project_Sparse_Grid_Mid/sweeps/mc0bcsnf


In [None]:
def run_low_sparsity_trial():
    """Entry point for the sweep agent: one sparse fine-tuning trial.

    Starts from the weights saved by the centralized validation run and uses the
    notebook-level train/validation loaders (looked up when the trial starts).
    """
    return wandb_train_sparse(
        pretrained_weights="final_model_weights_VALIDATION.pth",
        train_loader=train_loader,
        val_loader=val_loader,
    )


# Execute 5 trials of the sweep created in the previous cell.
wandb.agent(sweep_id, function=run_low_sparsity_trial, count=5)

[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 143y4eyt with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 3.231532659282642e-05
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.3
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 1.5826113863967493e-06


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.3
[Mask Round 1] Target Sparsity: 0.3000
Totale parametri attivi nella mask: 15165964 / 21665664 (70.00%)
Fisher mask calibration took 50.78 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.5346 Train Acc: 0.8421
Val/Test Loss: 0.5120 Acc: 0.8498

Epoch 2/7
------------------------------
Train Loss: 0.4507 Train Acc: 0.8655
Val/Test Loss: 0.4978 Acc: 0.8518

Epoch 3/7
------------------------------
Train Loss: 0.3954 Train Acc: 0.8835
Val/Test Loss: 0.4925 Acc: 0.8536

Epoch 4/7
------------------------------
Train Loss: 0.3556 Train Acc: 0.8982
Val/Test Loss: 0.4901 Acc: 0.8539

Epoch 5/7
------------------------------
Train Loss: 0.3288 Train Acc: 0.9090
Val/Test Loss: 0.4891 Acc: 0.8534

Epoch 6/7
------------------------------
Train Loss: 0.3130 Train Acc: 0.9151
Val/Test Loss: 0.4887 Acc: 0.8531

Epoch 7/7
------------------------------
Train Loss: 0.3056 Train 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 23s
Best Acc: 0.8539, Best Loss: 0.4901


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▃▅▆▇██
train_loss,█▅▄▃▂▁▁
val_accuracy,▁▄▇█▇▇▇
val_loss,█▄▂▁▁▁▁

0,1
best_train_accuracy,0.8982
best_train_loss,0.35564
best_val_accuracy,0.8539
best_val_loss,0.49006
calibration_time_sec,50.78019
epoch,7.0
train_accuracy,0.91718
train_loss,0.30556
val_accuracy,0.8532
val_loss,0.48862


[34m[1mwandb[0m: Agent Starting Run: vrs9rd8a with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 2.4372203058485064e-06
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.2
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 6.917469047724878e-06


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.2
[Mask Round 1] Target Sparsity: 0.2000
Totale parametri attivi nella mask: 17332531 / 21665664 (80.00%)
Fisher mask calibration took 52.26 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.5568 Train Acc: 0.8361
Val/Test Loss: 0.5392 Acc: 0.8439

Epoch 2/7
------------------------------
Train Loss: 0.5214 Train Acc: 0.8461
Val/Test Loss: 0.5282 Acc: 0.8455

Epoch 3/7
------------------------------
Train Loss: 0.4998 Train Acc: 0.8517
Val/Test Loss: 0.5218 Acc: 0.8472

Epoch 4/7
------------------------------
Train Loss: 0.4842 Train Acc: 0.8564
Val/Test Loss: 0.5180 Acc: 0.8479

Epoch 5/7
------------------------------
Train Loss: 0.4736 Train Acc: 0.8595
Val/Test Loss: 0.5159 Acc: 0.8480

Epoch 6/7
------------------------------
Train Loss: 0.4673 Train Acc: 0.8613
Val/Test Loss: 0.5149 Acc: 0.8482

Epoch 7/7
------------------------------
Train Loss: 0.4644 Train 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 14s
Best Acc: 0.8483, Best Loss: 0.5147


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▄▅▆▇██
train_loss,█▅▄▃▂▁▁
val_accuracy,▁▄▆▇███
val_loss,█▅▃▂▁▁▁

0,1
best_train_accuracy,0.86215
best_train_loss,0.46435
best_val_accuracy,0.8483
best_val_loss,0.51468
calibration_time_sec,52.2597
epoch,7.0
train_accuracy,0.86215
train_loss,0.46435
val_accuracy,0.8483
val_loss,0.51468


[34m[1mwandb[0m: Agent Starting Run: fwyz08yd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 5.759944721447012e-05
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.2
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 3.072055659566905e-06


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.2
[Mask Round 1] Target Sparsity: 0.2000
Totale parametri attivi nella mask: 17332531 / 21665664 (80.00%)
Fisher mask calibration took 50.45 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.5186 Train Acc: 0.8447
Val/Test Loss: 0.4910 Acc: 0.8543

Epoch 2/7
------------------------------
Train Loss: 0.3591 Train Acc: 0.8956
Val/Test Loss: 0.4820 Acc: 0.8546

Epoch 3/7
------------------------------
Train Loss: 0.2647 Train Acc: 0.9308
Val/Test Loss: 0.4808 Acc: 0.8544

Epoch 4/7
------------------------------
Train Loss: 0.2051 Train Acc: 0.9535
Val/Test Loss: 0.4847 Acc: 0.8525

Epoch 5/7
------------------------------
Train Loss: 0.1704 Train Acc: 0.9664
Val/Test Loss: 0.4862 Acc: 0.8516

Epoch 6/7
------------------------------
Train Loss: 0.1519 Train Acc: 0.9735
Val/Test Loss: 0.4869 Acc: 0.8519

Epoch 7/7
------------------------------
Train Loss: 0.1438 Train 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.



Training complete in 6m 11s
Best Acc: 0.8546, Best Loss: 0.4820


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▄▆▇▇██
train_loss,█▅▃▂▁▁▁
val_accuracy,▇██▃▁▂▂
val_loss,█▂▁▄▅▅▆

0,1
best_train_accuracy,0.89563
best_train_loss,0.35913
best_val_accuracy,0.8546
best_val_loss,0.48198
calibration_time_sec,50.44975
epoch,7.0
train_accuracy,0.97643
train_loss,0.14378
val_accuracy,0.8519
val_loss,0.4875


[34m[1mwandb[0m: Agent Starting Run: b8t9kblf with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 6.05649595492581e-05
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.3
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 1.710005772392436e-06


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.3
[Mask Round 1] Target Sparsity: 0.3000
Totale parametri attivi nella mask: 15165964 / 21665664 (70.00%)
Fisher mask calibration took 50.49 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.5251 Train Acc: 0.8426
Val/Test Loss: 0.4994 Acc: 0.8509

Epoch 2/7
------------------------------
Train Loss: 0.3989 Train Acc: 0.8826
Val/Test Loss: 0.4885 Acc: 0.8538

Epoch 3/7
------------------------------
Train Loss: 0.3183 Train Acc: 0.9119
Val/Test Loss: 0.4867 Acc: 0.8534

Epoch 4/7
------------------------------
Train Loss: 0.2641 Train Acc: 0.9315
Val/Test Loss: 0.4846 Acc: 0.8533

Epoch 5/7
------------------------------
Train Loss: 0.2299 Train Acc: 0.9454
Val/Test Loss: 0.4848 Acc: 0.8537

Epoch 6/7
------------------------------
Train Loss: 0.2107 Train Acc: 0.9524
Val/Test Loss: 0.4852 Acc: 0.8534

Epoch 7/7
------------------------------
Train Loss: 0.2020 Train 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▃▅▆▇██
train_loss,█▅▄▂▂▁▁
val_accuracy,▁█▇▇█▇▇
val_loss,█▃▂▁▁▁▁

0,1
best_train_accuracy,0.88257
best_train_loss,0.39892
best_val_accuracy,0.8538
best_val_loss,0.48846
calibration_time_sec,50.48786
epoch,7.0
train_accuracy,0.95622
train_loss,0.20202
val_accuracy,0.8533
val_loss,0.48535


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: e056f9ho with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	keep: least
[34m[1mwandb[0m: 	lr: 8.620881643604889e-05
[34m[1mwandb[0m: 	max_batches: 20
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	rounds: 1
[34m[1mwandb[0m: 	target_sparsity: 0.3
[34m[1mwandb[0m: 	verbose: True
[34m[1mwandb[0m: 	weight_decay: 1.1864286139453503e-06


Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_dino_main
  checkpoint = torch.load(pretrained_weights, map_location=device)


Calibration...
Applying Fisher mask with sparsity 0.3
[Mask Round 1] Target Sparsity: 0.3000
Totale parametri attivi nella mask: 15165964 / 21665664 (70.00%)
Fisher mask calibration took 50.44 seconds
Initializing new checkpoint at checkpoints/latest.pth

Epoch 1/7
------------------------------
Train Loss: 0.5185 Train Acc: 0.8436
Val/Test Loss: 0.4922 Acc: 0.8543

Epoch 2/7
------------------------------
Train Loss: 0.3590 Train Acc: 0.8956
Val/Test Loss: 0.4806 Acc: 0.8553

Epoch 3/7
------------------------------
Train Loss: 0.2630 Train Acc: 0.9315
Val/Test Loss: 0.4809 Acc: 0.8540

Epoch 4/7
------------------------------
Train Loss: 0.2033 Train Acc: 0.9548
Val/Test Loss: 0.4837 Acc: 0.8524

Epoch 5/7
------------------------------
Train Loss: 0.1684 Train Acc: 0.9674
Val/Test Loss: 0.4856 Acc: 0.8519

Epoch 6/7
------------------------------
Train Loss: 0.1499 Train Acc: 0.9741
Val/Test Loss: 0.4862 Acc: 0.8515

Epoch 7/7
------------------------------
Train Loss: 0.1417 Train 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▄▆▇▇██
train_loss,█▅▃▂▁▁▁
val_accuracy,▆█▆▃▂▁▁
val_loss,█▁▁▃▄▄▅

0,1
best_train_accuracy,0.89558
best_train_loss,0.35903
best_val_accuracy,0.8553
best_val_loss,0.4806
calibration_time_sec,50.43662
epoch,7.0
train_accuracy,0.97708
train_loss,0.14175
val_accuracy,0.8516
val_loss,0.48659


## (2.II) Final model (Baseline)

In [None]:
# Final baseline run in two consecutive phases on the same model:
#   1) train only the classification head (backbone frozen);
#   2) freeze the head and fine-tune the backbone with a masked (sparse) SGD.
# Per-epoch test losses/accuracies and wall-clock time of both phases are written
# to results_head_backbone.json.
set_seed(123)

# === PARAMS ===
SPARSITY = 0.5          # passed as final_sparsity to calibrate_mask_centralized
LR = 5e-4               # learning rate used in both phases and for calibration
N_EP_HEAD = 50          # epochs of head-only training
N_EP_BACKBONE = 100     # epochs of sparse backbone training
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-5
KEEP = 'least'          # forwarded as keep=...; semantics defined by calibrate_mask_centralized
MAX_B = 20              # forwarded as max_batches=... to the calibration
C_ROUNDS = 2            # forwarded as R=... (number of calibration rounds, per the "[Mask Round i]" logs)
VERBOSE = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = {}    # placeholder; replaced by the final results dict at the end of the cell


# === LOAD PRETRAINED DINO ===
print("Downloading DINO ViT-S/16...")
dino_vits16 = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
dino_vits16.eval()  # Backbone in eval mode
dino_vits16 = dino_vits16.to(device)

# === BUILD FINAL MODEL WITH HEAD ===
NUM_CLASSES = 100
model = DINOWithHead(dino_vits16, num_classes=NUM_CLASSES).to(device)



####################
# === Head-only  ===

# Work on a copy so `model` keeps the untouched pretrained backbone + fresh head.
model_dino = copy.deepcopy(model).to(device)
model_dino.head.train()   # Head in train mode


for name, param in model_dino.named_parameters():
    param.requires_grad = ("head" in name)  # Only head = True

criterion = nn.CrossEntropyLoss()
# Only the trainable (head) parameters are handed to the optimizer.
optimizer = torch.optim.SGD(
    [p for p in model_dino.parameters() if p.requires_grad],
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY
)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EP_HEAD)

# Trains on the full training loader and evaluates on test_loader after each epoch;
# the training-set histories (2nd and 4th return values) are discarded.
# NOTE(review): the returned model is named model_best -- if train_test_model picks
# it using the evaluation loader, the test set is used for model selection; confirm.
start_time = time.time()
model_best, _, test_losses_head, _, test_acc_head = train_test_model(
    model_dino, criterion, optimizer, scheduler,
    tot_train_loader, val_test_loader=test_loader,
    num_epochs=N_EP_HEAD, checkpoint_path=None, verbose=VERBOSE
)
elapsed_head = time.time() - start_time
print(f"Finished head-only training in {elapsed_head:.2f}s")


########################
# === Backbone-only  ===

for name, param in model_best.named_parameters():    # Continuing with the same model
    param.requires_grad = ("head" not in name)  # freeze head, free the rest
model_best.train()
model_best.head.eval()   # Head in eval mode



# Compute the parameter mask used by SparseSGD below.
# NOTE(review): called with do_train=True -- whether calibration itself updates the
# model weights depends on calibrate_mask_centralized; confirm.
print("Calibration for sparse backbone...")
mask = calibrate_mask_centralized(
    model_best, tot_train_loader, device,
    R=C_ROUNDS, final_sparsity=SPARSITY,
    lr=LR, keep=KEEP, do_train=True, max_batches=MAX_B
)

# SparseSGD receives (name, parameter) pairs together with the mask.
optimizer = SparseSGD(
    list(model_best.named_parameters()),
    lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY,
    mask=mask
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EP_BACKBONE)

# The timer starts after calibration, so elapsed_backbone excludes calibration time.
start_time = time.time()
model_best_ft, _, test_losses_backbone, _, test_acc_backbone = train_test_model(
    model_best, criterion, optimizer, scheduler,
    tot_train_loader, val_test_loader=test_loader,
    num_epochs=N_EP_BACKBONE, checkpoint_path=None, verbose=VERBOSE
)
elapsed_backbone = time.time() - start_time
print(f"Finished backbone sparse training in {elapsed_backbone:.2f}s")

# Collect both phases; only metrics are persisted here (model_best_ft is not saved
# in this cell).
results = {
    "head_only": {"test_losses": test_losses_head, "test_accuracies": test_acc_head, "time_sec": elapsed_head},
    "backbone_sparse": {"test_losses": test_losses_backbone, "test_accuracies": test_acc_backbone, "time_sec": elapsed_backbone}
}

with open("results_head_backbone.json", "w") as f:
    json.dump(results, f, indent=2)

In [None]:
import matplotlib.pyplot as plt

# Join the two training phases (head-only first, sparse backbone second) into one
# continuous history per metric.
acc_combined = results['head_only']['test_accuracies'] + results['backbone_sparse']['test_accuracies']
loss_combined = results['head_only']['test_losses'] + results['backbone_sparse']['test_losses']

# 1-based epoch axis shared by both figures
epochs = list(range(1, len(acc_combined)+1))

# One entry per figure: (series, colour, marker, title, y-axis label, legend position)
curve_specs = (
    (acc_combined, 'royalblue', 'o', "Test Accuracy over Epochs", "Test Accuracy", 'lower right'),
    (loss_combined, 'seagreen', 'x', "Test Loss over Epochs", "Test Loss", 'upper right'),
)

for series, line_color, line_marker, fig_title, y_label, legend_loc in curve_specs:
    fig, ax = plt.subplots(figsize=(10,5))
    ax.plot(epochs, series, color=line_color, marker=line_marker, label='Head-only + Sparse Backbone')
    ax.set_title(fig_title, fontsize=16, fontweight='bold')
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend(loc=legend_loc)
    ax.set_xticks(range(0, len(epochs)+1, 5))  # integer ticks only, one every 5 epochs
    fig.tight_layout()
    plt.show()



In [None]:
import json

# === Load the JSON file ===
# Read back the metrics written by the final-model cell and keep only the
# sparse-backbone phase.
with open("results_head_backbone.json", "r") as f:
    results = json.load(f)
results = results['backbone_sparse']

# === Extract key information ===
# `default=None` keeps the summary from raising ValueError when a metric list is
# missing or empty (max()/min() of an empty sequence fails without a default).
best_accuracy = max(results.get("test_accuracies") or [], default=None)
best_loss = min(results.get("test_losses") or [], default=None)
training_time = results.get("time_sec", None)

# === Print results ===
print(f"Best Accuracy: {best_accuracy}")
print(f"Best Loss: {best_loss}")
print(f"Training Time (s): {training_time}")
