### Libraries

In [None]:
from __future__ import division
from __future__ import print_function

import argparse
import copy
import inspect
import os
import pathlib
import pdb
import pickle
import random
import shutil
import sys
import time

sys.path.insert(1, os.path.join(sys.path[0], '../'))

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision as tv
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from scipy.special import softmax
from scipy.stats import binom
from torchvision import datasets, models, transforms
from tqdm import tqdm

### NoisyRC function and PrivQuant algorithm

In [None]:
def NoisyRC(range_bounds, D, sigma):
    """
    Noisy Range Count for float values with Gaussian noise.

    Parameters:
    range_bounds (tuple): A tuple (a, b) representing the closed range [a, b].
    D (array-like): The dataset (list, NumPy array or CPU tensor). It does not
        need to be sorted: every element is compared against both bounds.
    sigma (float): The standard deviation of the Gaussian noise.

    Returns:
    int: The noisy count of elements in [a, b], floored to an integer and
        clamped so it is never negative.
    """
    a, b = range_bounds
    values = np.asarray(D)
    # One vectorized comparison instead of a Python-level loop over every
    # element; this function is called once per binary-search step.
    count = int(np.count_nonzero((values >= a) & (values <= b)))
    noise = np.random.normal(0, sigma)
    noisy_count = count + noise
    return max(0, int(np.floor(noisy_count)))  # Ensure non-negative count

def PrivQuant(D, alpha, rho, seed, lower_bound=0, upper_bound=1, delta=1e-10):
    """
    Differentially private quantile approximation via noisy binary search.

    The search looks for the value whose (noisy) rank reaches
    m = ceil((1 - alpha) * (n + 1)), i.e. the (1 - alpha) conformal quantile.

    Parameters:
    D (array-like): The dataset; values are expected to lie in
        [lower_bound, upper_bound].
    alpha (float): Miscoverage level; the target rank is
        ceil((1 - alpha) * (n + 1)).
    rho (float): The privacy parameter (smaller = more private). The noise
        scale of every range count is sqrt(max_iterations / (2 * rho)).
    seed (int): Seed for NumPy's global RNG, which is the generator that
        NoisyRC draws its Gaussian noise from.
    lower_bound (float): Lower bound of the search space.
    upper_bound (float): Upper bound of the search space.
    delta (float): Small positive value to ensure convergence; it also fixes
        the number of binary-search iterations.

    Returns:
    float: A differentially private approximation of the quantile x_{(m)},
        rounded to two decimals.
    """
    n = len(D)
    max_iterations = int(np.ceil(np.log2((upper_bound - lower_bound) / delta)))
    sigma = np.sqrt(max_iterations / (2 * rho))  # Noise scale for Gaussian mechanism
    m = int(np.ceil((1 - alpha) * (n + 1)))

    left, right = lower_bound, upper_bound
    # The noise is drawn with np.random.normal inside NoisyRC, so the NumPy
    # generator is the one that has to be seeded; seeding the stdlib `random`
    # module left the result independent of `seed`.
    np.random.seed(seed)
    for _ in range(max_iterations):
        mid = (left + right) / 2
        c = NoisyRC((lower_bound, mid), D, sigma)

        if c < m:
            left = mid + delta
        else:
            right = mid

    return np.round((left + right) / 2, 2)

### Helper function for the Lap-Hist method

In [None]:
def dp_quantile_noisy_hist(x, q, epsilon, seed, bins=50, domain=(0.0, 1.0), rng=None):
    """
    Differentially private quantile using a Laplace-noised histogram (ε-DP).

    Args:
        x (array-like): data vector (numeric).
        q (float): desired quantile in (0,1).
        epsilon (float): privacy budget for the entire histogram.
        seed: seed used to create the random generator when `rng` is None;
            ignored when `rng` is supplied.
        bins (int): number of fixed, public bins (must be >= 1).
        domain (tuple): (lo, hi) public bounds for clipping/binning.
        rng: np.random.Generator (optional).

    Returns:
        float: DP quantile estimate (can lie between data points).

    Raises:
        ValueError: if x is empty, q is outside (0,1), epsilon <= 0,
            bins < 1, or the domain does not satisfy lo < hi.

    Privacy & assumptions:
        - Data are clipped to the public domain (lo, hi).
        - Build a fixed-bin histogram, add Lap(1/ε) noise to each bin count.
        - Because each record contributes to exactly one bin, releasing
          the full noisy histogram is ε-DP under add/remove adjacency.
        - Quantile is computed from the noisy cumulative counts only.

    Notes:
        - Works best if a reasonable public domain is known.
        - For stability, negative noisy counts are floored at 0.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        raise ValueError("x must be non-empty.")
    if not (0 < q < 1):
        raise ValueError("q must be in (0,1).")
    if epsilon <= 0:
        raise ValueError("epsilon must be > 0.")
    if bins < 1:
        raise ValueError("bins must be >= 1.")
    if rng is None:
        rng = np.random.default_rng(seed)

    lo, hi = domain
    if not (lo < hi):
        raise ValueError("domain must satisfy lo < hi.")

    # Clip to public domain
    xc = np.clip(x, lo, hi)

    # Fixed public bins
    edges = np.linspace(lo, hi, bins + 1)
    counts, _ = np.histogram(xc, bins=edges)

    # Laplace noise to each bin (scale = 1/ε)
    noise = rng.laplace(loc=0.0, scale=1.0/epsilon, size=bins)
    noisy = np.maximum(counts + noise, 0.0)

    # Cumulative noisy mass
    csum = np.cumsum(noisy)
    if csum[-1] <= 0:
        # Every noisy count was floored to zero (extremely unlikely unless ε
        # and n are tiny). The answer must not touch the raw data here:
        # returning the exact median would release a non-private statistic.
        # Fall back to the q-quantile of a uniform distribution on the domain.
        return float(lo + q * (hi - lo))

    target = q * csum[-1]
    j = np.searchsorted(csum, target)  # first bin reaching the target

    j = int(np.clip(j, 0, bins - 1))
    # Linear interpolation within the bin (simple, uniform-within-bin)
    bin_lo, bin_hi = edges[j], edges[j + 1]
    prev = csum[j - 1] if j > 0 else 0.0
    within = (target - prev) / max(noisy[j], 1e-12)
    within = np.clip(within, 0.0, 1.0)
    return float(bin_lo + within * (bin_hi - bin_lo))

### Helper functions for EXPONQ and model training

In [None]:
# Absolute path of the current working directory; used below to build the
# default '.cache' locations of get_model and get_logits_dataset.
dirname = str(pathlib.Path().absolute())


def get_qtilde(n,alpha,gamma,epsilon,m):
    """Return the adjusted quantile level used by the private quantile routine.

    The level is the sum of a finite-sample coverage term and a term that
    depends on epsilon and on the number of bins m, capped just below 1.
    """
    coverage_term = (n+1)*(1-alpha)/(n*(1-gamma*alpha))
    privacy_term = 2/(epsilon*n)*np.log(m/(gamma*alpha))
    upper_cap = 1-1e-12
    return min(coverage_term + privacy_term, upper_cap)

def generate_scores(n):
    """Draw n synthetic scores uniformly from [0, 1) as a 1-D array."""
    shape = (n,)
    return np.random.uniform(low=0.0, high=1.0, size=shape)

def hist_2_cdf(cumsum, bins, n):
    """Build a step function of t from cumulative bin counts.

    The returned callable yields 1.0 above the second-to-last bin edge,
    0.0 below the second bin edge, and 1 - cumsum[k]/n otherwise, where k is
    the insertion index of t in `bins`.
    """
    def _cdf(t):
        # Saturate outside the interior bins.
        if t > bins[-2]:
            return 1.0
        if t < bins[1]:
            return 0.0
        position = np.searchsorted(bins, t)
        return 1-cumsum[position]/n
    return _cdf

def get_private_quantile(scores, alpha, epsilon, gamma, bins):
    """Sample a private quantile of `scores` from the grid `bins`.

    Scores are snapped onto the bin grid; each bin gets a utility equal to
    the larger of its (rescaled) mass below and mass above, and one bin is
    sampled with probability proportional to exp(-(eps'/2) * utility), where
    eps' = epsilon * min(alpha, 1 - alpha).

    Returns the sampled element of `bins`.
    """
    num_scores = scores.shape[0]
    num_bins = bins.shape[0]
    last_bin = num_bins - 1
    eps_scaled = epsilon*min(alpha, 1-alpha)
    # Adjusted quantile level
    qtilde = get_qtilde(num_scores, alpha, gamma, epsilon, num_bins)

    flat_scores = scores.squeeze()
    # Snap every score onto the grid of bin edges.
    snapped = bins[np.minimum(np.digitize(flat_scores, bins), last_bin)]

    def _clamped_bin_index(right):
        # Bin index of the snapped scores, clipped into [0, last_bin].
        idx = np.digitize(snapped, bins, right=right)
        return np.maximum(np.minimum(idx, last_bin), 0)

    idx_left = _clamped_bin_index(False)
    idx_right = _clamped_bin_index(True)

    lower_mass = np.bincount(idx_left, minlength=num_bins).cumsum()/qtilde
    upper_mass = (num_scores-np.bincount(idx_right, minlength=num_bins).cumsum())/(1-qtilde)
    utility = np.maximum(lower_mass, upper_mass)

    probabilities = softmax(-(eps_scaled/2)*utility)
    # Renormalize so the probabilities sum to one
    probabilities = probabilities/probabilities.sum()
    return np.random.choice(bins, p=probabilities)

# Optimal gamma is a root.
def get_optimal_gamma(scores,n,alpha,m,epsilon):
    """Pick the better of the two quadratic roots for gamma.

    Both roots of a*g^2 + b*g + c = 0 are clipped into (0, 1); a private
    quantile is sampled for each on a grid of m bins, and the (gamma, quantile)
    pair with the smaller quantile is returned (the second root wins ties).
    """
    a = alpha**2
    b = - ( alpha*epsilon*(n+1)*(1-alpha)/2 + 2*alpha )
    c = 1
    root_of_discriminant = np.sqrt(b**2 - 4*a*c)

    def _clip_open_unit(value):
        return min(max(value,1e-12),1-1e-12)

    gamma1 = _clip_open_unit((-b + root_of_discriminant)/(2*a))
    gamma2 = _clip_open_unit((-b - root_of_discriminant)/(2*a))

    bins = np.linspace(0,1,m)

    # Order matters: both calls consume the global NumPy random stream.
    q1 = get_private_quantile(scores, alpha, epsilon, gamma1, bins)
    q2 = get_private_quantile(scores, alpha, epsilon, gamma2, bins)

    if q1 < q2:
        return (gamma1, q1)
    return (gamma2, q2)

def get_optimal_gamma_m(n, alpha, epsilon):
    """Search a log-spaced grid of bin counts for the best (m, gamma) pair.

    Uses n simulated uniform scores; the pair that yields the smallest sampled
    private quantile is returned as (best_m, best_gamma).
    """
    candidates_m = np.logspace(4,6,50).astype(int)
    scores = np.random.rand(n,1)
    best_q, best_m, best_gamma = 1, int(1/alpha), 1
    for m in candidates_m:
        gamma, q = get_optimal_gamma(scores,n,alpha,m,epsilon)
        if not q < best_q:
            continue
        best_q, best_m, best_gamma = q, m, gamma
    return best_m, best_gamma



def get_conformal_scores(scores, labels):
    """Return, for every row, the score assigned to that row's true label.

    Parameters:
    scores: 2-D array/tensor of per-class scores, one row per sample.
    labels: 1-D array/tensor of integer class indices, one per sample.

    Returns:
    torch.Tensor: 1-D CPU tensor whose i-th entry is scores[i, labels[i]].
    """
    scores = torch.as_tensor(scores)
    labels = torch.as_tensor(labels, device=scores.device).long()
    rows = torch.arange(scores.shape[0], device=scores.device)
    # Vectorized row-wise lookup replaces a Python loop that built the result
    # one element at a time. Advanced indexing already returns a copy.
    return scores[rows, labels].detach().cpu()

def get_shat_from_scores_private(scores, alpha, epsilon, gamma, score_bins):
    """Private threshold: thin wrapper around get_private_quantile."""
    return get_private_quantile(scores, alpha, epsilon, gamma, score_bins)


def get_shat_from_scores(scores, alpha):
    """Non-private threshold: the empirical (1 - alpha) quantile of `scores`."""
    level = 1-alpha
    return np.quantile(scores, level)

# def get_model(private=False, feature_extract=True, cache= dirname + '/.cache/'):
#     model_ft = models.resnet18(pretrained=True)
#     set_parameter_requires_grad(model_ft, feature_extract)
#     num_ftrs = model_ft.fc.in_features
#     model_ft.fc = nn.Linear(num_ftrs, 3)

#     data = torch.load('./.cache/nonprivatemodel_best.pth.tar')

#     model_ft.load_state_dict(data)
#     model_ft.cuda()
#     model_ft.eval()

#     return model_ft






def get_model(private, feature_extract=True, cache= dirname + '/.cache/'):
    """Build a 3-class ResNet-18 and load trained weights into it.

    The checkpoint path depends on `private`; both paths are placeholders that
    must be filled in before use. Keys saved with a leading "_module." prefix
    are renamed so they match the plain model. The model is returned in eval
    mode on the GPU when one is available, otherwise on the CPU.
    """
    from collections import OrderedDict

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = models.resnet18(pretrained=True).to(device)
    set_parameter_requires_grad(model, feature_extract)
    # Swap the classification head for a 3-way linear layer.
    model.fc = nn.Linear(model.fc.in_features, 3)

    model_path = ""  #TO DO: Put path to non-private model here
    if private:
        model_path = ""  #TO DO: Put path to private model here

    state_dict = torch.load(model_path, map_location=device)

    # Strip the "_module." prefix from keys that carry it.
    prefix = "_module."
    renamed_state = OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, tensor)
        for key, tensor in state_dict.items()
    )

    model.load_state_dict(renamed_state)
    model.to(device)  # the freshly created head has to be moved as well
    model.eval()
    return model







# Computes logits and targets from a model and loader
def get_logits_targets(model, loader, num_classes=3):
    """Run `model` over `loader` and collect logits and labels.

    Parameters:
    model: network mapping a batch of inputs to (batch, num_classes) logits.
    loader: DataLoader yielding (inputs, targets) batches; its `.dataset`
        length fixes the size of the output.
    num_classes (int): width of the logit vectors (3 classes in XRAY).

    Returns:
    torch.utils.data.TensorDataset: (logits, labels) on the CPU, with rows in
        the order in which the loader produced them.
    """
    num_samples = len(loader.dataset)
    logits = torch.zeros((num_samples, num_classes))
    labels = torch.zeros((num_samples,))

    # Run on whatever device the model lives on instead of assuming CUDA, so
    # this also works with the CPU fallback in get_model.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    i = 0
    print(f'Computing logits for model (only happens once).')
    with torch.no_grad():
        for x, targets in tqdm(loader):
            batch_logits = model(x.to(device)).detach().cpu()
            logits[i:(i+x.shape[0]), :] = batch_logits
            labels[i:(i+x.shape[0])] = targets.cpu()
            i = i + x.shape[0]

    # Construct the dataset
    dataset_logits = torch.utils.data.TensorDataset(logits, labels.long())
    return dataset_logits



def get_dataset_shuffle_split(datasetpath, num_calib, num_val, seed):
    """Load the 'train' and 'val' image folders, pool them, and re-split.

    The pooled dataset is randomly split (after seeding via fix_randomness)
    into a training part and a held-out part of num_calib + num_val samples,
    returned under the keys 'train' and 'val' respectively.
    """
    input_size = 224
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Augmentation + normalization for the 'train' folder.
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    # Deterministic resize/crop + normalization for the 'val' folder.
    val_transform = transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        normalize,
    ])

    print("Initializing Datasets and Dataloaders...")
    fix_randomness(seed)

    image_datasets = {}
    for split, transform in (('train', train_transform), ('val', val_transform)):
        image_datasets[split] = torchvision.datasets.ImageFolder(os.path.join(datasetpath, split), transform)

    pooled = torch.utils.data.ConcatDataset([image_datasets['train'], image_datasets['val']])
    held_out = num_calib + num_val
    train_part, held_out_part = torch.utils.data.random_split(pooled, [len(pooled) - held_out, held_out])
    image_datasets['train'] = train_part
    image_datasets['val'] = held_out_part
    return image_datasets
    

def get_logits_dataset(private, datasetname, datasetpath, num_calib, num_val, seed, cache= dirname + '/.cache/'):
    """Return cached model logits for the dataset, computing them if needed.

    Parameters:
    private (bool): selects the private or non-private model and cache file.
    datasetname (str): sub-directory of `cache` holding the pickle.
    datasetpath (str): root of the image folders ('train' and 'val').
    num_calib, num_val (int): sizes used for the held-out split.
    seed (int): seed for the random split.
    cache (str): cache directory (with trailing slash).

    Returns:
    tuple: (dict with TensorDatasets of logits/labels under 'train' and
        'val', dict with the image datasets under the same keys).

    NOTE(review): the cache file name does not encode num_calib, num_val or
    seed, so a stale cache is reused when those change - confirm intended.
    """
    fname = cache + datasetname + '/' + ('private.pkl' if private else 'nonprivate.pkl')
    batch_size = 256

    # The split is seeded inside get_dataset_shuffle_split, so building it
    # once is enough for both the cached and the uncached path.
    image_datasets = get_dataset_shuffle_split(datasetpath, num_calib, num_val, seed)

    # If the file exists, load and return it.
    if os.path.exists(fname):
        with open(fname, 'rb') as handle:
            return pickle.load(handle), image_datasets

    # Else we will load our model, run it on the dataset, and save/return the output.
    model = get_model(private, True)

    # No shuffling during inference: row i of the logits then corresponds to
    # sample i of the returned image dataset, and the cache is reproducible.
    dataloaders_dict = {
        x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=False, num_workers=4)
        for x in ['train', 'val']
    }

    # Get the logits and targets
    dataset_logits_dict = {x: get_logits_targets(model, dataloaders_dict[x]) for x in ['train', 'val']}

    # Save the dataset
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    with open(fname, 'wb') as handle:
        pickle.dump(dataset_logits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return dataset_logits_dict, image_datasets

def fix_randomness(seed=0):
    """Seed the stdlib, NumPy and PyTorch (CPU and CUDA) random generators."""
    random.seed(seed)
    np.random.seed(seed=seed)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)


def get_metrics_precomputed(est_labels,labels,losses,num_classes):
    """Per-sample loss and prediction-set size for precomputed set indicators.

    A sample incurs losses[k] when its true class k is not included in the
    predicted set `est_labels`; the size is the row sum of `est_labels`.
    """
    true_one_hot = torch.nn.functional.one_hot(labels,num_classes)
    missed = true_one_hot * (1-est_labels)
    per_class_loss = losses.view(1,-1)
    empirical_losses = (per_class_loss * missed).sum(dim=1)
    sizes = est_labels.sum(dim=1)
    return empirical_losses, sizes

### Helper functions for model training

In [None]:
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze every parameter of `model` when feature extracting.

    Does nothing when `feature_extracting` is falsy.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False

def save_checkpoint(state, is_best, filename="checkpoint.tar", private=False):
    """Save `state` under ./.cache/ and optionally copy it as the best model.

    File names are the prefix 'private' or 'nonprivate' directly followed by
    `filename` (no separator); the best copy uses 'model_best.pth.tar'.
    """
    cache_dir = './.cache/'
    os.makedirs(cache_dir, exist_ok=True)

    prefix = cache_dir + ('private' if private else 'nonprivate')
    checkpoint_path = prefix + filename
    torch.save(state, checkpoint_path)

    if not is_best:
        return
    shutil.copyfile(checkpoint_path, prefix + "model_best.pth.tar")

def fix_randomness(seed=0):
    """Seed NumPy, PyTorch (CPU and CUDA) and the stdlib random module.

    This definition replaces the earlier one of the same name when the file is
    run top to bottom, so it keeps the same `seed=0` default to stay call
    compatible with it.
    """
    np.random.seed(seed=seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)


def platt_logits(calib_dataset, max_iters=10, lr=0.01, epsilon=0.01):
    """Fit a single temperature T for the logits by minimizing cross-entropy.

    Parameters:
    calib_dataset: dataset yielding (logits, targets) pairs.
    max_iters (int): maximum number of passes over the dataset.
    lr (float): SGD learning rate.
    epsilon (float): stop once T changes by less than this in one pass.

    Returns:
    torch.nn.Parameter: the fitted temperature (shape (1,)), located on the
        GPU when CUDA is available and on the CPU otherwise.
    """
    # Fall back to the CPU instead of unconditionally calling .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    calib_loader = torch.utils.data.DataLoader(
        calib_dataset, batch_size=1024, shuffle=False,
        pin_memory=(device.type == "cuda"),  # pinning only helps host->GPU copies
    )
    nll_criterion = nn.CrossEntropyLoss().to(device)

    T = nn.Parameter(torch.tensor([1.3], device=device))

    optimizer = optim.SGD([T], lr=lr)
    for _ in range(max_iters):
        T_old = T.item()
        for x, targets in calib_loader:
            optimizer.zero_grad()
            # Only T is optimized, so the logits need no gradient.
            out = x.to(device)/T
            loss = nll_criterion(out, targets.long().to(device))
            loss.backward()
            optimizer.step()
        if abs(T_old - T.item()) < epsilon:
            break
    return T

### Helper function for the conformal prediction experiment

In [None]:
def trial_precomputed(conformal_scores, raw_scores, alpha, epsilon, gamma, score_bins, num_calib, seed, privateconformal):
    """Run one random calibration/validation trial.

    The samples are shuffled, the first `num_calib` become the calibration
    set and the rest the validation set; scores are flipped to 1 - score.
    A non-private threshold is always computed. When `privateconformal` is
    true, three private thresholds are computed as well (exponential-mechanism
    quantile, PrivQuant, Laplace histogram).

    Returns a 12-tuple, ordered as
        coverage (private, PrivQuant, Lap-hist, non-private),
        set sizes (private, PrivQuant, Lap-hist, non-private),
        thresholds (private, PrivQuant, Lap-hist, non-private).
    Private entries are NaN (coverages/thresholds) or empty tensors (sizes)
    when `privateconformal` is false.
    """
    shuffle = torch.randperm(conformal_scores.shape[0])
    conformal_scores = conformal_scores[shuffle]
    raw_scores = raw_scores[shuffle]

    calib_conformal_scores = 1-conformal_scores[0:num_calib]
    val_conformal_scores = 1-conformal_scores[num_calib:]
    val_raw_scores = 1-raw_scores[num_calib:]

    def _coverage(threshold):
        # Fraction of validation points whose true-label score is below the threshold.
        return (val_conformal_scores < threshold).float().mean().item()

    def _set_sizes(threshold):
        # Number of classes below the threshold, per validation point.
        return (val_raw_scores < threshold).sum(dim=1)

    # Non-private baseline is computed in every trial.
    threshold_nonpriv = get_shat_from_scores(calib_conformal_scores, alpha)
    coverage_nonpriv = _coverage(threshold_nonpriv)
    sizes_nonpriv = _set_sizes(threshold_nonpriv)

    if not privateconformal:
        return (
            np.nan, np.nan, np.nan, coverage_nonpriv,
            torch.tensor([]), torch.tensor([]), torch.tensor([]), sizes_nonpriv,
            np.nan, np.nan, np.nan, float(threshold_nonpriv),
        )

    # The three private thresholds, in the order that fixes RNG consumption.
    shat = get_shat_from_scores_private(calib_conformal_scores, alpha, epsilon, gamma, score_bins)

    # privacy budget for PrivQuant
    epsilon_conform = (epsilon**2) / 2
    threshold_PrivQuant = PrivQuant(calib_conformal_scores, alpha, epsilon_conform, seed)

    threshold_Lap_hist = dp_quantile_noisy_hist(calib_conformal_scores, 1 - alpha, epsilon, seed)

    private_thresholds = (shat, threshold_PrivQuant, threshold_Lap_hist)
    coverages = tuple(_coverage(t) for t in private_thresholds)
    set_sizes = tuple(_set_sizes(t) for t in private_thresholds)

    return (
        coverages[0], coverages[1], coverages[2], coverage_nonpriv,
        set_sizes[0], set_sizes[1], set_sizes[2], sizes_nonpriv,
        float(shat), float(threshold_PrivQuant), float(threshold_Lap_hist), float(threshold_nonpriv),
    )

## Training the models

In [None]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False, is_private=False, privacy_engine=None, model_path=None):
    """Train `model`, keep the weights with the best validation accuracy.

    Parameters:
    model: network to train; batches are moved to the device of its parameters.
    dataloaders (dict): DataLoaders under the keys 'train' and 'val'.
    criterion: loss function called as criterion(outputs, labels).
    optimizer: optimizer stepping the trainable parameters.
    num_epochs (int): number of epochs.
    is_inception (bool): if true, the model returns (outputs, aux_outputs) in
        training mode and the auxiliary loss is added with weight 0.4.
    is_private (bool): selects the default save path and clears the gradients
        again right after every optimizer step.
    privacy_engine: accepted for call compatibility; not used here.
    model_path (str, optional): where to save the best weights. When None,
        the placeholder paths below are used and must be edited first.

    Returns:
    tuple: (model with the best weights loaded, list of per-epoch validation
        accuracies).
    """
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Define paths for saving the best models
    if model_path is None:
        if is_private:
            model_path = ".../best_model_private.pth" #TO DO: Put the complete path where you want to save the model best_model_private.pth
        else:
            model_path = ".../best_model_nonprivate.pth" #TO DO: Put the complete path where you want to save the model best_model_nonprivate.pth

    # Use the model's own device rather than a module-level `device` global,
    # which only exists when the training script cell has been executed.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    if is_inception and phase == 'train':
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        if is_private:
                            # Opacus requires zero_grad after step
                            optimizer.zero_grad()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # Save the best model weights
                torch.save(best_model_wts, model_path)

            if phase == 'val':
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # '.4f' (four decimals), matching the per-epoch line above.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history



# Script: train a non-private and a differentially private ResNet-18 head on
# the image-folder dataset.
if __name__ == "__main__":
    # print("PyTorch Version: ", torch.__version__)
    # print("Torchvision Version: ", torchvision.__version__)

    # Top level data directory
    data_dir = ".../covid_chest_xray/data/imagefolder"  #TO DO: Put the directory where you saved the CoronaHack dataset
    EPSILON =1
    DELTA = 1e-5
    MAX_GRAD_NORM = 2  # Maximum gradient norm for clipping
    # Number of classes in the dataset
    num_classes = 3

    # Batch size for training
    batch_size = 8

    # Number of epochs to train for
    num_epochs = 15

    # Flag for feature extracting
    feature_extract = True

    # Models to choose from [resnet, alexnet, vgg, squeezenet, densenet, inception]
    model_ft = models.resnet18(pretrained=True)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, num_classes)

    # Create training and validation dataloaders
    image_datasets = get_dataset_shuffle_split(data_dir, num_calib=1000, num_val=500, seed=0)
    dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=4) for x in ['train', 'val']}

    # Detect if we have a GPU available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Send the model to GPU
    model_ft = model_ft.to(device)

    # Gather the parameters to be optimized/updated
    params_to_update = model_ft.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in model_ft.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in model_ft.named_parameters():
            if param.requires_grad:
                print("\t", name)

    # Setup the loss function
    criterion = nn.CrossEntropyLoss()

    # Non-private model training
    print("Training non-private model...")
    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
    model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs, is_inception=False, is_private=False)

    # Private model training
    print("Training private model...")
    model_ft_private = models.resnet18(pretrained=True)
    set_parameter_requires_grad(model_ft_private, feature_extract)
    model_ft_private.fc = nn.Linear(num_ftrs, num_classes)
    model_ft_private = model_ft_private.to(device)

    # The optimizer for the private model is created in the call below. An
    # extra optimizer built from `params_to_update` used to be created here,
    # but those are the parameters of the NON-private model and the object was
    # overwritten before use, so it has been removed.

    # Add differential privacy using Opacus
    # NOTE(review): `PrivacyEngine` is not imported anywhere in the visible
    # file; it presumably needs `from opacus import PrivacyEngine` - confirm.
    privacy_engine = PrivacyEngine()
    model_ft_private, optimizer_ft_private, dataloaders_dict['train'] = privacy_engine.make_private_with_epsilon(
        module=model_ft_private,
        optimizer=optim.SGD(model_ft_private.parameters(), lr=0.001, momentum=0.9),
        data_loader=dataloaders_dict['train'],
        epochs=num_epochs,
        target_epsilon=EPSILON,
        target_delta=DELTA,
        max_grad_norm=MAX_GRAD_NORM,
    )

    model_ft_private, hist_private = train_model(model_ft_private, dataloaders_dict, criterion, optimizer_ft_private, num_epochs=num_epochs, is_inception=False, is_private=True, privacy_engine=privacy_engine)

### Experiment

In [None]:
def experiment(alpha, epsilon, num_calib, num_val, seed, datasetpath, privatemodel, privateconformal):
    """Run (or load from cache) the repeated conformal-prediction trials.

    Parameters:
    alpha (float): miscoverage level.
    epsilon (float): privacy budget handed to the private quantile methods.
    num_calib, num_val (int): calibration / validation set sizes.
    seed (int): base seed; it is advanced inside the trial loop.
    datasetpath (str): root of the image-folder dataset.
    privatemodel (bool): use the privately trained model's logits.
    privateconformal (bool): also run the private calibration methods.

    Returns:
    pandas.DataFrame: one row per trial with the columns
        "$\\hat{s}$", "$\\hat{q}_$PrivQuant", "threshold_Lap_hist",
        "threshold_nonpriv" (thresholds); "Anas et. al", "PrivQuant",
        "Lap_hist", "NonprivQuant" (coverages); the matching "sizes_*"
        columns; and "$\\alpha$", "$\\epsilon$", "PrivateConformal",
        "PrivateModel".

    NOTE(review): the number of trials is read from the module-level name
    `num_trials`, which must be defined before this function is called.
    """
    mstar, gammastar = get_optimal_gamma_m(num_calib, alpha, epsilon)
    score_bins = np.linspace(0, 1, mstar)
    fname = f'.cache/opt_{alpha}_{epsilon}_{num_calib}_{mstar}bins_pm_{privatemodel}_pc_{privateconformal}_dataframe.pkl'

    # Reuse a previously stored result for this exact configuration.
    try:
        df = pd.read_pickle(fname)
        return df
    except FileNotFoundError:
        pass

    all_data, image_dataset = get_logits_dataset(privatemodel, 'xray', datasetpath, num_calib, num_val, seed=0, cache=dirname + '/.cache/')
    print('Dataset loaded')
    dataset_precomputed = all_data['val']

    # Temperature-scale the logits before turning them into probabilities.
    T = platt_logits(dataset_precomputed)

    logits, labels = dataset_precomputed.tensors
    scores = (logits / T.cpu()).softmax(dim=1)

    with torch.no_grad():
        conformal_scores = get_conformal_scores(scores, labels)
        local_df_list = []
        for i in tqdm(range(num_trials)):
            # The offset accumulates (seed, seed+1, seed+3, ...), so every
            # trial after the first gets a new seed.
            seed = seed + i
            cvg1, cvg2, cvg3, cvg4, szs1, szs2, szs3, szs4, shat, threshold_PrivQuant, threshold_Lap_hist, threshold_nonpriv = trial_precomputed(conformal_scores, scores, alpha, epsilon, gammastar, score_bins, num_calib, seed, privateconformal)
            dict_local = {
                "NonprivQuant": cvg4,
                "sizes_NonprivQuant": [szs4],
                "Anas et. al": cvg1 if privateconformal else np.nan,
                "PrivQuant": cvg2 if privateconformal else np.nan,
                "Lap_hist": cvg3 if privateconformal else np.nan,
                "sizes_Anas et. al": [szs1] if privateconformal and szs1 is not None else [torch.tensor([])],
                "sizes_PrivQuant": [szs2] if privateconformal and szs2 is not None else [torch.tensor([])],
                "sizes_Lap_hist": [szs3] if privateconformal and szs3 is not None else [torch.tensor([])],
                "$\\hat{s}$": shat if privateconformal else np.nan,
                "$\\hat{q}_$PrivQuant": threshold_PrivQuant if privateconformal else np.nan,
                # Stored under its own key: writing the threshold to
                # "Lap_hist" silently replaced the coverage stored above.
                "threshold_Lap_hist": threshold_Lap_hist if privateconformal else np.nan,
                "threshold_nonpriv": threshold_nonpriv,
                "$\\alpha$": alpha,
                "$\\epsilon$": epsilon,
                "PrivateConformal": privateconformal,
                "PrivateModel": privatemodel,
            }
            df_local = pd.DataFrame(dict_local)
            local_df_list.append(df_local)

        # Combine all local DataFrames into one
        df = pd.concat(local_df_list, axis=0, ignore_index=True)

        os.makedirs('.cache', exist_ok=True)
        df.to_pickle(fname)
    return df




# Start from a clean slate: every run deletes the whole '.cache' directory
# (and with it anything previously stored there) before experiments start.
shutil.rmtree('.cache', ignore_errors=True)
if __name__ == "__main__":
    # NOTE(review): `sns` is not imported anywhere in the visible file;
    # presumably `import seaborn as sns` is required - confirm.
    sns.set(palette='pastel', font='serif')
    sns.set_style('white')
    fix_randomness(seed=0)

    datasetpath = '.../covid_chest_xray/data/imagefolder'  #TO DO: Put complete path to dataset here
    # The experiment is run for all four (privateconformal, privatemodel) combinations.
    privateconformals = [False, True]
    privatemodels = [False, True]

    # Experiment configuration. `num_trials` is read as a global by experiment().
    alpha = 0.1
    epsilon = 1
    num_calib = 1000
    num_val = 500
    num_trials = 1000
    seed = 123

    save_path = 'df_list_Corona_Hack.pkl'

    # Reuse stored results when present; otherwise run every setting.
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            df_list = pickle.load(f)
    else:
        df_list = []
        for privateconformal in privateconformals:
            for privatemodel in privatemodels:
                df_list.append(
                    experiment(alpha, epsilon, num_calib, num_val, seed,
                               datasetpath=datasetpath,
                               privatemodel=privatemodel,
                               privateconformal=privateconformal)
                )

        

### Saving the result

In [None]:
# Persist the list of per-setting result DataFrames to `save_path`.
with open(save_path, 'wb') as f:
    pickle.dump(df_list, f)


### Processing the results for plotting

In [None]:
def compute_trial_averages(size_series, trials=1000, eval_points=500):
    """
    Average prediction-set size per trial.

    All entries of `size_series` are flattened into one list of floats, which
    must contain exactly trials * eval_points values.

    Returns: array with one mean per trial, or an empty array (after printing
    a warning) when the number of values does not match.
    """
    flattened = size_series.explode().dropna()
    # Tensors are unwrapped with .item(); everything else is cast directly.
    sizes = [
        float(entry.item()) if isinstance(entry, torch.Tensor) else float(entry)
        for entry in flattened
    ]

    expected_total = trials * eval_points
    if len(sizes) != expected_total:
        print(f"Warning: Expected {trials*eval_points} size points, got {len(sizes)}")
        return np.array([])

    # One row per trial, then average across the evaluation points.
    per_trial = np.array(sizes).reshape(trials, eval_points)
    return per_trial.mean(axis=1)

def safe_to_dataframe(data_dict):
    """Build a DataFrame from arrays of unequal length, padding with NaN.

    An empty (or falsy) mapping yields an empty DataFrame.
    """
    if not data_dict:
        return pd.DataFrame()

    longest = max(len(column) for column in data_dict.values())
    padded = {}
    for name, column in data_dict.items():
        missing = longest - len(column)
        # Append NaNs at the end so that every column has `longest` entries.
        padded[name] = np.pad(column, (0, missing),
                              mode='constant', constant_values=np.nan)
    return pd.DataFrame(padded)

def main():
    """Turn the pickled experiment results into per-setting CSV files.

    Loads 'df_list_Corona_Hack.pkl', groups every DataFrame by its
    (PrivateModel, PrivateConformal) flags, and writes one coverage CSV and one
    average-set-size CSV per setting that has data. Returns None; prints an
    error and returns early when the input cannot be loaded.
    """
    try:
        with open('df_list_Corona_Hack.pkl', 'rb') as f:
            df_list = pickle.load(f)
    except FileNotFoundError:
        print("Error: Input file not found")
        return
    except pickle.PickleError:
        print("Error: Could not unpickle the file")
        return

    # (private model, private conformal) -> human-readable setting name
    setting_names = {
        (False, False): "NonPrivateModel_NonPrivateConformal",
        (False, True): "NonPrivateModel_PrivateConformal",
        (True, False): "PrivateModel_NonPrivateConformal",
        (True, True): "PrivateModel_PrivateConformal"
    }
    methods = ["NonprivQuant", "Anas et. al", "PrivQuant", "Lap_hist"]

    # One bucket per metric and setting; 'avg_size' holds per-trial averages.
    results = {
        metric: {setting: {} for setting in setting_names.values()}
        for metric in ('coverage', 'avg_size')
    }

    for df_idx, df in enumerate(df_list):
        try:
            flags = (df["PrivateModel"].iloc[0], df["PrivateConformal"].iloc[0])
            setting = setting_names[flags]

            # Coverage values, one per trial
            coverage_bucket = results['coverage'][setting]
            for method in methods:
                if method not in df.columns:
                    continue
                coverage_bucket[method] = df[method].dropna().astype(float).values

            # Set sizes, reduced to one average per trial
            size_bucket = results['avg_size'][setting]
            for method in methods:
                size_key = f"sizes_{method}"
                if size_key not in df.columns:
                    continue
                trial_avgs = compute_trial_averages(df[size_key])
                if len(trial_avgs) > 0:
                    size_bucket[method] = trial_avgs
                else:
                    print(f"Warning: No size averages for {method} in {setting} (DF #{df_idx+1})")
        except Exception as e:
            print(f"Error processing dataframe #{df_idx+1}: {str(e)}")
            continue

    # Coverage files
    for setting in setting_names.values():
        coverage_bucket = results['coverage'][setting]
        if not coverage_bucket:
            continue
        df = safe_to_dataframe(coverage_bucket)
        if not df.empty:
            df.to_csv(f'coverage_{setting}.csv', index=False)

    # Average-size files (one per setting)
    for setting in setting_names.values():
        size_bucket = results['avg_size'][setting]
        if not size_bucket:
            continue
        df = safe_to_dataframe(size_bucket)
        if df.empty:
            print(f"No average size data for {setting}")
        else:
            df.to_csv(f'avg_size_{setting}.csv', index=False)

    print("Processing complete. Files saved:")
    print("- coverage_[setting].csv")
    print("- avg_size_[setting].csv")

# Run the result post-processing only when executed as a script.
if __name__ == "__main__":
    main()