Copyright (c) 2025 Qualcomm Technologies, Inc.
All Rights Reserved.

# Notebook for collecting and analyzing results

The scripts `run_uci.py` and `run_pets.py` store their results in `.npz` files. This notebook demonstrates how to parse the results stored in those files and check whether the observed coverage is statistically valid.

In [1]:
import os
from typing import List, Optional

import numpy as np
import scipy as sp

## Functions used to collect saved results 

In [2]:
def get_ds_n_seeds(name: str) -> int:
    """Return the number of random seeds used for the dataset ``name``.

    The smallest datasets (those listed below) are run with 10 random seeds;
    every other dataset is run with 20.
    """
    ten_seed_datasets = ("boston", "concrete", "energy", "wine", "yacht")
    if name in ten_seed_datasets:
        return 10
    return 20


def get_ds_hyperparam_type(name: str) -> str:
    """Return the hyperparameter-optimization strategy used for dataset ``name``.

    The result is ``"marglik"`` for the datasets listed below and ``"cv"``
    for any other name.
    """
    marglik_datasets = (
        "boston",
        "concrete",
        "energy",
        "kin8nm",
        "power",
        "wine",
        "yacht",
    )
    if name in marglik_datasets:
        return "marglik"
    return "cv"


def get_results_helper(
    dataset_name: str,
    results_path: str,
    split: bool = False,
    split_calib_size: Optional[float] = None,
    hyperparam_type: Optional[str] = None,
    seeds: Optional[List[int]] = None,
) -> dict:
    """Load the result files of a UCI experiment and parse them to a python dictionary.

    One ``.npz`` file is read per seed from ``results_path/dataset_name``. The
    file name is ``{dataset_name}{ext}{tag}_{seed}.npz`` where ``tag`` is

    * empty when ``split_calib_size`` is None (and ``split`` is false),
    * ``_full{pct}`` when ``split_calib_size`` is given and ``split`` is false,
    * ``_split{pct}`` when ``split`` is true,

    with ``pct = int(split_calib_size * 100)``.

    Args:
        dataset_name: Name of the dataset; also the sub-directory of ``results_path``.
        results_path: Root directory holding one sub-directory per dataset.
        split: Whether to load the ``_split`` result files.
        split_calib_size: Calibration fraction encoded in the file name.
            Required when ``split`` is true.
        hyperparam_type: Hyperparameter-optimization tag used in the file name.
            Defaults to ``get_ds_hyperparam_type(dataset_name)``. Both the bare
            tag (``"cv"``) and the underscore-prefixed form (``"_cv"``) are accepted.
        seeds: Seeds to load. Defaults to ``1..get_ds_n_seeds(dataset_name)``.

    Returns:
        A dictionary with
        ``cp_intervals``, ``interval_widths_avg`` and ``coverage`` (the seed axis
        is the last axis; the latter two are averaged over their last stored axis),
        ``test_mse``, ``test_loglik`` and ``test_loglik_bayes`` (one mean per seed),
        ``methods`` (list of names, suffixed with ``_{pct}`` when ``split`` is true),
        ``sig_lvls``, ``dataset`` and ``split_calib_size``.

    Raises:
        ValueError: If ``split`` is true but ``split_calib_size`` is None, or if
            ``seeds`` is empty.
        FileNotFoundError: If the result file of any seed does not exist.
    """
    if split and split_calib_size is None:
        # Previously this surfaced as an opaque TypeError from ``None * 100``.
        raise ValueError("unrecognised args")

    # Get file extension according to hyperparameter optimization strategy.
    if hyperparam_type is None:
        ext = f"_{get_ds_hyperparam_type(dataset_name)}"
    elif hyperparam_type == "" or hyperparam_type.startswith("_"):
        # Already in file-name form (or explicitly empty): use as is.
        ext = hyperparam_type
    else:
        # Bare tag such as "cv": add the separator the default branch adds.
        ext = f"_{hyperparam_type}"

    # If seeds not provided, define them as in the paper.
    if seeds is None:
        n_seeds = get_ds_n_seeds(dataset_name)
        seeds = np.arange(n_seeds) + 1

    # Part of the file name that encodes the CP variant and calibration size.
    # NOTE(review): ``int(x * 100)`` truncates (e.g. 0.29 -> 28); it is kept
    # because the file names presumably come from the same formula -- confirm
    # against the scripts that write the files.
    if split_calib_size is None:
        tag = ""
    else:
        variant = "split" if split else "full"
        tag = f"_{variant}{int(split_calib_size*100)}"

    # Collect results.
    per_seed = []
    methods = None
    sig_lvls = None
    for seed in seeds:
        fn = os.path.join(results_path, dataset_name, f"{dataset_name}{ext}{tag}_{seed}.npz")
        try:
            archive = np.load(fn)
        except FileNotFoundError as err:
            # Only a genuinely missing file is reported as such; any other
            # load error propagates with its real type.
            raise FileNotFoundError(f"Missing file {fn}") from err
        # The context manager closes the underlying zip file once the arrays are read.
        with archive as res_per_rep:
            # Add results for that seed to the list
            per_seed.append(
                {
                    "cp_intervals": res_per_rep["cp_intervals"],
                    "interval_widths_avg": res_per_rep["interval_widths_avg"].mean(axis=-1),
                    "coverage": res_per_rep["coverage"].mean(axis=-1),
                    "test_mse": res_per_rep["test_mse"].mean(),
                    "test_loglik": res_per_rep["test_loglik"].mean(),
                    "test_loglik_bayes": res_per_rep["test_loglik_bayes"].mean(),
                }
            )
            # Metadata is taken from the last seed loaded, as before.
            methods = res_per_rep["methods"]
            sig_lvls = res_per_rep["sig_lvls"]

    if len(per_seed) == 0:
        raise ValueError("no seeds provided: nothing to load")

    # Create dictionary from list of results (stacking puts the seed axis first).
    res = {k: np.array([dd[k] for dd in per_seed]) for k in per_seed[0]}
    # Move the seed axis last.
    # NOTE: cannot take mean of cp_intervals here as different individual test points
    for key in ("cp_intervals", "interval_widths_avg", "coverage"):
        res[key] = np.moveaxis(res[key], 0, -1)
    if split:
        res["methods"] = [method + f"_{int(split_calib_size*100)}" for method in methods]
    else:
        res["methods"] = methods.tolist()
    res["sig_lvls"] = sig_lvls
    res["dataset"] = dataset_name
    res["split_calib_size"] = split_calib_size
    return res


def get_results_helper_pets(
    results_path: str,
    cp_type: str,
    split_calib_size: float = 0.25,
    seeds: Optional[List[int]] = None,
) -> dict:
    """Load the result files of an Oxford Pets experiment and parse them to a
    python dictionary.

    One ``.npz`` file is read per seed from ``results_path``. The file name is
    ``pet_{seed}.npz`` for ``cp_type="full"``, ``pet_full{pct}_{seed}.npz`` for
    ``"full_refine"`` and ``pet_split{pct}_{seed}.npz`` for ``"split"``, with
    ``pct = int(split_calib_size * 100)``. Seeds whose file is missing are
    skipped (the missing path is printed).

    Args:
        results_path: Directory holding the result files.
        cp_type: One of ``"full"``, ``"full_refine"`` or ``"split"``.
        split_calib_size: Calibration fraction encoded in the file name (not
            used for the file name when ``cp_type="full"``).
        seeds: Seeds to load. Defaults to ``1..20``.

    Returns:
        A dictionary with ``volumes`` and ``coverage`` (seed axis last),
        ``interval_widths_avg`` (same array as ``volumes``), ``test_loc_error``
        and ``test_acc`` (one value per loaded seed), ``methods``, ``sig_lvls``,
        ``dataset`` (always ``"pets"``) and ``split_calib_size``.

    Raises:
        ValueError: If ``cp_type`` is not recognised.
        FileNotFoundError: If no result file could be loaded for any seed.
    """
    if seeds is None:
        seeds = np.arange(20) + 1

    per_seed = []
    methods = None
    sig_lvls = None
    for seed in seeds:
        if cp_type == "full":
            fn = os.path.join(results_path, f"pet_{seed}.npz")
        elif cp_type == "full_refine":
            fn = os.path.join(results_path, f"pet_full{int(split_calib_size*100)}_{seed}.npz")
        elif cp_type == "split":
            fn = os.path.join(results_path, f"pet_split{int(split_calib_size*100)}_{seed}.npz")
        else:
            raise ValueError("unrecognised args")

        try:
            archive = np.load(fn)
        except FileNotFoundError:
            # Best effort: report the missing file and carry on with the other seeds.
            print(fn)
            continue
        # The context manager closes the underlying zip file once the arrays are read.
        with archive as res_per_rep:
            per_seed.append(
                {
                    "volumes": res_per_rep["volumes"],
                    "coverage": res_per_rep["coverage"],
                    "test_loc_error": res_per_rep["test_loc_error"].item(),
                    "test_acc": res_per_rep["test_acc"].item(),
                }
            )
            # Metadata is taken from the last file that loaded successfully.
            methods = res_per_rep["methods"]
            sig_lvls = res_per_rep["sig_lvls"]

    if len(per_seed) == 0:
        # Previously this failed with an IndexError on ``res[0]``.
        raise FileNotFoundError(
            f"No result file found in {results_path} for cp_type={cp_type!r}"
        )

    # Stack per-seed values (seed axis first), then move the seed axis last.
    res = {k: np.array([dd[k] for dd in per_seed]) for k in per_seed[0]}
    res["volumes"] = np.moveaxis(res["volumes"], 0, -1)
    # Alias so that print_results can read the sizes under its expected key.
    res["interval_widths_avg"] = res["volumes"]
    res["coverage"] = np.moveaxis(res["coverage"], 0, -1)

    res["sig_lvls"] = sig_lvls

    # NOTE(review): the suffix is added for "full_refine" as well as "split",
    # whereas get_results_helper only adds it for split results. cov_check
    # matches on exact method names, so confirm this difference is intended.
    if cp_type in ["split", "full_refine"]:
        res["methods"] = [method + f"_{int(split_calib_size*100)}" for method in methods]
    else:
        res["methods"] = methods.tolist()
    res["dataset"] = "pets"
    res["split_calib_size"] = split_calib_size
    return res

## Function used to evaluate valid coverage

In [3]:
# Train set sizes (in full methods) used to evaluate exact marginal coverage distribution.
# Keys are dataset names; cov_check looks a dataset up here and, for methods
# other than the full ones it lists, scales the value by split_calib_size.
# NOTE: despite the `get_` prefix this is a plain dict, indexed with [...], not a function.
get_ds_full_n = {
    "yacht": 277,
    "boston": 455,
    "energy": 691,
    "bike": 9797,
    "protein": 41157,
    "facebook_2": 73179,
    "concrete": 927,
    "wine": 1439,
    "kin8nm": 7372,
    "power": 8611,
    "community": 1794,
    "facebook_1": 36853,
    "pets": 7349,
}


def cov_check(
    dataset_name, sig_lvl: float, method: str, emp_cov: float, split_calib_size: float
) -> bool:
    """Check if the observed coverage is statistically valid given the dataset size.

    The empirical coverage is accepted when it lies between the 1% and 99%
    quantiles of a Beta(n_calib + 1 - l, l) distribution, where
    ``l = floor((n_calib + 1) * sig_lvl)``.

    Args:
        dataset_name: Key into ``get_ds_full_n``.
        sig_lvl: Significance level (1 - confidence level).
        method: Method name. For "bayes", "crr_studentized", "crr_standard" and
            "crr_deleted" the full size from ``get_ds_full_n`` is used as
            ``n_calib``; for any other name it is
            ``int(1 + full_size * split_calib_size)``.
        emp_cov: Observed coverage to test.
        split_calib_size: Calibration fraction; only read for methods that are
            not in the list above.

    Returns:
        True if ``emp_cov`` lies inside the accepted range, False otherwise.
    """
    if method in ["bayes", "crr_studentized", "crr_standard", "crr_deleted"]:
        n_calib = get_ds_full_n[dataset_name]
    else:
        n_calib = int(1 + get_ds_full_n[dataset_name] * split_calib_size)
    n_excluded = int(np.floor((n_calib + 1) * sig_lvl))
    beta_a = n_calib + 1 - n_excluded
    beta_b = n_excluded
    # A Beta distribution needs both shape parameters > 0; with a zero
    # parameter scipy's ppf returns NaN and every comparison would silently be
    # False. Treat the two degenerate cases as point masses instead.
    if beta_b <= 0:
        # sig_lvl < 1 / (n_calib + 1): all mass at coverage 1.
        return bool(np.isclose(emp_cov, 1.0))
    if beta_a <= 0:
        # sig_lvl >= 1: all mass at coverage 0.
        return bool(np.isclose(emp_cov, 0.0))
    cov_lower = sp.stats.beta.ppf(0.01, a=beta_a, b=beta_b)
    cov_upper = sp.stats.beta.ppf(0.99, a=beta_a, b=beta_b)
    return bool(cov_lower <= emp_cov <= cov_upper)

## Print function that computes mean and standard deviation and checks for valid coverage

In [4]:
def print_results(res: dict):
    """Print a per-significance-level table from a get_results_helper dictionary.

    For every significance level and method, one row shows the mean and
    standard deviation over seeds of the coverage and of the interval size,
    with a check mark or cross for the outcome of ``cov_check`` on the mean coverage.
    """
    # Coverage and width results have shape (num_methods, num_sig_lvls, num_seeds)
    for sig_lvl_idx, sig_lvl in enumerate(res["sig_lvls"]):
        print(f"Confidence level of {1-sig_lvl} \tAvg. Coverage \t\tAvg. Size")
        for method_idx, method in enumerate(res["methods"]):
            cov_over_seeds = res["coverage"][method_idx, sig_lvl_idx, :]
            mu_cov, sigma_cov = cov_over_seeds.mean(), cov_over_seeds.std()

            size_over_seeds = res["interval_widths_avg"][method_idx, sig_lvl_idx, :]
            mu_size = np.mean(size_over_seeds)
            # The masked-array std can come back as the masked constant
            # (presumably when sizes are infinite); it is then reported as 0.0.
            masked_std = np.ma.std(size_over_seeds)
            sigma_size = (
                0.0 if isinstance(masked_std, np.ma.core.MaskedConstant) else masked_std
            )

            # Check if we get valid coverage
            valid_cov = cov_check(res["dataset"], sig_lvl, method, mu_cov, res["split_calib_size"])
            if valid_cov:
                checkmark = "\u2713"
            else:
                checkmark = "\u2717"
            print(
                f"\t{method:<20} \t{mu_cov:>.4f} +/- {sigma_cov:>.4f} {checkmark} \t{mu_size:>.4f} +/- {sigma_size:>.4f}"
            )

## Example UCI

In [5]:
# Settings
# SPLIT_CALIB_SIZE is encoded in the result file names as int(SPLIT_CALIB_SIZE * 100)
# and is also used by cov_check to scale the calibration set size.
SPLIT_CALIB_SIZE = 0.5
RES_PATH = "../results/uci/"  # Change the path as needed
# DS_NAME is both the sub-directory of RES_PATH and the key looked up in get_ds_full_n.
DS_NAME = "boston"

In [6]:
# Get results for Split CP methods
# (reads the "<dataset>_<hyperparam>_split<pct>_<seed>.npz" files under RES_PATH/DS_NAME)
res_split = get_results_helper(DS_NAME, RES_PATH, split_calib_size=SPLIT_CALIB_SIZE, split=True)
print_results(res_split)

Confidence level of 0.9 	Avg. Coverage 		Avg. Size
	scp_standard_50      	0.8956 +/- 0.0134 ✓ 	10.6348 +/- 0.3888
	scp_norm_var_50      	0.8974 +/- 0.0136 ✓ 	12.6299 +/- 1.1968
	scp_norm_std_50      	0.8952 +/- 0.0157 ✓ 	10.3008 +/- 0.2828
	scp_cqr_50           	0.9010 +/- 0.0106 ✓ 	11.6923 +/- 0.4083
	scp_crf_50           	0.8992 +/- 0.0098 ✓ 	43.4664 +/- 94.6179
Confidence level of 0.95 	Avg. Coverage 		Avg. Size
	scp_standard_50      	0.9464 +/- 0.0103 ✓ 	14.5094 +/- 0.5408
	scp_norm_var_50      	0.9442 +/- 0.0117 ✓ 	16.2063 +/- 1.5696
	scp_norm_std_50      	0.9482 +/- 0.0102 ✓ 	13.4183 +/- 0.4780
	scp_cqr_50           	0.9512 +/- 0.0077 ✓ 	15.1151 +/- 0.6727
	scp_crf_50           	0.9470 +/- 0.0066 ✓ 	56.1956 +/- 120.3924
Confidence level of 0.99 	Avg. Coverage 		Avg. Size
	scp_standard_50      	0.9911 +/- 0.0043 ✓ 	36.2722 +/- 5.8395
	scp_norm_var_50      	0.9901 +/- 0.0035 ✓ 	29.1162 +/- 2.5530
	scp_norm_std_50      	0.9905 +/- 0.0039 ✓ 	24.7139 +/- 2.7368
	scp_cqr_50           	

In [7]:
# Get results for Full CP methods
# (no split_calib_size: reads the "<dataset>_<hyperparam>_<seed>.npz" files under RES_PATH/DS_NAME)
res_full = get_results_helper(DS_NAME, RES_PATH, split=False)
print_results(res_full)

Confidence level of 0.9 	Avg. Coverage 		Avg. Size
	bayes                	0.9092 +/- 0.0089 ✓ 	9.3882 +/- 0.2465
	crr_standard         	0.9032 +/- 0.0082 ✓ 	10.5581 +/- 0.6230
	crr_studentized      	0.9045 +/- 0.0107 ✓ 	9.1491 +/- 0.2512
	crr_deleted          	0.9051 +/- 0.0097 ✓ 	9.4303 +/- 0.1896
Confidence level of 0.95 	Avg. Coverage 		Avg. Size
	bayes                	0.9418 +/- 0.0054 ✓ 	11.1867 +/- 0.2938
	crr_standard         	0.9531 +/- 0.0039 ✓ 	14.0365 +/- 0.8418
	crr_studentized      	0.9545 +/- 0.0063 ✓ 	12.2180 +/- 0.3524
	crr_deleted          	0.9557 +/- 0.0048 ✓ 	12.9851 +/- 0.2629
Confidence level of 0.99 	Avg. Coverage 		Avg. Size
	bayes                	0.9766 +/- 0.0029 ✗ 	14.7018 +/- 0.3861
	crr_standard         	0.9905 +/- 0.0021 ✓ 	inf +/- 0.0000
	crr_studentized      	0.9917 +/- 0.0015 ✓ 	20.6007 +/- 0.5013
	crr_deleted          	0.9925 +/- 0.0012 ✓ 	30.6101 +/- 1.7633


In [8]:
# Get results for Full + Refine CP methods
# (split_calib_size given with split=False: reads the
# "<dataset>_<hyperparam>_full<pct>_<seed>.npz" files under RES_PATH/DS_NAME)
res_full_refine = get_results_helper(
    DS_NAME, RES_PATH, split_calib_size=SPLIT_CALIB_SIZE, split=False
)
print_results(res_full_refine)

Confidence level of 0.9 	Avg. Coverage 		Avg. Size
	bayes                	0.9972 +/- 0.0010 ✗ 	33.5530 +/- 1.3632
	crr_standard         	0.9020 +/- 0.0118 ✓ 	19.0263 +/- 1.0602
	crr_studentized      	0.9012 +/- 0.0081 ✓ 	13.1022 +/- 0.2277
	crr_deleted          	0.8986 +/- 0.0113 ✓ 	13.4757 +/- 0.3004
Confidence level of 0.95 	Avg. Coverage 		Avg. Size
	bayes                	0.9978 +/- 0.0011 ✗ 	39.9809 +/- 1.6243
	crr_standard         	0.9539 +/- 0.0087 ✓ 	24.6442 +/- 1.6857
	crr_studentized      	0.9541 +/- 0.0063 ✓ 	16.7292 +/- 0.4239
	crr_deleted          	0.9518 +/- 0.0099 ✓ 	18.1909 +/- 0.8457
Confidence level of 0.99 	Avg. Coverage 		Avg. Size
	bayes                	0.9990 +/- 0.0013 ✗ 	52.5438 +/- 2.1347
	crr_standard         	0.9935 +/- 0.0027 ✓ 	inf +/- 0.0000
	crr_studentized      	0.9927 +/- 0.0032 ✓ 	27.5603 +/- 1.4054
	crr_deleted          	0.9899 +/- 0.0046 ✓ 	inf +/- 0.0000


## Example Pets

In [9]:
# Settings
# NOTE: this cell rebinds SPLIT_CALIB_SIZE and RES_PATH from the UCI example.
# Re-running the UCI cells after this one, without first re-running the UCI
# settings cell, would make them look for UCI files under the Pets path.
SPLIT_CALIB_SIZE = 0.5
BACKBONE = "vgg19"
RES_PATH = f"../results/pet_{BACKBONE}/"  # Change the path as needed

In [10]:
# Get results for Full CP methods on Oxford Pets (reads "pet_<seed>.npz" under RES_PATH).
# NOTE: rebinds res_full, which held the UCI results in the example above.
res_full = get_results_helper_pets(RES_PATH, cp_type="full")
print_results(res_full)

Confidence level of 0.85 	Avg. Coverage 		Avg. Size
	bayes                	0.9211 +/- 0.0080 ✗ 	0.0071 +/- 0.0004
	crr_standard         	0.9087 +/- 0.0102 ✗ 	0.0040 +/- 0.0003
	crr_studentized      	0.9097 +/- 0.0088 ✗ 	0.0038 +/- 0.0002
	crr_deleted          	0.9097 +/- 0.0097 ✗ 	0.0041 +/- 0.0002
Confidence level of 0.9 	Avg. Coverage 		Avg. Size
	bayes                	0.9449 +/- 0.0082 ✗ 	0.0094 +/- 0.0006
	crr_standard         	0.9411 +/- 0.0080 ✗ 	0.0061 +/- 0.0004
	crr_studentized      	0.9409 +/- 0.0091 ✗ 	0.0061 +/- 0.0003
	crr_deleted          	0.9420 +/- 0.0097 ✗ 	0.0069 +/- 0.0003
Confidence level of 0.95 	Avg. Coverage 		Avg. Size
	bayes                	0.9665 +/- 0.0070 ✗ 	0.0141 +/- 0.0008
	crr_standard         	0.9740 +/- 0.0057 ✗ 	0.0115 +/- 0.0008
	crr_studentized      	0.9741 +/- 0.0072 ✗ 	0.0122 +/- 0.0008
	crr_deleted          	0.9728 +/- 0.0076 ✗ 	0.0153 +/- 0.0009


In [11]:
# Get results for Full + Refine CP methods on Oxford Pets
# (reads "pet_full<pct>_<seed>.npz" under RES_PATH).
# NOTE: rebinds res_full_refine, which held the UCI results in the example above.
res_full_refine = get_results_helper_pets(
    RES_PATH, cp_type="full_refine", split_calib_size=SPLIT_CALIB_SIZE
)
print_results(res_full_refine)

Confidence level of 0.85 	Avg. Coverage 		Avg. Size
	bayes_50             	0.9912 +/- 0.0033 ✗ 	0.0348 +/- 0.0024
	crr_standard_50      	0.8736 +/- 0.0160 ✗ 	0.0040 +/- 0.0003
	crr_studentized_50   	0.8778 +/- 0.0166 ✗ 	0.0036 +/- 0.0003
	crr_deleted_50       	0.8809 +/- 0.0167 ✗ 	0.0040 +/- 0.0003
Confidence level of 0.9 	Avg. Coverage 		Avg. Size
	bayes_50             	0.9936 +/- 0.0033 ✗ 	0.0463 +/- 0.0032
	crr_standard_50      	0.9144 +/- 0.0143 ✗ 	0.0060 +/- 0.0006
	crr_studentized_50   	0.9175 +/- 0.0138 ✗ 	0.0056 +/- 0.0005
	crr_deleted_50       	0.9190 +/- 0.0131 ✗ 	0.0064 +/- 0.0006
Confidence level of 0.95 	Avg. Coverage 		Avg. Size
	bayes_50             	0.9959 +/- 0.0024 ✗ 	0.0688 +/- 0.0048
	crr_standard_50      	0.9556 +/- 0.0083 ✓ 	0.0110 +/- 0.0011
	crr_studentized_50   	0.9562 +/- 0.0075 ✓ 	0.0110 +/- 0.0013
	crr_deleted_50       	0.9606 +/- 0.0076 ✗ 	0.0144 +/- 0.0021


In [12]:
# Get results for Split CP methods on Oxford Pets
# (reads "pet_split<pct>_<seed>.npz" under RES_PATH).
# NOTE: rebinds res_split, which held the UCI results in the example above.
res_split = get_results_helper_pets(RES_PATH, cp_type="split", split_calib_size=SPLIT_CALIB_SIZE)
print_results(res_split)

Confidence level of 0.85 	Avg. Coverage 		Avg. Size
	scp_standard_50      	0.8736 +/- 0.0121 ✗ 	0.0118 +/- 0.0010
	scp_norm_var_50      	0.8628 +/- 0.0123 ✓ 	0.0151 +/- 0.0025
	scp_norm_std_50      	0.8684 +/- 0.0130 ✗ 	0.0115 +/- 0.0011
	scp_cqr_50           	0.8746 +/- 0.0137 ✗ 	0.0163 +/- 0.0012
	scp_crf_50           	0.8748 +/- 0.0152 ✗ 	0.0125 +/- 0.0013
Confidence level of 0.9 	Avg. Coverage 		Avg. Size
	scp_standard_50      	0.9187 +/- 0.0120 ✗ 	0.0172 +/- 0.0016
	scp_norm_var_50      	0.9089 +/- 0.0121 ✓ 	0.0210 +/- 0.0034
	scp_norm_std_50      	0.9135 +/- 0.0137 ✗ 	0.0162 +/- 0.0015
	scp_cqr_50           	0.9142 +/- 0.0124 ✗ 	0.0274 +/- 0.0023
	scp_crf_50           	0.9196 +/- 0.0111 ✗ 	0.0190 +/- 0.0020
Confidence level of 0.95 	Avg. Coverage 		Avg. Size
	scp_standard_50      	0.9591 +/- 0.0087 ✗ 	0.0314 +/- 0.0036
	scp_norm_var_50      	0.9554 +/- 0.0107 ✓ 	0.0334 +/- 0.0054
	scp_norm_std_50      	0.9556 +/- 0.0092 ✓ 	0.0264 +/- 0.0024
	scp_cqr_50           	0.9597 +/- 0.006