# Benchmark NICE as calibration method

In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
import os
import sys
import time
import importlib
import collections
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.isotonic import IsotonicRegression
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input

pd.set_option('colheader_justify', 'center')

%aimport utils
%aimport utils.ops
%aimport utils.metrics
%aimport utils.visualization
%aimport utils.data
%aimport flows.nice
%aimport flows.normalizing_flows
%aimport calibrators
from utils.ops import onehot_encode, optim_temperature, detection_log_likelihood_ratios
from utils.metrics import neg_log_likelihood, accuracy, expected_calibration_error
from utils.visualization import plot_pdf_simplex, plot_prob_simplex, reliability_plot, ECE_plot
from utils.data import get_cifar10
from flows.nice import NiceFlow
from calibrators import DummyCalibrator, PAVCalibrator, TempScalingCalibrator, MLRCalibrator
from calibrators import NiceCalibrator, PlanarFlowCalibrator, RadialFlowCalibrator, RealNvpCalibrator

In [4]:
def highlight_max(s):
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

In [5]:
def highlight_min(s):
    is_min = s == s.min()
    return ['font-weight: bold' if v else '' for v in is_min]

## CIFAR-100

In [6]:
models = [
    'wide-resnet-28x10',
    'densenet-121',
    'densenet-169',
    'resnet-101',
    'vgg-19',
    'preactresnet-18',
    'preactresnet-164',
    'resnext-29_8x16',
    'wide-resnet-40x10',
]

In [7]:
def score(calibrator, logits, target):
    probs = calibrator.predict(logits)
    nll = neg_log_likelihood(probs, target)
    ece = expected_calibration_error(probs, target, bins=15)
    acc = accuracy(probs, target)
    
    metrics = {
        'NLL': nll,
        'ECE': ece,
        'Accuracy': acc,
    }
    
    return metrics

In [8]:
def train_calibrator(Calibrator, logits, target):
    cal = Calibrator(logits, target)
    return cal

In [9]:
def train_and_evaluate_calibrators(logits, target, test_logits, test_target, calibrators, **kwargs):
    ## Train NICE on test set to obtain NLLmin
    nice_ref_cal = NiceCalibrator(test_logits, test_target, **kwargs['nice_ref_args'])
    ref_results = score(nice_ref_cal, test_logits, test_target)

    results = collections.OrderedDict()
    for cal, Calibrator in calibrators.items():
        t0 = time.time()
        model = train_calibrator(Calibrator, logits, target)
        t1 = time.time() - t0
        print("Calibrator {} fitted in {:.2f}s".format(cal, t1))
        results[cal] = {'Training time': t1,
                        'Validation': score(model, logits, target),
                        'Test': score(model, test_logits, test_target)}
    
    return results, ref_results

In [10]:
results = collections.OrderedDict()
ref_results = collections.OrderedDict()

calibrators = {
    'Uncalibrated': DummyCalibrator,
    'Temp-Scaling': TempScalingCalibrator,
    'MLR': MLRCalibrator,
}

nice_args = [
    {
        'layers': 2,
        'hidden_size': [1],
        'epochs': 500,
    }, {
        'layers': 2,
        'hidden_size': [5],
        'epochs': 500,
    }, {
        'layers': 4,
        'hidden_size': [100, 100],
        'epochs': 1000,
    },
]

nvp_args = [
    {
        'layers': 2,
        'hidden_size': [1],
        'epochs': 500,
    }, {
        'layers': 2,
        'hidden_size': [5],
        'epochs': 500,
    }, {
        'layers': 4,
        'hidden_size': [100, 100],
        'epochs': 1000,
    },
]

planar_args = [
    {
        'layers': 5,
        'epochs': 2000,
    },{
        'layers': 10,
        'epochs': 2000,
    },{
        'layers': 20,
        'epochs': 2000,
    }
]

radial_args = [
    {
        'layers': 5,
        'epochs': 2000,
    },{
        'layers': 10,
        'epochs': 2000,
    },{
        'layers': 20,
        'epochs': 2000,
    }
]
    

for nice in nice_args:
    name = 'NICE_l{}_hs{}'.format(nice['layers'], nice['hidden_size'])
    calibrators[name] = lambda logits, target: NiceCalibrator(logits, target, **nice)
    
for nvp in nvp_args:
    name = 'RealNVP_l{}_hs{}'.format(nvp['layers'], nvp['hidden_size'])
    calibrators[name] = lambda logits, target: RealNvpCalibrator(logits, target, **nvp)

for planar in planar_args:
    name = 'Planar_l{}'.format(planar['layers'])
    calibrators[name] = lambda logits, target: PlanarFlowCalibrator(logits, target, **planar)
    
for radial in radial_args:
    name = 'Radial_l{}'.format(radial['layers'])
    calibrators[name] = lambda logits, target: RadialFlowCalibrator(logits, target, **radial)
    

nice_ref_args = {
    'layers': 4,
    'hidden_size': [100, 100],
    'epochs': 1000,
}

kwargs = {
    'nice_ref_args': nice_ref_args,
}

for model in models:
    print("Calibrating model: {}".format(model))
    data_path = os.path.join('../data', model+'_cifar100')
    prefix = os.path.join(data_path, 'cifar100_'+model)
    
    logits = np.load(prefix + '_logit_prediction_valid.npy')
    test_logits = np.load(prefix + '_logit_prediction_test.npy')
    
    target = np.load(prefix + '_true_valid.npy')
    test_target = np.load(prefix + '_true_test.npy')

    results[model], ref_results[model] = train_and_evaluate_calibrators(logits, 
                                                                        target, 
                                                                        test_logits, 
                                                                        test_target, 
                                                                        calibrators,
                                                                        **kwargs)
    

Calibrating model: wide-resnet-28x10
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 0.24s
Calibrator MLR fitted in 1.21s


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calibrator NICE_l2_hs[2] fitted in 205.06s
Calibrator NICE_l2_hs[5] fitted in 207.19s
Calibrator NICE_l4_hs[100, 100] fitted in 210.30s
Calibrator RealNVP_l2_hs[2] fitted in 255.18s
Calibrator RealNVP_l2_hs[5] fitted in 258.74s
Calibrator RealNVP_l4_hs[100, 100] fitted in 267.14s
Calibrator Planar_l5 fitted in 923.59s
Calibrator Planar_l10 fitted in 938.21s
Calibrator Planar_l20 fitted in 953.98s
Calibrator Radial_l5 fitted in 1025.51s
Calibrator Radial_l10 fitted in 1044.38s
Calibrator Radial_l20 fitted in 1056.34s
Calibrating model: densenet-121
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 0.80s
Calibrator MLR fitted in 1.24s


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calibrator NICE_l2_hs[2] fitted in 287.98s
Calibrator NICE_l2_hs[5] fitted in 291.81s
Calibrator NICE_l4_hs[100, 100] fitted in 296.06s
Calibrator RealNVP_l2_hs[2] fitted in 384.57s
Calibrator RealNVP_l2_hs[5] fitted in 383.10s
Calibrator RealNVP_l4_hs[100, 100] fitted in 399.63s
Calibrator Planar_l5 fitted in 1123.56s
Calibrator Planar_l10 fitted in 1151.44s
Calibrator Planar_l20 fitted in 1177.65s
Calibrator Radial_l5 fitted in 1173.77s
Calibrator Radial_l10 fitted in 1193.64s
Calibrator Radial_l20 fitted in 1203.61s
Calibrating model: resnet-101
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 0.70s
Calibrator MLR fitted in 1.56s


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calibrator NICE_l2_hs[2] fitted in 385.55s
Calibrator NICE_l2_hs[5] fitted in 390.43s
Calibrator NICE_l4_hs[100, 100] fitted in 393.33s
Calibrator RealNVP_l2_hs[2] fitted in 612.77s
Calibrator RealNVP_l2_hs[5] fitted in 612.71s
Calibrator RealNVP_l4_hs[100, 100] fitted in 627.83s
Calibrator Planar_l5 fitted in 1375.18s
Calibrator Planar_l10 fitted in 1400.18s
Calibrator Planar_l20 fitted in 1420.77s
Calibrator Radial_l5 fitted in 1321.90s
Calibrator Radial_l10 fitted in 1334.66s
Calibrator Radial_l20 fitted in 1360.29s
Calibrating model: vgg-19
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 0.59s
Calibrator MLR fitted in 1.16s


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calibrator NICE_l2_hs[2] fitted in 483.32s
Calibrator NICE_l2_hs[5] fitted in 487.76s
Calibrator NICE_l4_hs[100, 100] fitted in 491.63s
Calibrator RealNVP_l2_hs[2] fitted in 619.62s
Calibrator RealNVP_l2_hs[5] fitted in 602.58s
Calibrator RealNVP_l4_hs[100, 100] fitted in 585.12s
Calibrator Planar_l5 fitted in 1614.79s
Calibrator Planar_l10 fitted in 1642.84s
Calibrator Planar_l20 fitted in 1667.81s
Calibrator Radial_l5 fitted in 1483.53s
Calibrator Radial_l10 fitted in 1494.77s
Calibrator Radial_l20 fitted in 1537.75s
Calibrating model: resnext-29_8x16
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 0.75s
Calibrator MLR fitted in 1.48s


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Calibrator NICE_l2_hs[2] fitted in 589.09s
Calibrator NICE_l2_hs[5] fitted in 593.20s
Calibrator NICE_l4_hs[100, 100] fitted in 597.81s
Calibrator RealNVP_l2_hs[2] fitted in 598.01s
Calibrator RealNVP_l2_hs[5] fitted in 600.66s
Calibrator RealNVP_l4_hs[100, 100] fitted in 604.74s
Calibrator Planar_l5 fitted in 1800.78s
Calibrator Planar_l10 fitted in 1842.04s
Calibrator Planar_l20 fitted in 1865.35s
Calibrator Radial_l5 fitted in 1648.72s
Calibrator Radial_l10 fitted in 1658.17s
Calibrator Radial_l20 fitted in 1678.29s


In [11]:
ece_val_results = {}
ece_test_results = {}

acc_val_results = {}
acc_test_results = {}

nll_val_results = {}
nll_test_results = {}

for model, model_results in results.items():
    ece_val_results[model] = {}
    ece_test_results[model] = {}
    acc_val_results[model] = {}
    acc_test_results[model] = {}
    nll_val_results[model] = {}
    nll_test_results[model] = {}
    for cal, cal_results in model_results.items():
        ece_test_results[model][cal] = cal_results['Test']['ECE']
        ece_val_results[model][cal] = cal_results['Validation']['ECE']
        
        acc_test_results[model][cal] = cal_results['Test']['Accuracy']
        acc_val_results[model][cal] = cal_results['Validation']['Accuracy']
        
        nll_test_results[model][cal] = cal_results['Test']['NLL']
        nll_val_results[model][cal] = cal_results['Validation']['NLL']

**Results on the test set:**

In [12]:
df = pd.concat([pd.DataFrame.from_dict(acc_test_results, orient='columns'),
                pd.DataFrame.from_dict(ece_test_results, orient='columns'),
                pd.DataFrame.from_dict(nll_test_results, orient='columns')],
               axis=1,keys=['ACC','ECE', 'NLL']).swaplevel(0,1,axis=1).sort_index(axis=1)

df = df[models].loc[list(calibrators.keys())]
df.style.set_properties(**{'text-align': 'center'})\
    .set_caption('CIFAR100 Test set')\
    .apply(highlight_max, subset=[(model, 'ACC') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'ECE') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'NLL') for model in df.columns.levels[0]])\
    .set_table_styles([dict(selector="th", props=[("text-align", "center")]),
                       dict(selector="caption", props=[("text-align", "center"),
                                                      ("font-size", "200%"),
                                                      ("color", "black")])])

Unnamed: 0_level_0,wide-resnet-28x10,wide-resnet-28x10,wide-resnet-28x10,densenet-121,densenet-121,densenet-121,resnet-101,resnet-101,resnet-101,vgg-19,vgg-19,vgg-19,resnext-29_8x16,resnext-29_8x16,resnext-29_8x16
Unnamed: 0_level_1,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL
Uncalibrated,0.8039,0.0485303,0.817315,0.788,0.0872402,0.893708,0.72,0.114132,1.1341,0.727,0.176313,1.54045,0.7788,0.0967838,0.938889
Temp-Scaling,0.8039,0.0428475,0.813448,0.788,0.0352327,0.83548,0.72,0.0150577,1.00067,0.727,0.0480832,1.19965,0.7788,0.0281907,0.822031
MLR,0.8041,0.0422879,0.801043,0.7859,0.0356487,0.837193,0.7196,0.0213178,1.0025,0.7245,0.0394123,1.20897,0.7754,0.0274381,0.824304
NICE_l2_hs[2],0.7314,0.23645,3.1146,0.7032,0.258463,3.42523,0.5983,0.344847,4.52334,0.6469,0.31061,4.38338,0.6541,0.293076,3.70366
NICE_l2_hs[5],0.7388,0.228795,3.10888,0.7083,0.254194,3.42933,0.5936,0.344031,4.39339,0.6438,0.3134,4.37249,0.6563,0.291426,3.69575
"NICE_l4_hs[100, 100]",0.7346,0.231491,3.11128,0.7043,0.256147,3.40493,0.6048,0.337307,4.45166,0.6442,0.312019,4.42039,0.654,0.293424,3.64973
RealNVP_l2_hs[2],0.7395,0.230163,3.02327,0.7073,0.267025,3.65945,0.605,0.343502,4.43656,0.6573,0.306623,4.26656,0.6537,0.29819,3.67363
RealNVP_l2_hs[5],0.7388,0.232228,3.09131,0.7076,0.258978,3.31061,0.6052,0.34267,4.33277,0.6546,0.311063,4.37601,0.6406,0.317245,4.03552
"RealNVP_l4_hs[100, 100]",0.7373,0.231421,2.98086,0.7069,0.256342,3.26155,0.5899,0.360212,4.67202,0.6578,0.30533,4.23205,0.644,0.31554,3.99786
Planar_l5,0.7257,0.134784,1.46739,0.6969,0.159314,1.64063,0.6142,0.198277,1.93259,0.6571,0.14147,1.86142,0.6766,0.180067,1.80759


In [13]:
nll_df = pd.DataFrame.from_dict(nll_test_results, orient='columns')
min_nll_df = pd.DataFrame.from_dict(ref_results, orient='columns').loc['NLL']
nll_cal_df = nll_df.subtract(min_nll_df, axis='columns')

df = pd.concat([nll_df, nll_cal_df], axis=1,keys=['NLL','NLL_cal']).swaplevel(0,1,axis=1).sort_index(axis=1)
df = df[models].loc[list(calibrators.keys())]

df.rename(columns={model: model + ' NLL_min={:.4f}'.format(min_nll_df[model]) 
                   for model in df.columns.levels[0]}, level=0, inplace=True)

df.style.set_properties(**{'text-align': 'center'})\
    .format("{:.4f}")\
    .set_caption('CIFAR100 NLL decomposition')\
    .apply(highlight_min)\
    .set_table_styles([dict(selector="th", props=[("text-align", "center")]),
                       dict(selector="caption", props=[("text-align", "center"),
                                                      ("font-size", "200%"),
                                                      ("color", "black")])])

Unnamed: 0_level_0,wide-resnet-28x10 NLL_min=0.0016,wide-resnet-28x10 NLL_min=0.0016,densenet-121 NLL_min=0.0004,densenet-121 NLL_min=0.0004,resnet-101 NLL_min=0.0013,resnet-101 NLL_min=0.0013,vgg-19 NLL_min=0.0030,vgg-19 NLL_min=0.0030,resnext-29_8x16 NLL_min=0.0005,resnext-29_8x16 NLL_min=0.0005
Unnamed: 0_level_1,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal
Uncalibrated,0.8173,0.8157,0.8937,0.8933,1.1341,1.1328,1.5404,1.5375,0.9389,0.9384
Temp-Scaling,0.8134,0.8119,0.8355,0.8351,1.0007,0.9993,1.1997,1.1967,0.822,0.8215
MLR,0.801,0.7995,0.8372,0.8368,1.0025,1.0012,1.209,1.206,0.8243,0.8238
NICE_l2_hs[2],3.1146,3.113,3.4252,3.4248,4.5233,4.522,4.3834,4.3804,3.7037,3.7032
NICE_l2_hs[5],3.1089,3.1073,3.4293,3.4289,4.3934,4.392,4.3725,4.3695,3.6957,3.6953
"NICE_l4_hs[100, 100]",3.1113,3.1097,3.4049,3.4045,4.4517,4.4503,4.4204,4.4174,3.6497,3.6492
RealNVP_l2_hs[2],3.0233,3.0217,3.6594,3.659,4.4366,4.4352,4.2666,4.2636,3.6736,3.6731
RealNVP_l2_hs[5],3.0913,3.0897,3.3106,3.3102,4.3328,4.3314,4.376,4.373,4.0355,4.035
"RealNVP_l4_hs[100, 100]",2.9809,2.9793,3.2616,3.2611,4.672,4.6707,4.2321,4.2291,3.9979,3.9974
Planar_l5,1.4674,1.4658,1.6406,1.6402,1.9326,1.9312,1.8614,1.8584,1.8076,1.8071


**Results on the validation set:**

In [14]:
df = pd.concat([pd.DataFrame.from_dict(acc_val_results, orient='columns'),
                pd.DataFrame.from_dict(ece_val_results, orient='columns'),
                pd.DataFrame.from_dict(nll_val_results, orient='columns')],
               axis=1,keys=['ACC','ECE', 'NLL']).swaplevel(0,1,axis=1).sort_index(axis=1)

df = df[models].loc[list(calibrators.keys())]
df.style.set_properties(**{'text-align': 'center'})\
    .set_caption('CIFAR100 Validation set')\
    .apply(highlight_max, subset=[(model, 'ACC') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'ECE') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'NLL') for model in df.columns.levels[0]])\
    .set_table_styles([dict(selector="th", props=[("text-align", "center")]),
                       dict(selector="caption", props=[("text-align", "center"),
                                                      ("font-size", "200%"),
                                                      ("color", "black")])])

Unnamed: 0_level_0,wide-resnet-28x10,wide-resnet-28x10,wide-resnet-28x10,densenet-121,densenet-121,densenet-121,resnet-101,resnet-101,resnet-101,vgg-19,vgg-19,vgg-19,resnext-29_8x16,resnext-29_8x16,resnext-29_8x16
Unnamed: 0_level_1,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL
Uncalibrated,0.7994,0.053124,0.819501,0.7826,0.0887537,0.940488,0.7218,0.110705,1.13483,0.7162,0.189706,1.5726,0.775,0.101819,0.981964
Temp-Scaling,0.7994,0.0484992,0.815698,0.7826,0.0417907,0.873531,0.7218,0.0265887,1.00431,0.7162,0.0541274,1.2135,0.775,0.0292295,0.852484
MLR,0.8064,0.0437405,0.778041,0.784,0.0395841,0.852308,0.725,0.0208451,0.984427,0.7188,0.0483676,1.19587,0.7806,0.028018,0.829799
NICE_l2_hs[2],1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07
NICE_l2_hs[5],1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,0.9998,0.0002,0.0032235
"NICE_l4_hs[100, 100]",1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,0.9998,0.00011205,0.0032235,1.0,0.0,-1.19209e-07,0.9998,0.0002,0.0032235
RealNVP_l2_hs[2],1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,0.9996,0.0004,0.00644712
RealNVP_l2_hs[5],0.9998,0.0002,0.0032235,0.9998,0.0002,0.0032235,1.0,0.0,-1.19209e-07,1.0,0.0,-1.19209e-07,0.9998,0.0002,0.0032235
"RealNVP_l4_hs[100, 100]",0.9998,0.0002,0.0032235,1.0,0.0,-1.19209e-07,0.9996,0.0004,0.00644712,1.0,0.0,-1.19209e-07,0.9998,0.0002,0.0032235
Planar_l5,0.9796,0.0699263,0.134684,0.9882,0.0690556,0.108039,0.9378,0.0921681,0.278509,0.8692,0.0645347,0.495031,0.9916,0.0631541,0.094549
