# Benchmark NICE as calibration method

In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
import os
import sys
import time
import importlib
import functools
import concurrent
import collections
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.isotonic import IsotonicRegression
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input
import tensorflow.keras.backend as K

pd.set_option('colheader_justify', 'center')

%aimport utils
%aimport utils.ops
%aimport utils.metrics
%aimport utils.visualization
%aimport utils.data
%aimport flows.nice
%aimport flows.normalizing_flows
%aimport calibrators
from utils.ops import onehot_encode, optim_temperature, detection_log_likelihood_ratios
from utils.metrics import neg_log_likelihood, accuracy, expected_calibration_error
from utils.visualization import plot_pdf_simplex, plot_prob_simplex, reliability_plot, ECE_plot
from utils.data import get_cifar10
from flows.nice import NiceFlow
from calibrators import DummyCalibrator, PAVCalibrator, TempScalingCalibrator, MLRCalibrator
from calibrators import NiceCalibrator, PlanarFlowCalibrator, RadialFlowCalibrator, RealNvpCalibrator

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def highlight_max(s):
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

In [5]:
def highlight_min(s):
    is_min = s == s.min()
    return ['font-weight: bold' if v else '' for v in is_min]

### Helper methods:

In [6]:
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [7]:
def score(calibrator, logits, target):
    probs = calibrator.predict(logits)
    nll = neg_log_likelihood(probs, target)
    ece = expected_calibration_error(probs, target, bins=15)
    acc = accuracy(probs, target)
    
    metrics = {
        'NLL': nll,
        'ECE': ece,
        'Accuracy': acc,
    }
    
    return metrics

In [8]:
def train_calibrator(Calibrator, logits, target):
    cal = Calibrator(logits, target)
    return cal

In [9]:
if len(get_available_gpus())>1:
    def train_and_evaluate_calibrators(gpu, logits, target, test_logits, test_target, calibrators, **kwargs):
        
        ## Train NICE on test set to obtain NLLmin
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True), graph=tf.Graph()) as sess:
            K.set_session(sess)
            with tf.device(gpu):
                nice_ref_cal = NiceCalibrator(test_logits, test_target, **kwargs['nice_ref_args'])
                ref_results = score(nice_ref_cal, test_logits, test_target)

        results = collections.OrderedDict()
        for cal, Calibrator in calibrators.items():
            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True), graph=tf.Graph()) as sess:
                K.set_session(sess)
                with tf.device(gpu):
                    t0 = time.time()
                    model = train_calibrator(Calibrator, logits, target)
                    t1 = time.time() - t0
                    print("Calibrator {} fitted in {:.2f}s".format(cal, t1))
                    results[cal] = {'Training time': t1,
                                    'Validation': score(model, logits, target),
                                    'Test': score(model, test_logits, test_target)}

        return results, ref_results
    
else:
    
    def train_and_evaluate_calibrators(logits, target, test_logits, test_target, calibrators, **kwargs):
        ## Train NICE on test set to obtain NLLmin
        nice_ref_cal = NiceCalibrator(test_logits, test_target, **kwargs['nice_ref_args'])
        ref_results = score(nice_ref_cal, test_logits, test_target)

        results = collections.OrderedDict()
        for cal, Calibrator in calibrators.items():
            t0 = time.time()
            model = train_calibrator(Calibrator, logits, target)
            t1 = time.time() - t0
            print("Calibrator {} fitted in {:.2f}s".format(cal, t1))
            results[cal] = {'Training time': t1,
                            'Validation': score(model, logits, target),
                            'Test': score(model, test_logits, test_target)}

        return results, ref_results

In [10]:
def calibrate_models_gpu(args):
    gpu, models = args
    for model in models:
        print("Calibrating model: {}".format(model))
        data_path = os.path.join('../data', model+'_cifar100')
        prefix = os.path.join(data_path, 'cifar100_'+model)

        logits = np.load(prefix + '_logit_prediction_valid.npy')
        test_logits = np.load(prefix + '_logit_prediction_test.npy')

        target = np.load(prefix + '_true_valid.npy')
        test_target = np.load(prefix + '_true_test.npy')

        results[model], ref_results[model] = train_and_evaluate_calibrators(gpu,
                                                                            logits, 
                                                                            target, 
                                                                            test_logits, 
                                                                            test_target, 
                                                                            calibrators,
                                                                            **kwargs)
    return results, ref_results

## CIFAR-100

In [11]:
gpus = get_available_gpus()

models = [
    'wide-resnet-28x10',
    'densenet-121',
    'densenet-169',
    'resnet-101',
    'vgg-19',
    'preactresnet-18',
    'preactresnet-164',
    'resnext-29_8x16',
    'wide-resnet-40x10',
]
models = [
    'wide-resnet-28x10',
    'densenet-121',
    'densenet-169',
    'resnet-101',
]

In [12]:
results = collections.OrderedDict()
ref_results = collections.OrderedDict()

calibrators = {
    'Uncalibrated': DummyCalibrator,
    'Temp-Scaling': TempScalingCalibrator,
    'MLR': MLRCalibrator,
}

nice_args = [
    {
        'layers': 2,
        'hidden_size': [1],
        'epochs': 500,
    }, {
        'layers': 2,
        'hidden_size': [5],
        'epochs': 500,
    }, {
        'layers': 4,
        'hidden_size': [100, 100],
        'epochs': 1000,
    },
]

nvp_args = [
    {
        'layers': 2,
        'hidden_size': [1],
        'epochs': 500,
    }, {
        'layers': 2,
        'hidden_size': [5],
        'epochs': 500,
    }, {
        'layers': 4,
        'hidden_size': [100, 100],
        'epochs': 1000,
    },
]

planar_args = [
    {
        'layers': 5,
        'epochs': 2000,
    },{
        'layers': 10,
        'epochs': 2000,
    },{
        'layers': 20,
        'epochs': 2000,
    }
]

radial_args = [
    {
        'layers': 5,
        'epochs': 2000,
    },{
        'layers': 10,
        'epochs': 2000,
    },{
        'layers': 20,
        'epochs': 2000,
    }
]

    
"""
for nice in nice_args:
    name = 'NICE_l{}_hs{}'.format(nice['layers'], nice['hidden_size'])
    calibrators[name] = lambda logits, target: NiceCalibrator(logits, target, **nice)
    
for nvp in nvp_args:
    name = 'RealNVP_l{}_hs{}'.format(nvp['layers'], nvp['hidden_size'])
    calibrators[name] = lambda logits, target: RealNvpCalibrator(logits, target, **nvp)

for planar in planar_args:
    name = 'Planar_l{}'.format(planar['layers'])
    calibrators[name] = lambda logits, target: PlanarFlowCalibrator(logits, target, **planar)
"""
for radial in radial_args:
    name = 'Radial_l{}'.format(radial['layers'])
    calibrators[name] = lambda logits, target: RadialFlowCalibrator(logits, target, **radial)
    

nice_ref_args = {
    'layers': 4,
    'hidden_size': [100, 100],
    'epochs': 1000,
}

kwargs = {
    'nice_ref_args': nice_ref_args,
}

if len(gpus)<2:
    for model in models:
        data_path = os.path.join('../data', model+'_cifar100')
        prefix = os.path.join(data_path, 'cifar100_'+model)

        logits = np.load(prefix + '_logit_prediction_valid.npy')
        test_logits = np.load(prefix + '_logit_prediction_test.npy')

        target = np.load(prefix + '_true_valid.npy')
        test_target = np.load(prefix + '_true_test.npy')

        results[model], ref_results[model] = train_and_evaluate_calibrators(logits, 
                                                                            target, 
                                                                            test_logits, 
                                                                            test_target, 
                                                                            calibrators,
                                                                            **kwargs)
else:
    n = len(models)//len(gpus)
    zipped_models = [models[i*n:(i+1)*n] for i in range((len(models) + n - 1) // n)]
    with concurrent.futures.ThreadPoolExecutor(len(gpus)) as executor:
        results, results_ref = functools.reduce(lambda x, y: [{**x[0], **y[0]}, {**x[1], **y[1]}], 
                                                [(r, r_ref) for r, r_ref in executor.map(calibrate_models_gpu, zip(gpus, zipped_models))],
                                                [{}, {}])
        print('results: ', results)


Calibrating model: wide-resnet-28x10
Calibrating model: densenet-169
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 1.01s
Calibrator MLR fitted in 5.24s
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 4.03s
Calibrator MLR fitted in 11.05s
Calibrator Radial_l5 fitted in 1421.64s
Calibrator Radial_l5 fitted in 1445.25s
Calibrator Radial_l10 fitted in 1446.86s


Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E354F64048>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.
Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E35EEA06A0>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi

Calibrator Radial_l10 fitted in 1455.74s


Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E35EE83128>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.
Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E365C2B320>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi

Calibrator Radial_l20 fitted in 1482.68s
Calibrating model: densenet-121
Calibrator Radial_l20 fitted in 1494.32s
Calibrating model: resnet-101
Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 4.04s
Calibrator MLR fitted in 5.88s


Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E0B66B9438>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.
Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E0B6805128>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi

Calibrator Uncalibrated fitted in 0.00s
Calibrator Temp-Scaling fitted in 3.94s
Calibrator MLR fitted in 6.82s
Calibrator Radial_l5 fitted in 1407.75s
Calibrator Radial_l5 fitted in 1427.54s
Calibrator Radial_l10 fitted in 1419.02s
Calibrator Radial_l10 fitted in 1442.50s


Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E379D0D048>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.
Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x000001E37DD4B320>>
Traceback (most recent call last):
  File "C:\Users\sergi\Anaconda3\envs\calib\lib\site-packages\tensorflow\python\client\session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "C:\Users\sergi

Calibrator Radial_l20 fitted in 1431.20s
Calibrator Radial_l20 fitted in 1445.92s
results:  {'wide-resnet-28x10': OrderedDict([('Uncalibrated', {'Training time': 0.002991914749145508, 'Validation': {'NLL': 0.8195012069079306, 'ECE': 0.05312395484149458, 'Accuracy': 0.7994}, 'Test': {'NLL': 0.817314659817216, 'ECE': 0.048530271295458054, 'Accuracy': 0.8039}}), ('Temp-Scaling', {'Training time': 1.0063085556030273, 'Validation': {'NLL': 0.8156984518259202, 'ECE': 0.04849923598095774, 'Accuracy': 0.7994}, 'Test': {'NLL': 0.8134477968198074, 'ECE': 0.042847461207210995, 'Accuracy': 0.8039}}), ('MLR', {'Training time': 5.238993167877197, 'Validation': {'NLL': 0.7780407887568063, 'ECE': 0.04374047675362761, 'Accuracy': 0.8064}, 'Test': {'NLL': 0.8010431549744933, 'ECE': 0.04228785558311745, 'Accuracy': 0.8041}}), ('Radial_l5', {'Training time': 1421.6434490680695, 'Validation': {'NLL': 0.7105920608332322, 'ECE': 0.027069147253036527, 'Accuracy': 0.8136}, 'Test': {'NLL': 0.7717742568379763, '

In [13]:
ece_val_results = {}
ece_test_results = {}

acc_val_results = {}
acc_test_results = {}

nll_val_results = {}
nll_test_results = {}

for model, model_results in results.items():
    ece_val_results[model] = {}
    ece_test_results[model] = {}
    acc_val_results[model] = {}
    acc_test_results[model] = {}
    nll_val_results[model] = {}
    nll_test_results[model] = {}
    for cal, cal_results in model_results.items():
        ece_test_results[model][cal] = cal_results['Test']['ECE']
        ece_val_results[model][cal] = cal_results['Validation']['ECE']
        
        acc_test_results[model][cal] = cal_results['Test']['Accuracy']
        acc_val_results[model][cal] = cal_results['Validation']['Accuracy']
        
        nll_test_results[model][cal] = cal_results['Test']['NLL']
        nll_val_results[model][cal] = cal_results['Validation']['NLL']

**Results on the test set:**

In [14]:
df = pd.concat([pd.DataFrame.from_dict(acc_test_results, orient='columns'),
                pd.DataFrame.from_dict(ece_test_results, orient='columns'),
                pd.DataFrame.from_dict(nll_test_results, orient='columns')],
               axis=1,keys=['ACC','ECE', 'NLL']).swaplevel(0,1,axis=1).sort_index(axis=1)

df = df[models].loc[list(calibrators.keys())]
df.style.set_properties(**{'text-align': 'center'})\
    .set_caption('CIFAR100 Test set')\
    .apply(highlight_max, subset=[(model, 'ACC') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'ECE') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'NLL') for model in df.columns.levels[0]])\
    .set_table_styles([dict(selector="th", props=[("text-align", "center")]),
                       dict(selector="caption", props=[("text-align", "center"),
                                                      ("font-size", "200%"),
                                                      ("color", "black")])])

Unnamed: 0_level_0,wide-resnet-28x10,wide-resnet-28x10,wide-resnet-28x10,densenet-121,densenet-121,densenet-121,densenet-169,densenet-169,densenet-169,resnet-101,resnet-101,resnet-101
Unnamed: 0_level_1,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL
Uncalibrated,0.8039,0.0485303,0.817315,0.788,0.0872402,0.893708,0.7905,0.0888329,0.874644,0.72,0.114132,1.1341
Temp-Scaling,0.8039,0.0428475,0.813448,0.788,0.0352327,0.83548,0.7905,0.0377451,0.815562,0.72,0.0150577,1.00067
MLR,0.8041,0.0422879,0.801043,0.7859,0.0356487,0.837193,0.7884,0.0380037,0.81874,0.7196,0.0213178,1.0025
Radial_l5,0.802,0.0383393,0.771774,0.7859,0.0349891,0.827677,0.7886,0.039424,0.811474,0.7179,0.0308361,1.01165
Radial_l10,0.8009,0.040976,0.777598,0.7851,0.0349326,0.822581,0.7884,0.0376116,0.811196,0.7172,0.0286029,1.0121
Radial_l20,0.8033,0.0379528,0.772337,0.7838,0.0326918,0.818417,0.7884,0.0398781,0.813292,0.7173,0.0294414,1.0118


In [15]:
nll_df = pd.DataFrame.from_dict(nll_test_results, orient='columns')
min_nll_df = pd.DataFrame.from_dict(ref_results, orient='columns').loc['NLL']
nll_cal_df = nll_df.subtract(min_nll_df, axis='columns')

df = pd.concat([nll_df, nll_cal_df], axis=1,keys=['NLL','NLL_cal']).swaplevel(0,1,axis=1).sort_index(axis=1)
df = df[models].loc[list(calibrators.keys())]

df.rename(columns={model: model + ' NLL_min={:.4f}'.format(min_nll_df[model]) 
                   for model in df.columns.levels[0]}, level=0, inplace=True)

df.style.set_properties(**{'text-align': 'center'})\
    .format("{:.4f}")\
    .set_caption('CIFAR100 NLL decomposition')\
    .apply(highlight_min)\
    .set_table_styles([dict(selector="th", props=[("text-align", "center")]),
                       dict(selector="caption", props=[("text-align", "center"),
                                                      ("font-size", "200%"),
                                                      ("color", "black")])])

Unnamed: 0_level_0,wide-resnet-28x10 NLL_min=0.0078,wide-resnet-28x10 NLL_min=0.0078,densenet-121 NLL_min=0.0010,densenet-121 NLL_min=0.0010,densenet-169 NLL_min=0.0005,densenet-169 NLL_min=0.0005,resnet-101 NLL_min=0.0024,resnet-101 NLL_min=0.0024
Unnamed: 0_level_1,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal
Uncalibrated,0.8173,0.8095,0.8937,0.8927,0.8746,0.8741,1.1341,1.1317
Temp-Scaling,0.8134,0.8056,0.8355,0.8344,0.8156,0.8151,1.0007,0.9983
MLR,0.801,0.7932,0.8372,0.8362,0.8187,0.8182,1.0025,1.0002
Radial_l5,0.7718,0.7639,0.8277,0.8266,0.8115,0.811,1.0117,1.0093
Radial_l10,0.7776,0.7698,0.8226,0.8215,0.8112,0.8107,1.0121,1.0098
Radial_l20,0.7723,0.7645,0.8184,0.8174,0.8133,0.8128,1.0118,1.0094


**Results on the validation set:**

In [16]:
df = pd.concat([pd.DataFrame.from_dict(acc_val_results, orient='columns'),
                pd.DataFrame.from_dict(ece_val_results, orient='columns'),
                pd.DataFrame.from_dict(nll_val_results, orient='columns')],
               axis=1,keys=['ACC','ECE', 'NLL']).swaplevel(0,1,axis=1).sort_index(axis=1)

df = df[models].loc[list(calibrators.keys())]
df.style.set_properties(**{'text-align': 'center'})\
    .set_caption('CIFAR100 Validation set')\
    .apply(highlight_max, subset=[(model, 'ACC') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'ECE') for model in df.columns.levels[0]])\
    .apply(highlight_min, subset=[(model, 'NLL') for model in df.columns.levels[0]])\
    .set_table_styles([dict(selector="th", props=[("text-align", "center")]),
                       dict(selector="caption", props=[("text-align", "center"),
                                                      ("font-size", "200%"),
                                                      ("color", "black")])])

Unnamed: 0_level_0,wide-resnet-28x10,wide-resnet-28x10,wide-resnet-28x10,densenet-121,densenet-121,densenet-121,densenet-169,densenet-169,densenet-169,resnet-101,resnet-101,resnet-101
Unnamed: 0_level_1,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL
Uncalibrated,0.7994,0.053124,0.819501,0.7826,0.0887537,0.940488,0.7868,0.0894108,0.910888,0.7218,0.110705,1.13483
Temp-Scaling,0.7994,0.0484992,0.815698,0.7826,0.0417907,0.873531,0.7868,0.0340097,0.846234,0.7218,0.0265887,1.00431
MLR,0.8064,0.0437405,0.778041,0.784,0.0395841,0.852308,0.7882,0.0370505,0.825118,0.725,0.0208451,0.984427
Radial_l5,0.8136,0.0270691,0.710592,0.7854,0.0372728,0.838074,0.7874,0.0363451,0.812316,0.7254,0.032004,0.992427
Radial_l10,0.8154,0.0265129,0.695722,0.7866,0.038019,0.829418,0.7882,0.035337,0.811889,0.7246,0.0294811,0.99231
Radial_l20,0.8134,0.0269807,0.710375,0.7872,0.0352751,0.815907,0.7882,0.035237,0.814217,0.7258,0.0313394,0.992152
