# Benchmark NICE as a calibration method

In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
import os
import sys
import time
import importlib
import collections
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.isotonic import IsotonicRegression
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input

pd.set_option('colheader_justify', 'center')

%aimport utils
%aimport utils.ops
%aimport utils.metrics
%aimport utils.visualization
%aimport utils.data
%aimport flows.nice
%aimport calibrators
from utils.ops import onehot_encode, optim_temperature, detection_log_likelihood_ratios
from utils.metrics import neg_log_likelihood, accuracy, expected_calibration_error
from utils.visualization import plot_pdf_simplex, plot_prob_simplex, reliability_plot, ECE_plot
from utils.data import get_cifar10
from flows.nice import NiceFlow
from calibrators import PAVCalibrator, NiceCalibrator, TempScalingCalibrator, MLRCalibrator, MatrixScalingCalibrator, DummyCalibrator

In [4]:
def highlight_max(s):
    """Build a list of CSS strings that bolds every entry of ``s`` equal to its maximum.

    Intended for ``Styler.apply``: one CSS string is returned per element of
    ``s``, ``'font-weight: bold'`` for entries equal to ``s.max()`` and ``''``
    for all others (ties are all bolded).
    """
    bold = 'font-weight: bold'
    peak = s.max()
    return [bold if hit else '' for hit in (s == peak)]

In [5]:
def highlight_min(s):
    """Build a list of CSS strings that bolds every entry of ``s`` equal to its minimum.

    Intended for ``Styler.apply``: one CSS string is returned per element of
    ``s``, ``'font-weight: bold'`` for entries equal to ``s.min()`` and ``''``
    for all others (ties are all bolded).
    """
    bold = 'font-weight: bold'
    lowest = s.min()
    return [bold if hit else '' for hit in (s == lowest)]

## CIFAR-100

In [6]:
# Architectures to benchmark. Each name selects the '../data/<name>_cifar100'
# folder of precomputed logits/labels loaded by the benchmark loop, and the
# list order is reused to order the model columns of the result tables.
# Keep this a list: it is used as a column selector (`df[models]`).
models = [
    'wide-resnet-28x10',
    'densenet-121',
    'densenet-169',
    'resnet-101',
    'vgg-19',
    'preactresnet-18',
    'preactresnet-164',
    'resnext-29_8x16',
    'wide-resnet-40x10',
]

In [7]:
def score(calibrator, logits, target):
    """Evaluate a fitted calibrator on one data split.

    Args:
        calibrator: Object exposing ``predict(logits)``.
        logits: Inputs handed to ``calibrator.predict``.
        target: Ground truth passed to each metric alongside the predictions.

    Returns:
        dict with keys ``'NLL'``, ``'ECE'`` (computed with 15 bins) and
        ``'Accuracy'``, each holding the value returned by the corresponding
        metric function.
    """
    calibrated = calibrator.predict(logits)
    return {
        'NLL': neg_log_likelihood(calibrated, target),
        'ECE': expected_calibration_error(calibrated, target, bins=15),
        'Accuracy': accuracy(calibrated, target),
    }

In [8]:
def train_calibrator(Calibrator, logits, target):
    """Instantiate ``Calibrator`` on ``(logits, target)`` and return the instance.

    Construction is the only step performed here; any fitting happens inside
    the ``Calibrator`` constructor itself.
    """
    return Calibrator(logits, target)

In [9]:
def train_and_evaluate_calibrators(logits, target, test_logits, test_target, calibrators, **kwargs):
    """Fit each calibrator on ``(logits, target)`` and score it on both data splits.

    A reference ``NiceCalibrator`` is first fitted directly on the test split;
    its test-split scores are returned separately so that its NLL can serve as
    the "NLLmin" reference value.

    Args:
        logits, target: Data every benchmarked calibrator is fitted on; scores
            on this data are reported under ``'Validation'``.
        test_logits, test_target: Second split; scores on it are reported
            under ``'Test'``.
        calibrators: Mapping ``name -> Calibrator`` where
            ``Calibrator(logits, target)`` returns a calibrator usable by
            ``score``.
        **kwargs: Must contain ``'nice_ref_args'`` (keyword arguments for the
            reference ``NiceCalibrator``) and ``'nice_args'`` (an iterable of
            keyword-argument dicts for ``NiceCalibrator``, each holding at
            least ``'layers'`` and ``'hidden_size'``).

    Returns:
        ``(results, ref_results)``. ``results`` is an ``OrderedDict`` mapping
        each calibrator name to a dict with keys ``'Training time'`` (seconds),
        ``'Validation'`` and ``'Test'``; ``ref_results`` is the ``score`` dict
        of the reference calibrator on the test split.

    Raises:
        KeyError: If ``'nice_ref_args'`` or ``'nice_args'`` is missing.
    """
    # Look both settings up before any training so that a missing key fails
    # immediately instead of after the baselines have already been fitted.
    nice_ref_args = kwargs['nice_ref_args']
    nice_args = kwargs['nice_args']

    def _fit_and_score(fit):
        """Time ``fit()`` and score the calibrator it returns on both splits."""
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments (unlike time.time()).
        start = time.perf_counter()
        fitted = fit()
        elapsed = time.perf_counter() - start
        return {'Training time': elapsed,
                'Validation': score(fitted, logits, target),
                'Test': score(fitted, test_logits, test_target)}

    ## Train NICE on test set to obtain NLLmin
    nice_ref_cal = NiceCalibrator(test_logits, test_target, **nice_ref_args)
    ref_results = score(nice_ref_cal, test_logits, test_target)

    results = collections.OrderedDict()
    for cal, Calibrator in calibrators.items():
        results[cal] = _fit_and_score(lambda: train_calibrator(Calibrator, logits, target))

    # Add NICE
    # NOTE: the key only encodes 'layers' and 'hidden_size'; two configurations
    # sharing both values map to the same key and the later one overwrites.
    for nice in nice_args:
        name = 'NICE_l{}_hs{}'.format(nice['layers'], nice['hidden_size'])
        results[name] = _fit_and_score(lambda: NiceCalibrator(logits, target, **nice))

    return results, ref_results

In [10]:
# Per-model outputs of the benchmark, filled in the order of `models`.
results = collections.OrderedDict()
ref_results = collections.OrderedDict()

# Baseline calibrators: display name -> class, built as Calibrator(logits, target).
calibrators = {
    'Uncalibrated': DummyCalibrator,
    'Temp-Scaling': TempScalingCalibrator,
    'MLR': MLRCalibrator,
}

# NICE variants to benchmark, given as (layers, hidden_size) pairs;
# every variant is trained for 500 epochs.
nice_args = [
    {'layers': n_layers, 'hidden_size': hidden, 'epochs': 500}
    for n_layers, hidden in [(2, [2]), (2, [5]), (3, [5])]
]

# Row labels of the NICE variants; same format as the keys produced by
# train_and_evaluate_calibrators.
names = ['NICE_l{}_hs{}'.format(cfg['layers'], cfg['hidden_size']) for cfg in nice_args]

# Larger NICE model fitted on the test split to obtain the NLLmin reference.
nice_ref_args = {'layers': 4, 'hidden_size': [100, 100], 'epochs': 1000}

kwargs = {'nice_args': nice_args, 'nice_ref_args': nice_ref_args}

for model in models:
    data_path = os.path.join('../data', model+'_cifar100')
    prefix = os.path.join(data_path, 'cifar100_'+model)

    # Precomputed network outputs, then the matching labels (validation, test).
    logits, test_logits = [
        np.load(prefix + tail)
        for tail in ('_logit_prediction_valid.npy', '_logit_prediction_test.npy')
    ]
    target, test_target = [
        np.load(prefix + tail)
        for tail in ('_true_valid.npy', '_true_test.npy')
    ]

    results[model], ref_results[model] = train_and_evaluate_calibrators(
        logits, target, test_logits, test_target, calibrators, **kwargs
    )
    

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
# One {model: {calibrator: value}} table per (metric, split) pair.
ece_val_results, ece_test_results = {}, {}
acc_val_results, acc_test_results = {}, {}
nll_val_results, nll_test_results = {}, {}

for model, model_results in results.items():
    # Pivot this model's per-calibrator results into each metric table.
    ece_test_results[model] = {cal: res['Test']['ECE'] for cal, res in model_results.items()}
    ece_val_results[model] = {cal: res['Validation']['ECE'] for cal, res in model_results.items()}

    acc_test_results[model] = {cal: res['Test']['Accuracy'] for cal, res in model_results.items()}
    acc_val_results[model] = {cal: res['Validation']['Accuracy'] for cal, res in model_results.items()}

    nll_test_results[model] = {cal: res['Test']['NLL'] for cal, res in model_results.items()}
    nll_val_results[model] = {cal: res['Validation']['NLL'] for cal, res in model_results.items()}

**Results on the test set:**

In [41]:
# Accuracy and ECE on the test split, paired per model:
# column level 0 is the model, level 1 is the metric.
df = (
    pd.concat(
        [
            pd.DataFrame.from_dict(acc_test_results, orient='columns'),
            pd.DataFrame.from_dict(ece_test_results, orient='columns'),
        ],
        axis=1,
        keys=['ACC', 'ECE'],
    )
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
)

# Columns follow `models`; rows list the baselines first, then the NICE variants.
df = df[models].loc[list(calibrators) + names]

# Bold the best value per column: highest accuracy, lowest ECE.
(
    df.style
    .set_properties(**{'text-align': 'center'})
    .format("{:.2%}")
    .set_caption('CIFAR100 Test set')
    .apply(highlight_max, subset=[(name, 'ACC') for name in df.columns.levels[0]])
    .apply(highlight_min, subset=[(name, 'ECE') for name in df.columns.levels[0]])
    .set_table_styles([
        {"selector": "th", "props": [("text-align", "center")]},
        {"selector": "caption", "props": [("text-align", "center"),
                                          ("font-size", "200%"),
                                          ("color", "black")]},
    ])
)

Unnamed: 0_level_0,wide-resnet-28x10,wide-resnet-28x10,densenet-121,densenet-121,densenet-169,densenet-169,resnet-101,resnet-101,vgg-19,vgg-19,preactresnet-18,preactresnet-18,preactresnet-164,preactresnet-164,resnext-29_8x16,resnext-29_8x16,wide-resnet-40x10,wide-resnet-40x10
Unnamed: 0_level_1,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE
Uncalibrated,80.39%,4.85%,78.80%,8.72%,79.05%,8.88%,72.00%,11.41%,72.70%,17.63%,76.60%,10.78%,73.28%,15.75%,77.88%,9.68%,76.74%,14.77%
Temp-Scaling,80.39%,4.28%,78.80%,3.52%,79.05%,3.77%,72.00%,1.51%,72.70%,4.81%,76.60%,3.10%,73.28%,2.07%,77.88%,2.82%,76.74%,3.77%
MLR,80.41%,4.23%,78.59%,3.56%,78.86%,3.82%,71.96%,2.13%,72.45%,3.94%,76.27%,3.36%,73.11%,1.92%,77.54%,2.74%,77.33%,3.99%
NICE_l2_hs[2],79.58%,4.18%,77.39%,8.39%,77.38%,8.72%,70.78%,11.57%,71.12%,13.80%,74.36%,11.45%,71.57%,16.43%,75.99%,10.69%,76.81%,12.21%
NICE_l2_hs[5],78.10%,4.23%,75.36%,8.95%,75.87%,9.03%,68.92%,12.58%,70.33%,10.66%,72.92%,11.72%,67.79%,19.43%,73.31%,12.42%,75.31%,12.62%
NICE_l3_hs[5],76.64%,6.26%,73.93%,10.72%,74.41%,9.88%,67.51%,13.34%,69.97%,9.37%,71.56%,13.32%,67.58%,19.49%,72.95%,12.63%,73.63%,13.38%


In [36]:
# Test NLL of every calibrator, and the same values after subtracting the
# per-model NLL of the reference NICE calibrator (NLL_min).
nll_df = pd.DataFrame.from_dict(nll_test_results, orient='columns')
min_nll_df = pd.DataFrame.from_dict(ref_results, orient='columns').loc['NLL']
nll_cal_df = nll_df.subtract(min_nll_df, axis='columns')

# Column level 0 is the model, level 1 is NLL / NLL_cal.
df = (
    pd.concat([nll_df, nll_cal_df], axis=1, keys=['NLL', 'NLL_cal'])
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
)
df = df[models].loc[list(calibrators) + names]

# Append each model's NLL_min to its column header.
df = df.rename(
    columns={name: name + ' NLL_min={:.4f}'.format(min_nll_df[name])
             for name in df.columns.levels[0]},
    level=0,
)

# Bold the lowest value in every column.
(
    df.style
    .set_properties(**{'text-align': 'center'})
    .format("{:.4f}")
    .set_caption('CIFAR100 NLL decomposition')
    .apply(highlight_min)
    .set_table_styles([
        {"selector": "th", "props": [("text-align", "center")]},
        {"selector": "caption", "props": [("text-align", "center"),
                                          ("font-size", "200%"),
                                          ("color", "black")]},
    ])
)

Unnamed: 0_level_0,wide-resnet-28x10 NLL_min=0.0139,wide-resnet-28x10 NLL_min=0.0139,densenet-121 NLL_min=0.0017,densenet-121 NLL_min=0.0017,densenet-169 NLL_min=0.0043,densenet-169 NLL_min=0.0043,resnet-101 NLL_min=0.2661,resnet-101 NLL_min=0.2661,vgg-19 NLL_min=0.0008,vgg-19 NLL_min=0.0008,preactresnet-18 NLL_min=0.0014,preactresnet-18 NLL_min=0.0014,preactresnet-164 NLL_min=0.0070,preactresnet-164 NLL_min=0.0070,resnext-29_8x16 NLL_min=0.0044,resnext-29_8x16 NLL_min=0.0044,wide-resnet-40x10 NLL_min=0.0011,wide-resnet-40x10 NLL_min=0.0011
Unnamed: 0_level_1,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal,NLL,NLL_cal
Uncalibrated,0.8173,0.8034,0.8937,0.892,0.8746,0.8704,1.1341,0.868,1.5404,1.5396,1.0573,1.056,1.3485,1.3415,0.9389,0.9345,1.221,1.2199
Temp-Scaling,0.8134,0.7995,0.8355,0.8338,0.8156,0.8113,1.0007,0.7345,1.1997,1.1989,0.9079,0.9066,0.9749,0.9679,0.822,0.8177,0.9055,0.9044
MLR,0.801,0.7871,0.8372,0.8355,0.8188,0.8145,1.0025,0.7364,1.209,1.2082,0.9151,0.9138,0.9818,0.9748,0.8243,0.8199,0.8691,0.868
NICE_l2_hs[2],0.8526,0.8387,0.9543,0.9527,0.9571,0.9528,1.2217,0.9556,1.4757,1.4749,1.1572,1.1559,1.4851,1.4781,1.0644,1.06,1.1428,1.1417
NICE_l2_hs[5],1.0087,0.9948,1.1374,1.1358,1.0625,1.0583,1.3617,1.0956,1.4635,1.4627,1.3547,1.3534,1.7696,1.7626,1.2489,1.2445,1.255,1.2539
NICE_l3_hs[5],1.1112,1.0973,1.2804,1.2787,1.2195,1.2152,1.4626,1.1965,1.4853,1.4845,1.4007,1.3993,1.8758,1.8688,1.3596,1.3552,1.3349,1.3338


**Results on the validation set:**

In [37]:
# Accuracy and ECE on the validation split, paired per model:
# column level 0 is the model, level 1 is the metric.
df = (
    pd.concat(
        [
            pd.DataFrame.from_dict(acc_val_results, orient='columns'),
            pd.DataFrame.from_dict(ece_val_results, orient='columns'),
        ],
        axis=1,
        keys=['ACC', 'ECE'],
    )
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
)

# Columns follow `models`; rows list the baselines first, then the NICE variants.
df = df[models].loc[list(calibrators) + names]

# Bold the best value per column: highest accuracy, lowest ECE.
(
    df.style
    .set_properties(**{'text-align': 'center'})
    .format("{:.2%}")
    .set_caption('CIFAR100 Validation set')
    .apply(highlight_max, subset=[(name, 'ACC') for name in df.columns.levels[0]])
    .apply(highlight_min, subset=[(name, 'ECE') for name in df.columns.levels[0]])
    .set_table_styles([
        {"selector": "th", "props": [("text-align", "center")]},
        {"selector": "caption", "props": [("text-align", "center"),
                                          ("font-size", "200%"),
                                          ("color", "black")]},
    ])
)

Unnamed: 0_level_0,wide-resnet-28x10,wide-resnet-28x10,densenet-121,densenet-121,densenet-169,densenet-169,resnet-101,resnet-101,vgg-19,vgg-19,preactresnet-18,preactresnet-18,preactresnet-164,preactresnet-164,resnext-29_8x16,resnext-29_8x16,wide-resnet-40x10,wide-resnet-40x10
Unnamed: 0_level_1,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE,ACC,ECE
Uncalibrated,79.94%,5.31%,78.26%,8.88%,78.68%,8.94%,72.18%,11.07%,71.62%,18.97%,75.62%,10.99%,73.36%,15.73%,77.50%,10.18%,76.60%,14.82%
Temp-Scaling,79.94%,4.85%,78.26%,4.18%,78.68%,3.40%,72.18%,2.66%,71.62%,5.41%,75.62%,3.03%,73.36%,1.80%,77.50%,2.92%,76.60%,3.73%
MLR,80.64%,4.37%,78.40%,3.96%,78.82%,3.71%,72.50%,2.08%,71.88%,4.84%,76.02%,2.99%,73.72%,1.90%,78.06%,2.80%,78.26%,3.95%
NICE_l2_hs[2],82.52%,3.31%,79.72%,5.23%,80.40%,5.01%,74.48%,7.07%,72.94%,11.69%,77.42%,7.12%,76.02%,11.07%,79.30%,6.60%,79.46%,8.49%
NICE_l2_hs[5],84.88%,2.91%,82.50%,2.10%,83.28%,2.14%,77.68%,2.79%,74.54%,5.98%,80.58%,3.01%,78.72%,6.55%,82.80%,2.46%,82.62%,3.93%
NICE_l3_hs[5],87.02%,3.81%,84.96%,1.93%,86.58%,2.73%,79.72%,1.42%,75.64%,3.42%,83.50%,2.16%,81.30%,3.72%,86.02%,1.68%,84.60%,1.84%
