# Benchmark NICE as a calibration method

In [34]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
%autoreload 1

In [36]:
import os
import sys
import time
import importlib
import collections
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.isotonic import IsotonicRegression
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input

%aimport utils
%aimport utils.ops
%aimport utils.metrics
%aimport utils.visualization
%aimport utils.data
%aimport flows.nice
%aimport calibrators
from utils.ops import onehot_encode, optim_temperature, detection_log_likelihood_ratios
from utils.metrics import neg_log_likelihood, accuracy, expected_calibration_error
from utils.visualization import plot_pdf_simplex, plot_prob_simplex, reliability_plot, ECE_plot
from utils.data import get_cifar10
from flows.nice import NiceFlow
from calibrators import PAVCalibrator, NiceCalibrator, TempScalingCalibrator, MLRCalibrator, MatrixScalingCalibrator, DummyCalibrator

## CIFAR-100

In [37]:
# Architectures whose saved CIFAR-100 predictions are benchmarked below.
# The order here is also the row order used when the result tables are
# displayed (they are indexed with `df.loc[models]`).
models = [
    'wide-resnet-28x10', 'densenet-121', 'densenet-169',
    'resnet-101', 'vgg-19', 'preactresnet-18',
    'preactresnet-164', 'resnext-29_8x16', 'wide-resnet-40x10',
]

In [38]:
def score(calibrator, logits, target):
    """Score a fitted calibrator on one data split.

    Parameters
    ----------
    calibrator : object exposing ``predict(logits)``.
    logits : model outputs handed to ``calibrator.predict``.
    target : ground-truth labels, forwarded unchanged to the metric helpers.

    Returns
    -------
    dict
        Keys ``'NLL'``, ``'ECE'`` (computed with 15 bins) and ``'Accuracy'``.
    """
    calibrated = calibrator.predict(logits)

    # Metrics are evaluated in the order NLL -> ECE -> accuracy.
    return {
        'NLL': neg_log_likelihood(calibrated, target),
        'ECE': expected_calibration_error(calibrated, target, bins=15),
        'Accuracy': accuracy(calibrated, target),
    }

In [39]:
def train_calibrator(Calibrator, logits, target):
    """Build a calibrator by calling ``Calibrator(logits, target)``.

    Parameters
    ----------
    Calibrator : callable (typically a calibrator class) taking
        ``(logits, target)``.
    logits, target : passed straight through to ``Calibrator``.

    Returns
    -------
    Whatever ``Calibrator(logits, target)`` returns.
    """
    return Calibrator(logits, target)

In [40]:
def _fit_and_score(fit, logits, target, test_logits, test_target):
    """Time a single ``fit()`` call and score the resulting calibrator on both splits."""
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    fitted = fit()
    elapsed = time.perf_counter() - start
    return {'Training time': elapsed,
            'Validation': score(fitted, logits, target),
            'Test': score(fitted, test_logits, test_target)}


def train_and_evaluate_calibrators(logits, target, test_logits, test_target, calibrators,
                                   nice_kwargs=None):
    """Fit every calibrator, plus a NICE-flow calibrator, and score each one.

    Each calibrator is fitted on ``(logits, target)``. Note that the
    ``'Validation'`` scores are computed on that same data, so they measure
    fit on the calibration set; only ``'Test'`` uses data not seen in fitting.

    Parameters
    ----------
    logits, target : data used to fit the calibrators and to compute the
        ``'Validation'`` scores.
    test_logits, test_target : data used to compute the ``'Test'`` scores.
    calibrators : mapping ``name -> Calibrator`` where ``Calibrator`` is
        called as ``Calibrator(logits, target)`` (via ``train_calibrator``).
    nice_kwargs : dict, optional
        Keyword arguments for ``NiceCalibrator``. Defaults to
        ``layers=4, hidden_size=[100, 100], epochs=500``.

    Returns
    -------
    dict
        ``name -> {'Training time': seconds, 'Validation': metrics,
        'Test': metrics}``, with the NICE entry stored under ``'NICE-flow'``
        (an entry of that name in ``calibrators`` would be overwritten).
    """
    if nice_kwargs is None:
        nice_kwargs = {'layers': 4, 'hidden_size': [100, 100], 'epochs': 500}

    results = {}
    for cal, Calibrator in calibrators.items():
        # The lambda is invoked immediately, so late binding of `Calibrator`
        # across loop iterations is not a concern.
        results[cal] = _fit_and_score(
            lambda: train_calibrator(Calibrator, logits, target),
            logits, target, test_logits, test_target)

    # Add NICE
    results['NICE-flow'] = _fit_and_score(
        lambda: NiceCalibrator(logits, target, **nice_kwargs),
        logits, target, test_logits, test_target)

    return results

In [41]:
# Baseline calibrators to benchmark; the NICE-flow calibrator is added
# inside train_and_evaluate_calibrators.
calibrators = {
    'Uncalibrated': DummyCalibrator,
    'Temp-Scaling': TempScalingCalibrator,
    'MLR': MLRCalibrator,
}
results = collections.OrderedDict()

for model in models:
    # Files for one network live in ../data/<model>_cifar100/ and share the
    # 'cifar100_<model>' filename prefix.
    data_path = os.path.join('../data', model+'_cifar100')
    prefix = os.path.join(data_path, 'cifar100_'+model)

    # Load order: validation logits, test logits, validation labels, test labels.
    logits, test_logits = (
        np.load(prefix + suffix)
        for suffix in ('_logit_prediction_valid.npy', '_logit_prediction_test.npy')
    )
    target, test_target = (
        np.load(prefix + suffix)
        for suffix in ('_true_valid.npy', '_true_test.npy')
    )

    results[model] = train_and_evaluate_calibrators(
        logits, target, test_logits, test_target, calibrators)
    

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [57]:
# One {model: {calibrator: value}} table per metric and data split.
ece_val_results, ece_test_results = {}, {}
acc_val_results, acc_test_results = {}, {}
nll_val_results, nll_test_results = {}, {}

for model, model_results in results.items():
    # Give every table an empty row for this model before filling it in.
    for table in (ece_val_results, ece_test_results,
                  acc_val_results, acc_test_results,
                  nll_val_results, nll_test_results):
        table[model] = {}

    for cal, cal_results in model_results.items():
        test_scores = cal_results['Test']
        val_scores = cal_results['Validation']

        ece_test_results[model][cal] = test_scores['ECE']
        ece_val_results[model][cal] = val_scores['ECE']

        acc_test_results[model][cal] = test_scores['Accuracy']
        acc_val_results[model][cal] = val_scores['Accuracy']

        nll_test_results[model][cal] = test_scores['NLL']
        nll_val_results[model][cal] = val_scores['NLL']

Unnamed: 0,Uncalibrated,Temp-Scaling,MLR,NICE-flow
densenet-121,0.08724,0.035233,0.035649,0.188339
densenet-169,0.088833,0.037745,0.038213,0.205596
preactresnet-164,0.157535,0.020735,0.0192,0.327908
preactresnet-18,0.107804,0.030961,0.033565,0.214869
resnet-101,0.114132,0.015058,0.021318,0.262174
resnext-29_8x16,0.096784,0.028191,0.027438,0.15516
vgg-19,0.176313,0.048083,0.039412,0.289014
wide-resnet-28x10,0.04853,0.042847,0.042288,0.225491
wide-resnet-40x10,0.147669,0.037656,0.039919,0.245969


In [59]:
# Test-split summary: one row per model, columns grouped by calibrator and
# then by metric (ACC / ECE / NLL).
metric_frames = [
    pd.DataFrame.from_dict(acc_test_results, orient='index'),
    pd.DataFrame.from_dict(ece_test_results, orient='index'),
    pd.DataFrame.from_dict(nll_test_results, orient='index'),
]
df = pd.concat(metric_frames, axis=1, keys=['ACC', 'ECE', 'NLL'])
# Move the calibrator name to the outer column level, then sort the columns.
df = df.swaplevel(0, 1, axis=1).sort_index(axis=1)

# Restore the calibrator order used in the benchmark, with NICE last.
df = df[[*calibrators.keys(), 'NICE-flow']]
df.loc[models]

Unnamed: 0_level_0,Uncalibrated,Uncalibrated,Uncalibrated,Temp-Scaling,Temp-Scaling,Temp-Scaling,MLR,MLR,MLR,NICE-flow,NICE-flow,NICE-flow
Unnamed: 0_level_1,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL
wide-resnet-28x10,0.8039,0.04853,0.817315,0.8039,0.042847,0.813448,0.8041,0.042288,0.801043,0.7369,0.225491,2.999516
densenet-121,0.788,0.08724,0.893708,0.788,0.035233,0.83548,0.7859,0.035649,0.837193,0.729,0.188339,1.951977
densenet-169,0.7905,0.088833,0.874644,0.7905,0.037745,0.815562,0.7886,0.038213,0.81876,0.7231,0.205596,2.161338
resnet-101,0.72,0.114132,1.1341,0.72,0.015058,1.000672,0.7196,0.021318,1.002505,0.6287,0.262174,2.674223
vgg-19,0.727,0.176313,1.540447,0.727,0.048083,1.199652,0.7245,0.039412,1.208966,0.6473,0.289014,3.622139
preactresnet-18,0.766,0.107804,1.057333,0.766,0.030961,0.90792,0.7627,0.033565,0.915122,0.6873,0.214869,2.179449
preactresnet-164,0.7328,0.157535,1.348495,0.7328,0.020735,0.974927,0.7311,0.0192,0.981766,0.6088,0.327908,4.106069
resnext-29_8x16,0.7788,0.096784,0.938889,0.7788,0.028191,0.822031,0.7754,0.027438,0.824304,0.7264,0.15516,1.44768
wide-resnet-40x10,0.7674,0.147669,1.220958,0.7674,0.037656,0.905459,0.7733,0.039919,0.869139,0.7122,0.245969,3.180873


In [60]:
# Validation-split summary: one row per model, columns grouped by calibrator
# and then by metric (ACC / ECE / NLL).
metric_frames = [
    pd.DataFrame.from_dict(acc_val_results, orient='index'),
    pd.DataFrame.from_dict(ece_val_results, orient='index'),
    pd.DataFrame.from_dict(nll_val_results, orient='index'),
]
df = pd.concat(metric_frames, axis=1, keys=['ACC', 'ECE', 'NLL'])
# Move the calibrator name to the outer column level, then sort the columns.
df = df.swaplevel(0, 1, axis=1).sort_index(axis=1)

# Restore the calibrator order used in the benchmark, with NICE last.
df = df[[*calibrators.keys(), 'NICE-flow']]
df.loc[models]

Unnamed: 0_level_0,Uncalibrated,Uncalibrated,Uncalibrated,Temp-Scaling,Temp-Scaling,Temp-Scaling,MLR,MLR,MLR,NICE-flow,NICE-flow,NICE-flow
Unnamed: 0_level_1,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL,ACC,ECE,NLL
wide-resnet-28x10,0.7994,0.053124,0.819501,0.7994,0.048499,0.815698,0.8064,0.04374,0.778041,1.0,0.0,-1.16539e-07
densenet-121,0.7826,0.088754,0.940488,0.7826,0.041791,0.873531,0.784,0.039584,0.852308,1.0,0.00174,0.001746435
densenet-169,0.7868,0.089411,0.910888,0.7868,0.03401,0.846234,0.7882,0.037062,0.825119,1.0,0.000374,0.0003744004
resnet-101,0.7218,0.110705,1.134827,0.7218,0.026589,1.004311,0.725,0.020845,0.984427,0.9998,0.000728,0.004149777
vgg-19,0.7162,0.189706,1.5726,0.7162,0.054127,1.213497,0.7188,0.048368,1.195871,1.0,2.2e-05,2.192059e-05
preactresnet-18,0.7562,0.109858,1.089232,0.7562,0.030258,0.936548,0.7602,0.02989,0.920447,0.9996,0.001202,0.007675523
preactresnet-164,0.7336,0.157297,1.376296,0.7336,0.017966,0.991342,0.7372,0.01897,0.977262,0.9984,0.001546,0.02578883
resnext-29_8x16,0.775,0.101819,0.981964,0.775,0.029229,0.852484,0.7806,0.028018,0.829799,0.9932,0.027894,0.0530339
wide-resnet-40x10,0.766,0.148228,1.260772,0.766,0.037252,0.935867,0.7826,0.039497,0.867656,0.9996,0.0004,0.006447119
