# Benchmark NICE as calibration method

In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [7]:
import os
import sys
import time
import importlib
import collections
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.isotonic import IsotonicRegression
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input

%aimport utils
%aimport utils.ops
%aimport utils.metrics
%aimport utils.visualization
%aimport utils.data
%aimport flows.nice
%aimport calibrators
from utils.ops import onehot_encode, optim_temperature, detection_log_likelihood_ratios
from utils.metrics import neg_log_likelihood, accuracy, expected_calibration_error
from utils.visualization import plot_pdf_simplex, plot_prob_simplex, reliability_plot, ECE_plot
from utils.data import get_cifar10
from flows.nice import NiceFlow
from calibrators import PAVCalibrator, NiceCalibrator, TempScalingCalibrator, MLRCalibrator, MatrixScalingCalibrator

## CIFAR-100

In [8]:
# Pre-computed DenseNet-121 outputs on CIFAR-100, stored as .npy files.
# File names indicate these are logit predictions for the valid/test splits.
densenet_121_logits_val, densenet_121_logits_test = (
    np.load('../data/densenet-121_cifar100/cifar100_densenet-121_logit_prediction_valid.npy'),
    np.load('../data/densenet-121_cifar100/cifar100_densenet-121_logit_prediction_test.npy'),
)

# Matching ground-truth targets ("true" files) for the same two splits.
# NOTE(review): label encoding (one-hot vs. class index) is not visible here - confirm.
densenet_121_target_val, densenet_121_target_test = (
    np.load('../data/densenet-121_cifar100/cifar100_densenet-121_true_valid.npy'),
    np.load('../data/densenet-121_cifar100/cifar100_densenet-121_true_test.npy'),
)

In [18]:
### Uncalibrated model
# Softmax along axis 1 turns the raw network logits into probabilities.
val_probs = softmax(densenet_121_logits_val, axis=1)
test_probs = softmax(densenet_121_logits_test, axis=1)


### Temp-Scaling
# Every calibrator below is built from the validation logits/targets only and
# then applied to both the validation and the test logits.
temp_scaling_cal = TempScalingCalibrator(densenet_121_logits_val, densenet_121_target_val)
print("Calibrated using temperature T={:.3f}\n\n".format(temp_scaling_cal.T))

val_probs_temp = temp_scaling_cal.predict(densenet_121_logits_val)
test_probs_temp = temp_scaling_cal.predict(densenet_121_logits_test)

### MLR-Scaling
mlr_scaling_cal = MLRCalibrator(densenet_121_logits_val, densenet_121_target_val)

val_probs_mlr = mlr_scaling_cal.predict(densenet_121_logits_val)
test_probs_mlr = mlr_scaling_cal.predict(densenet_121_logits_test)


### NICE-flow
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# the 5000-epoch fit is presumably slow - consider caching or fewer epochs.
t0 = time.time()
nice_cal = NiceCalibrator(densenet_121_logits_val, densenet_121_target_val, 
                          layers=4, hidden_size=[100, 100], epochs=5000)
print('NICE flow fitted in {:.3f}s'.format(time.time()-t0))

# Plot training NLL
plt.plot(nice_cal.history.history['loss'])
plt.title('NICE-flow NLL')
plt.ylabel('NLL')
plt.xlabel('Epoch')
plt.show()

# Predict from the DenseNet-121 logits loaded above. The previous names
# `val_logits` / `test_logits` are not defined anywhere in this notebook and
# raise NameError on a fresh kernel.
val_probs_nice = nice_cal.predict(densenet_121_logits_val)
test_probs_nice = nice_cal.predict(densenet_121_logits_test)


Calibrated using temperature T=1.325


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


KeyboardInterrupt: 

In [None]:
def _calibration_metrics(probs, target, bins=15):
    """Return ``(nll, ece, acc)`` for predicted probabilities against targets.

    The three project metrics are evaluated in the order NLL, ECE, accuracy;
    ``bins`` is forwarded to ``expected_calibration_error``.
    """
    nll = neg_log_likelihood(probs, target)
    ece = expected_calibration_error(probs, target, bins=bins)
    acc = accuracy(probs, target)
    return nll, ece, acc


# Each method is scored on the validation split first, then on the test split.

## Uncalibrated model
val_nll, val_ece, val_acc = _calibration_metrics(val_probs, densenet_121_target_val)
test_nll, test_ece, test_acc = _calibration_metrics(test_probs, densenet_121_target_test)

## Temp-Scaling
val_nll_temp, val_ece_temp, val_acc_temp = _calibration_metrics(
    val_probs_temp, densenet_121_target_val)
test_nll_temp, test_ece_temp, test_acc_temp = _calibration_metrics(
    test_probs_temp, densenet_121_target_test)

## MLR
val_nll_mlr, val_ece_mlr, val_acc_mlr = _calibration_metrics(
    val_probs_mlr, densenet_121_target_val)
test_nll_mlr, test_ece_mlr, test_acc_mlr = _calibration_metrics(
    test_probs_mlr, densenet_121_target_test)

## NICE
val_nll_nice, val_ece_nice, val_acc_nice = _calibration_metrics(
    val_probs_nice, densenet_121_target_val)
test_nll_nice, test_ece_nice, test_acc_nice = _calibration_metrics(
    test_probs_nice, densenet_121_target_test)

In [None]:
def autolabel(rects, ax):
    """Annotate every bar in *rects* with its height, formatted to 3 decimals.

    Adapted from the matplotlib bar chart gallery example:
    'https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py'

    Args:
        rects: iterable of bar objects exposing ``get_height()``, ``get_x()``
            and ``get_width()``.
        ax: object whose ``annotate`` method receives one call per bar.
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Anchor the label at the horizontal centre of the bar's top edge.
        top_centre = (bar.get_x() + bar.get_width() / 2, bar_height)
        ax.annotate(
            '{:.3f}'.format(bar_height),
            xy=top_centre,
            xytext=(0, 3),  # nudge the text 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

In [None]:
# Side-by-side grouped bar charts comparing the four methods on the validation
# and test sets: accuracy on the left axes, ECE on the right axes.
fig, ax_arr = plt.subplots(1, 2, figsize=(17, 6))
fig.suptitle('Calibration of densenet-121 on CIFAR100', fontsize=16)

ticks = ['Uncalibrated', 'Temp-Scaling', 'MLR-Scaling', 'NICE-flow']

# One bar group per method. Derived from `ticks` so the bar positions always
# match the number of values and tick labels (a hard-coded 3 here made
# `bar` fail with a shape mismatch against the four plotted values).
ind = np.arange(len(ticks))
width = 0.35


# --- Accuracy panel (values scaled to percent) ---
validation = 100. * np.array([val_acc, val_acc_temp, val_acc_mlr, val_acc_nice])
test = 100. * np.array([test_acc, test_acc_temp, test_acc_mlr, test_acc_nice])

# Test bars are shifted by one bar width so each pair sits side by side.
rects1 = ax_arr[0].bar(ind, validation, width, color='b', label='Validation set')
rects2 = ax_arr[0].bar(ind+width, test, width, color='r', label='Test set')

ax_arr[0].set_ylabel('Accuracy')
ax_arr[0].set_title('Accuracy (%)')
ax_arr[0].set_xticks(ind + width / 2)  # centre the tick under each bar pair
ax_arr[0].set_xticklabels(ticks)
# 30% headroom above the tallest bar leaves space for the value labels.
ax_arr[0].set_ylim([0, max(validation.max(), test.max())*1.3])
ax_arr[0].legend()

autolabel(rects1, ax_arr[0])
autolabel(rects2, ax_arr[0])


# --- ECE panel (values scaled to percent) ---
validation = np.array([val_ece, val_ece_temp, val_ece_mlr, val_ece_nice])*100
test = np.array([test_ece, test_ece_temp, test_ece_mlr, test_ece_nice])*100

rects1 = ax_arr[1].bar(ind, validation, width, color='b', label='Validation set')
rects2 = ax_arr[1].bar(ind+width, test, width, color='r', label='Test set')

ax_arr[1].set_ylabel('ECE (%)')
ax_arr[1].set_title('Expected Calibration Error comparison')
ax_arr[1].set_xticks(ind + width / 2)
ax_arr[1].set_xticklabels(ticks)
ax_arr[1].set_ylim([0, max(validation.max(), test.max())*1.3])
ax_arr[1].legend()

autolabel(rects1, ax_arr[1])
autolabel(rects2, ax_arr[1])

plt.show()