# NICE for calibration: Comparison against other methods 

In [1]:
import os
import sys
sys.path.append('..')

import numpy as np
import tensorflow as tf
from scipy.special import softmax

from utils.data import get_cifar10, load_logits
from utils.ops import onehot_encode
from utils.metrics import neg_log_likelihood, expected_calibration_error
from calibrators import NiceCalibrator, TempScalingCalibrator

In [2]:
%matplotlib inline

### Load cifar10

In [3]:
# Relative path of the directory that is handed to get_cifar10 as the
# location of the CIFAR-10 data.
cifar_dir = '../cifar-10'

In [4]:
# Load the dataset from cifar_dir. `cifar10` is later indexed with
# 'test_labels'; `ix2label` is presumably an index -> class-name mapping
# (TODO confirm against utils.data.get_cifar10).
# NOTE(review): test=True presumably requests that the test split be loaded -- confirm.
cifar10, ix2label = get_cifar10(cifar_dir, test=True)

To compare the calibration capabilities of each method, we calibrate the model on one subset of the test set (the first 5,000 samples, used as a validation set) and evaluate it on the remaining samples. We then compare the results obtained with each calibration method.

In [5]:
# Encode the CIFAR-10 test labels with onehot_encode.
target = onehot_encode(cifar10['test_labels'])

# Validation/test split: rows before index 5000 are used to fit the
# calibrators, the remaining rows are held out for evaluation.
val_target, test_target = target[:5000, :], target[5000:, :]

### Load precomputed logits

In [6]:
# Name of the pretrained model whose precomputed logits are loaded; it is
# joined onto models_dir to form the path passed to load_logits.
model = 'resnet32_v1'
# Relative path of the directory that holds the pretrained models.
models_dir = '../pretrained-models'

In [7]:
# load_logits returns a pair; only its second element is used in this notebook.
# NOTE(review): presumably these are the logits for the CIFAR-10 test set -- confirm.
_, logits = load_logits(os.path.join(models_dir, model))

# The logits are split by row position exactly like `target`, so both arrays
# must have the same number of rows. Fail early with a clear message instead
# of letting a misaligned split reach the metric computations.
assert logits.shape[0] == target.shape[0], (
    'Number of logits rows ({}) does not match number of targets ({})'.format(
        logits.shape[0], target.shape[0]))

# val/test split (same 5000-row boundary as the target split)
val_logits = logits[:5000, :]
test_logits = logits[5000:, :]

## Calibrating the model

### Evaluating the uncalibrated model:
Negative log-likelihood (NLL) and expected calibration error (ECE) on the validation and test sets.

In [8]:
# Uncalibrated baseline: a plain softmax over axis 1 of each split's logits.
val_probs = softmax(val_logits, axis=1)
test_probs = softmax(test_logits, axis=1)

# Compute NLL and ECE (15 bins) for both splits up front ...
val_nll = neg_log_likelihood(val_probs, val_target)
val_ece = expected_calibration_error(val_probs, val_target, bins=15)
test_nll = neg_log_likelihood(test_probs, test_target)
test_ece = expected_calibration_error(test_probs, test_target, bins=15)

# ... then report them: validation set first, a blank separator, then test set.
print(
    'Negative log-likelihood of the uncalibrated model on the validation set: {:.3f}'.format(val_nll),
    'Expected calibration error of the uncalibrated model on the validation set: {:.5f}'.format(val_ece),
    sep='\n',
)

print('\n')

print(
    'Negative log-likelihood of the uncalibrated model on the test set: {:.3f}'.format(test_nll),
    'Expected calibration error of the uncalibrated model on the test set: {:.5f}'.format(test_ece),
    sep='\n',
)

Negative log-likelihood of the uncalibrated model on the validation set: 0.277
Expected calibration error of the uncalibrated model on the validation set: 0.00788


Negative log-likelihood of the uncalibrated model on the test set: 0.258
Expected calibration error of the uncalibrated model on the test set: 0.00711


### Temperature Scaling calibration:

In [9]:
# Build the temperature-scaling calibrator from the validation logits/targets.
# The attribute `T` is read right after construction, so the constructor is
# what sets it. NOTE(review): presumably T is fitted on the validation data
# inside the constructor -- confirm in calibrators.TempScalingCalibrator.
temp_scaling_cal = TempScalingCalibrator(val_logits, val_target)
print("Calibrated using temperature T={:.3f}".format(temp_scaling_cal.T))

Calibrated using temperature T=1.765


Evaluating calibration:

In [10]:
# Outputs of the temperature-scaling calibrator's predict() for both splits.
val_probs_temp = temp_scaling_cal.predict(val_logits)
test_probs_temp = temp_scaling_cal.predict(test_logits)

# Compute NLL and ECE (15 bins) for both splits up front ...
val_nll_temp = neg_log_likelihood(val_probs_temp, val_target)
val_ece_temp = expected_calibration_error(val_probs_temp, val_target, bins=15)
test_nll_temp = neg_log_likelihood(test_probs_temp, test_target)
test_ece_temp = expected_calibration_error(test_probs_temp, test_target, bins=15)

# ... then report them: validation set first, a blank separator, then test set.
print(
    'Negative log-likelihood on the validation set after temp-scaling calibration: {:.3f}'.format(val_nll_temp),
    'Expected calibration error on the validation set after temp-scaling calibration: {:.5f}'.format(val_ece_temp),
    sep='\n',
)

print('\n')

print(
    'Negative log-likelihood on the test set after temp-scaling calibration: {:.3f}'.format(test_nll_temp),
    'Expected calibration error on the test set after temp-scaling calibration: {:.5f}'.format(test_ece_temp),
    sep='\n',
)

Negative log-likelihood on the validation set after temp-scaling calibration: 0.228
Expected calibration error on the validation set after temp-scaling calibration: 0.00130


Negative log-likelihood on the test set after temp-scaling calibration: 0.214
Expected calibration error on the test set after temp-scaling calibration: 0.00150


### NICE flow calibration:

In [11]:
# Build the NICE-flow calibrator from the validation logits/targets.
# NOTE(review): presumably the constructor also trains the flow (the
# TensorFlow warnings are emitted by this cell) -- confirm. The exact meaning
# of `layers` and `hidden_size` is defined by calibrators.NiceCalibrator.
nice_cal = NiceCalibrator(val_logits, val_target, layers=4, hidden_size=[10, 10])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Evaluating calibration:

In [12]:
# Outputs of the NICE calibrator's predict() for both splits.
val_probs_nice = nice_cal.predict(val_logits)
test_probs_nice = nice_cal.predict(test_logits)

# Compute NLL and ECE (15 bins) for both splits up front ...
val_nll_nice = neg_log_likelihood(val_probs_nice, val_target)
val_ece_nice = expected_calibration_error(val_probs_nice, val_target, bins=15)
test_nll_nice = neg_log_likelihood(test_probs_nice, test_target)
test_ece_nice = expected_calibration_error(test_probs_nice, test_target, bins=15)

# ... then report them: validation set first, a blank separator, then test set.
print(
    'Negative log-likelihood on the validation set after NICE calibration: {:.3f}'.format(val_nll_nice),
    'Expected calibration error on the validation set after NICE calibration: {:.5f}'.format(val_ece_nice),
    sep='\n',
)

print('\n')

print(
    'Negative log-likelihood on the test set after NICE calibration: {:.3f}'.format(test_nll_nice),
    'Expected calibration error on the test set after NICE calibration: {:.5f}'.format(test_ece_nice),
    sep='\n',
)

Negative log-likelihood on the validation set after NICE calibration: 0.092
Expected calibration error on the validation set after NICE calibration: 0.00306


Negative log-likelihood on the test set after NICE calibration: 0.477
Expected calibration error on the test set after NICE calibration: 0.01186
