## 1. Import Libraries

In [2]:
import os

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

import numpy as np
from easydict import EasyDict as eDict

import matplotlib.pyplot as plt

%matplotlib inline

from temperature_scaling import ModelWithTemperature

import warnings

warnings.filterwarnings("ignore")

## 2. Load validation features

In [16]:
# Pick the validation-feature archive to calibrate on by enabling exactly one
# of the loads below. With all of them commented out, `data` is undefined and
# this cell fails on a fresh kernel.
# NOTE(review): the ImageNet archive is enabled because the figures saved in
# the Results section are named "image_net_*" -- confirm this is the intended
# dataset.
# data = np.load("./data/coco_object_validation_feature_data.npz")
# data = np.load("./data/cub_validation_feature_data.npz")
data = np.load("./data/image_net_validation_feature_data.npz")
# data = np.load("./data/tf_image_net_validation_feature_data.npz")
features = torch.Tensor(data["features"])
labels = torch.LongTensor(data["labels"])

In [17]:
class FeatureDataset(Dataset):
    """Map-style dataset that pairs each stored feature row with its label.

    Args:
        features: tensor indexed along its first dimension, one row per sample.
        labels: per-sample labels, indexed with the same indices as `features`.
    """

    def __init__(self, features, labels):
        super().__init__()
        self.features, self.labels = features, labels

    def __len__(self):
        # The sample count is the size of the first dimension of `features`.
        return self.features.size(0)

    def __getitem__(self, idx):
        # Return the (feature, label) pair stored at position `idx`.
        return self.features[idx], self.labels[idx]


class InceptionModel(nn.Module):
    """Parameter-free pass-through module.

    `forward` hands its input back unchanged, so whatever is stored in the
    feature tensors is what downstream code receives as the model output.
    NOTE(review): presumably the stored features are pre-computed network
    outputs (logits) -- confirm against the script that produced the .npz files.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return `x` untouched."""
        return x

## 3. Confidence Calibration 

In [18]:
# Wrap the loaded feature/label tensors so they can be served in batches.
val_dataset = FeatureDataset(features, labels)
# This loader is only used for calibration and evaluation, so keep a fixed
# sample order: with shuffle=True and no seed the batch composition changed on
# every run, which made results not bit-for-bit reproducible.
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [66]:
# InceptionModel.forward returns its input unchanged, so the stored features
# reach the temperature-scaling wrapper as-is. Module.eval() returns the module
# itself, which allows construction and mode switch in one statement.
model = InceptionModel().eval()

# Wrap the model with a temperature starting at 0.23, move the wrapper to the
# GPU, and fit the temperature on the validation loader.
scaled_model = ModelWithTemperature(model, init_temp=0.23)
scaled_model.cuda()
# Kept as the last expression so the notebook still displays the returned object.
scaled_model.set_temperature(val_loader)

Before temperature - NLL: 0.332, ECE: 0.255
Optimal temperature: 0.2188338041305542
After temperature - NLL: 0.0706715062, ECE: 0.0036738028


ModelWithTemperature(
  (model): InceptionModel()
)

## 4. Reliability Diagram

In [67]:
def get_output(model, val_loader):
    """
    Run `model` over every batch of `val_loader` and collect its top-1 outputs.

    A softmax over the last dimension is applied to the model output; the
    largest probability per sample is the confidence and its index is the
    predicted class.

    Args:
        model (callable): maps a batch of features to per-class scores.
        val_loader (iterable): yields `(features, labels)` tensor batches.

    Returns:
        (y_confs, y_preds, y_trues): three 1-D numpy arrays, in loader order,
        holding the confidence, predicted class and true label of each sample.
        All three are empty when the loader yields no batches.
    """
    # Run on the device that holds the model's parameters. A parameter-free
    # model (or a plain callable) falls back to CUDA when available, else CPU,
    # instead of unconditionally requiring a GPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    conf_batches, pred_batches, true_batches = [], [], []

    # Inference only: without no_grad a model with trainable parameters (e.g. a
    # learned temperature) would build and retain an autograd graph per batch.
    with torch.no_grad():
        for features, y_true in val_loader:
            y_prob = F.softmax(model(features.to(device)), -1)
            y_conf, y_pred = torch.max(y_prob, 1)
            conf_batches.append(y_conf)
            pred_batches.append(y_pred)
            true_batches.append(y_true)

    if not conf_batches:
        # Same dtypes the populated path produces for float32 scores / int64 labels.
        return (
            np.empty(0, dtype=np.float32),
            np.empty(0, dtype=np.int64),
            np.empty(0, dtype=np.int64),
        )

    # Concatenate once at the end rather than re-copying the running result on
    # every batch.
    y_confs = torch.cat(conf_batches, 0).cpu().numpy()
    y_preds = torch.cat(pred_batches, 0).cpu().numpy()
    y_trues = torch.cat(true_batches, 0).cpu().numpy()

    return y_confs, y_preds, y_trues

In [68]:
def compute_acc_bin(conf_thresh_lower, conf_thresh_upper, conf, pred, true):
    """
    Accuracy and mean confidence of the samples that fall into one bin.

    A sample belongs to the bin when its confidence lies in the half-open
    interval (conf_thresh_lower, conf_thresh_upper].

    Args:
        conf_thresh_lower (float): exclusive lower edge of the bin
        conf_thresh_upper (float): inclusive upper edge of the bin
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels

    Returns:
        (accuracy, avg_conf, len_bin): accuracy of the bin, mean confidence of
        the bin and number of samples in the bin; (0, 0, 0) for an empty bin.
    """
    # Keep only the (prediction, label, confidence) triples inside the bin.
    in_bin = [
        (p, t, c)
        for p, t, c in zip(pred, true, conf)
        if c > conf_thresh_lower and c <= conf_thresh_upper
    ]

    len_bin = len(in_bin)
    if len_bin < 1:
        return 0, 0, 0

    hits = sum(1 for p, t, _ in in_bin if p == t)  # correctly classified samples
    avg_conf = sum([c for _, _, c in in_bin]) / len_bin  # mean confidence of the bin
    accuracy = float(hits) / len_bin  # fraction of the bin that is correct
    return accuracy, avg_conf, len_bin

In [69]:
def ECE(conf, pred, true, bin_size=0.1):

    """
    Expected Calibration Error

    The confidence axis is cut into consecutive bins of width `bin_size`
    starting at 0. For every bin the absolute gap between accuracy and mean
    confidence is weighted by the share of samples in that bin, and the
    weighted gaps are summed.

    Args:
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels
        bin_size: (float): size of one bin (0,1)

    Returns:
        ece: expected calibration error as a fraction (not a percentage)

    Raises:
        ValueError: if `bin_size` is not positive.
    """

    if bin_size <= 0:
        raise ValueError("bin_size must be positive")

    # Derive the edges from an integer bin count. np.arange with a float step
    # has an unreliable length, and `upper - bin_size` does not always equal the
    # previous upper bound, so a sample could be dropped or counted twice.
    n_bins = int(np.ceil(np.round(1.0 / bin_size, 8)))
    edges = bin_size * np.arange(n_bins + 1)
    # Rounding can leave the last edge a hair below 1; make sure a confidence of
    # exactly 1.0 still lands in the last bin.
    edges[-1] = max(edges[-1], 1.0)

    n = len(conf)
    ece = 0  # Starting error

    # Consecutive edges share the same value, so the (lower, upper] bins tile the
    # range without gaps or overlaps.
    for lower, upper in zip(edges[:-1], edges[1:]):
        acc, avg_conf, len_bin = compute_acc_bin(lower, upper, conf, pred, true)
        ece += np.abs(acc - avg_conf) * len_bin / n  # Add weighted difference to ECE

    return ece

In [70]:
def get_bin_info(conf, pred, true, bin_size=0.1):

    """
    Get accuracy, confidence and elements in bin information for all the bins.

    The confidence axis is cut into consecutive bins of width `bin_size`
    starting at 0; the returned lists hold one entry per bin, in ascending order
    of confidence.

    Args:
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels
        bin_size: (float): size of one bin (0,1)

    Returns:
        (acc, conf, len_bins): tuple containing all the necessary info for reliability diagrams.

    Raises:
        ValueError: if `bin_size` is not positive.
    """

    if bin_size <= 0:
        raise ValueError("bin_size must be positive")

    # Derive the edges from an integer bin count. np.arange with a float step
    # has an unreliable length (which would break plotting against a fixed
    # number of bars), and `upper - bin_size` does not always equal the previous
    # upper bound.
    n_bins = int(np.ceil(np.round(1.0 / bin_size, 8)))
    edges = bin_size * np.arange(n_bins + 1)
    # Rounding can leave the last edge a hair below 1; make sure a confidence of
    # exactly 1.0 still lands in the last bin.
    edges[-1] = max(edges[-1], 1.0)

    accuracies = []
    confidences = []
    bin_lengths = []

    for lower, upper in zip(edges[:-1], edges[1:]):
        acc, avg_conf, len_bin = compute_acc_bin(lower, upper, conf, pred, true)
        accuracies.append(acc)
        confidences.append(avg_conf)
        bin_lengths.append(len_bin)

    return accuracies, confidences, bin_lengths

In [71]:
def reliability_diagram(accs, confs, ece, M=15, name=""):
    """
    Draw a reliability diagram: per-bin accuracy bars with the gap to the
    per-bin mean confidence shaded in red, plus the diagonal of perfect
    calibration.

    Args:
        accs (sequence of float): accuracy of each bin, `M` entries.
        confs (sequence of float): mean confidence of each bin, `M` entries.
        ece (float): value shown in the text box; it is printed followed by
            "%", so pass it already scaled to a percentage.
        M (int): number of equal-width bins covering [0, 1].
        name (str): file stem; the figure is saved as "<name>.pdf". When empty,
            nothing is written (previously a file literally named ".pdf" was
            created).

    Raises:
        ValueError: if `accs` or `confs` does not contain exactly `M` entries.
    """
    accs = np.asarray(accs, dtype=float)
    confs = np.asarray(confs, dtype=float)
    if len(accs) != M or len(confs) != M:
        raise ValueError(
            "accs and confs must each contain M={} entries, got {} and {}".format(M, len(accs), len(confs))
        )

    plt.figure(figsize=(10, 10))
    ax = plt.axes()

    bin_size = 1 / M
    # Bin centres built from integer indices: always exactly M values. A
    # float-step np.arange can produce one element too many or too few.
    positions = (np.arange(M) + 0.5) * bin_size

    # Positive part of the gap in each direction; the other direction is zeroed
    # so that only one of the two shaded bars is visible per bin.
    gap_below_diagonal = np.clip(confs - accs, 0, None)
    gap_above_diagonal = np.clip(accs - confs, 0, None)

    # Bars with outputs
    accs_plt = ax.bar(positions, accs, width=bin_size, edgecolor="black", color="#00A4CCFF", label="Outputs", zorder=2)
    # Drawn fully transparent (alpha=0.0); kept so the set of artists is unchanged.
    ax.bar(positions, confs, width=bin_size, edgecolor="red", color="#F95700FF", alpha=0.0, linewidth=2, zorder=3)
    gap_above_diagonal_plt = ax.bar(
        positions,
        gap_above_diagonal,
        width=bin_size,
        edgecolor="red",
        color="red",
        label="Gap",
        alpha=0.3,
        linewidth=2,
        bottom=confs,
        zorder=3,
    )
    ax.bar(
        positions,
        gap_below_diagonal,
        width=bin_size,
        edgecolor="red",
        color="red",
        alpha=0.3,
        linewidth=2,
        bottom=accs,
        zorder=3,
    )

    # Line plot with center line.
    ax.set_aspect("equal")
    ax.plot([0, 1], [0, 1], linestyle="--", linewidth=5)
    ax.legend(handles=[gap_above_diagonal_plt, accs_plt], prop={"size": 30})
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    plt.xticks(fontsize=23, rotation=90)
    plt.yticks(fontsize=23, rotation=0)
    props = dict(boxstyle="round", facecolor="wheat", alpha=0.8)
    ax.text(
        0.41,
        0.1,
        "ECE Error = {:2.2f}%".format(ece),
        transform=ax.transAxes,
        fontsize=30,
        verticalalignment="top",
        bbox=props,
    )
    ax.set_xlabel("Confidence", fontsize=30, color="black")
    ax.set_ylabel("Accuracy", fontsize=30, color="black")
    # Only write a file when a stem was actually supplied.
    if name:
        plt.savefig(f"{name}.pdf")

## 5. Results

In [72]:
M = 15  # number of bins; passed to reliability_diagram in the cells below
bin_size = 1 / M  # width of one bin; passed to ECE and get_bin_info

### Before calibration

In [None]:
# Uncalibrated model: collect confidences, predictions and labels.
y_confs, y_preds, y_trues = get_output(model, val_loader)
# ECE is returned as a fraction; scale to a percentage for the plot label.
ece = 100 * ECE(y_confs, y_preds, y_trues, bin_size=bin_size)
accs, confs, len_bins = get_bin_info(y_confs, y_preds, y_trues, bin_size=bin_size)
reliability_diagram(accs, confs, ece, M=M, name="image_net_before_calibration")

### After calibration

In [None]:
# Temperature-scaled model: same evaluation as for the uncalibrated model.
y_confs, y_preds, y_trues = get_output(scaled_model, val_loader)
# ECE is returned as a fraction; scale to a percentage for the plot label.
ece = 100 * ECE(y_confs, y_preds, y_trues, bin_size=bin_size)
accs, confs, len_bins = get_bin_info(y_confs, y_preds, y_trues, bin_size=bin_size)
reliability_diagram(accs, confs, ece, M=M, name="image_net_after_calibration")