In [None]:
#File for evaluating all models before and after temperature scaling
#Listed weights files are not included
import os
import time

# Select the backend before anything imports Keras.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow import keras
from keras import Input, Model
from keras import regularizers
import keras.backend as K
from keras.activations import softmax as keras_softmax
from keras.datasets import cifar10,cifar100
from keras.layers import (add,
                          Conv2D, GlobalAveragePooling2D)
from keras.layers import Dense, Activation, BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils
from sklearn.metrics import log_loss, brier_score_loss
from sklearn.model_selection import train_test_split

In [2]:
def compute_acc_bin(conf_thresh_lower, conf_thresh_upper, conf, pred, true):
    """
    Accuracy and mean confidence of the samples that fall into one confidence bin.

    A sample belongs to the bin when its confidence lies in the half-open
    interval (conf_thresh_lower, conf_thresh_upper].

    Args:
        conf_thresh_lower (float): exclusive lower edge of the bin
        conf_thresh_upper (float): inclusive upper edge of the bin
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels

    Returns:
        (accuracy, avg_conf, len_bin): accuracy of the bin, mean confidence of the
        bin and number of samples in it; (0, 0, 0) when no sample falls in the bin.
    """
    bin_confidences = []
    n_correct = 0

    for predicted, actual, confidence in zip(pred, true, conf):
        if confidence > conf_thresh_lower and confidence <= conf_thresh_upper:
            bin_confidences.append(confidence)
            if predicted == actual:
                n_correct += 1

    len_bin = len(bin_confidences)
    if len_bin < 1:
        return 0, 0, 0

    accuracy = float(n_correct) / len_bin
    avg_conf = sum(bin_confidences) / len_bin
    return accuracy, avg_conf, len_bin


def ECE(conf, pred, true, bin_size=0.1):
    """
    Expected Calibration Error.

    Sums, over equally wide confidence bins, the absolute gap between bin
    accuracy and bin confidence, each weighted by the share of samples in the bin.

    Args:
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels
        bin_size: (float): width of one bin, in (0, 1)

    Returns:
        ece: expected calibration error
    """
    bin_uppers = np.arange(bin_size, 1 + bin_size, bin_size)  # upper edge of every bin
    n_samples = len(conf)

    total = 0
    for upper in bin_uppers:
        bin_acc, bin_conf, bin_count = compute_acc_bin(upper - bin_size, upper, conf, pred, true)
        gap = np.abs(bin_acc - bin_conf)
        total = total + gap * bin_count / n_samples  # gap weighted by the bin's share of samples

    return total


def MCE(conf, pred, true, bin_size=0.1):
    """
    Maximal Calibration Error.

    The largest absolute gap between accuracy and confidence over all
    equally wide confidence bins (empty bins contribute a gap of 0).

    Args:
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels
        bin_size: (float): width of one bin, in (0, 1)

    Returns:
        mce: maximum calibration error
    """

    def bin_gap(upper):
        # |accuracy - confidence| for the bin ending at `upper`
        stats = compute_acc_bin(upper - bin_size, upper, conf, pred, true)
        return np.abs(stats[0] - stats[1])

    gaps = [bin_gap(upper) for upper in np.arange(bin_size, 1 + bin_size, bin_size)]
    return max(gaps)


def get_bin_info(conf, pred, true, bin_size=0.1):
    """
    Per-bin accuracy, confidence and sample count, e.g. for reliability diagrams.

    Args:
        conf (numpy.ndarray): list of confidences
        pred (numpy.ndarray): list of predictions
        true (numpy.ndarray): list of true labels
        bin_size: (float): width of one bin, in (0, 1)

    Returns:
        (acc, conf, len_bins): three lists with one entry per bin, ordered from
        the lowest-confidence bin to the highest.
    """
    per_bin = [
        compute_acc_bin(upper - bin_size, upper, conf, pred, true)
        for upper in np.arange(bin_size, 1 + bin_size, bin_size)
    ]

    accuracies = [stats[0] for stats in per_bin]
    confidences = [stats[1] for stats in per_bin]
    bin_lengths = [stats[2] for stats in per_bin]

    return accuracies, confidences, bin_lengths

In [19]:
from scipy.optimize import minimize
class TemperatureScaling():
    """
    Temperature scaling calibrator.

    Calibrated probabilities are softmax(logits / T); the single scalar T is
    found by minimising the log-loss on held-out data.
    """

    def __init__(self, temp=1, maxiter=50, solver="BFGS"):
        """
        Initialize class

        Params:
            temp (float): starting temperature for the optimizer, default 1
            maxiter (int): maximum iterations done by optimizer, however 8 iterations have been maximum.
            solver (str): `method` name handed to scipy.optimize.minimize
        """
        self.temp = temp
        self.maxiter = maxiter
        self.solver = solver

    def _loss_fun(self, x, probs, true):
        # Objective for the optimizer: log-loss (cross-entropy) of the scaled outputs.
        # `x` is the candidate temperature as supplied by minimize(); despite its
        # name, `probs` receives the logits that fit() forwards through `args`.
        scaled_probs = self.predict(probs, x)
        loss = log_loss(y_true=true, y_pred=scaled_probs)
        return loss

    # Find the temperature
    def fit(self, logits, true):
        """
        Trains the model and finds optimal temperature

        Params:
            logits: the output from neural network for each class (shape [samples, classes])
            true: class labels, one per sample. The array is flattened before use,
                  so integer labels are expected here rather than a one-hot matrix
                  (cal_results converts one-hot labels before calling fit).

        Returns:
            the results of optimizer after minimizing is finished.
        """

        true = true.flatten()  # Flatten y_val
        # Start from the configured temperature instead of a hard-coded 1 so the
        # `temp` constructor argument actually acts as the starting point.
        opt = minimize(self._loss_fun, x0=self.temp, args=(logits, true),
                       options={'maxiter': self.maxiter}, method=self.solver)
        self.temp = opt.x[0]

        return opt

    def predict(self, logits, temp=None):
        """
        Scales logits based on the temperature and returns calibrated probabilities

        Params:
            logits: logits values of data (output from neural network) for each class (shape [samples, classes])
            temp: temperature to divide by; when None the fitted / previously set self.temp is used.

        Returns:
            calibrated probabilities as returned by keras softmax (shape [samples, classes])
        """
        # Compare against None explicitly: a truthiness test is ambiguous for the
        # NumPy arrays the optimizer passes in and silently ignores a falsy value.
        if temp is None:
            temp = self.temp

        logits_tensor = tf.convert_to_tensor(logits / temp)  # Convert NumPy array to TensorFlow tensor
        return keras_softmax(logits_tensor, axis=-1)

In [3]:
import csv

def write_metrics_to_csv(filename, error, ece, mce, log_loss, brier):
    """
    Append one row of evaluation metrics to a CSV file.

    The header row is written first when the file is empty (or newly created).

    Params:
        filename: path of the CSV file, opened in append mode
        error, ece, mce, log_loss, brier: metric values stored under the
            columns of the same names
    """
    columns = ['error', 'ece', 'mce', 'log_loss', 'brier']
    values = [error, ece, mce, log_loss, brier]

    with open(filename, 'a', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns)

        # Position 0 right after opening in append mode means nothing was written yet
        file_is_empty = out_file.tell() == 0
        if file_is_empty:
            writer.writeheader()

        writer.writerow(dict(zip(columns, values)))
def write_mean_and_std_to_csv(filename, model_name, value_names, mean, std_dev):
    """
    Append one summary row (mean and standard deviation per metric) to a CSV file.

    Columns are 'model_name', then '<name>_mean' for every entry of value_names,
    then '<name>_std_dev' for every entry. The header is written first when the
    file is empty (or newly created).

    Params:
        filename: path of the CSV file, opened in append mode
        model_name: value stored in the 'model_name' column
        value_names: metric names used to build the column names
        mean: mean values, paired with value_names by position
        std_dev: standard deviations, paired with value_names by position
    """
    with open(filename, 'a', newline='') as out_file:
        header = ['model_name']
        header.extend(f'{name}_mean' for name in value_names)
        header.extend(f'{name}_std_dev' for name in value_names)
        writer = csv.DictWriter(out_file, fieldnames=header)

        # Position 0 right after opening in append mode means nothing was written yet
        if out_file.tell() == 0:
            writer.writeheader()

        record = {'model_name': model_name}
        for name, mean_value in zip(value_names, mean):
            record[f'{name}_mean'] = mean_value
        for name, std_dev_value in zip(value_names, std_dev):
            record[f'{name}_std_dev'] = std_dev_value
        writer.writerow(record)

In [4]:
def evaluate(probs, y_true, verbose=False, normalize=False, bins=15):
    """
    Evaluate model using various scoring measures: Error Rate, ECE, MCE, NLL, Brier Score

    Params:
        probs: probabilities for all the classes with a shape of (samples, classes);
               either an object exposing .numpy() (e.g. a TensorFlow tensor) or array-like
        y_true: the actual class labels, either integer labels (shape (samples,) or
                (samples, 1)) or a one-hot matrix (samples, classes). Never modified.
        verbose: (bool) accepted for interface compatibility; currently ignored.
        normalize: (bool) in case of 1-vs-K calibration, the probabilities need to be normalized.
        bins: (int) - into how many bins are probabilities divided (default = 15)

    Returns:
        (error, ece, mce, loss, brier), returns various scoring measures.
        error is in percent; brier is the mean squared gap between 1 and the
        probability assigned to the true class.
    """
    # TF tensors expose .numpy(); anything else is treated as array-like.
    probs = probs.numpy() if hasattr(probs, "numpy") else np.asarray(probs)
    preds = np.argmax(probs, axis=1)  # Take maximum confidence as prediction

    top_prob = np.max(probs, axis=1)  # Take only maximum confidence
    if normalize:
        confs = top_prob / np.sum(probs, axis=1)
    else:
        confs = top_prob

    # Bring the labels to a flat vector of class indices without touching the
    # caller's array.
    y_true = np.asarray(y_true)
    if y_true.ndim > 1 and y_true.shape[1] > 1:  # If 1-hot representation, get back to numeric
        class_idx = np.argmax(y_true, axis=1)
    else:
        class_idx = y_true.reshape(-1)

    accuracy = metrics.accuracy_score(class_idx, preds) * 100
    error = 100 - accuracy

    # Calculate ECE
    ece = ECE(confs, preds, class_idx, bin_size=1 / bins)
    # Calculate MCE
    mce = MCE(confs, preds, class_idx, bin_size=1 / bins)
    # Passing the full label set keeps log_loss working when a class happens to
    # be absent from y_true.
    loss = log_loss(y_true=class_idx, y_pred=probs, labels=np.arange(probs.shape[1]))

    # Brier score on the true-class probability: mean((1 - p_true)^2).
    # Computed directly so the label array never has to be overwritten with 1s.
    p_true = probs[np.arange(len(class_idx)), class_idx].astype(np.float64)
    brier = np.mean((1.0 - p_true) ** 2)

    return (error, ece, mce, loss, brier)

In [5]:
def cal_results(fn,name, logits_val,logits_test,y_val,y_test, m_kwargs={}, approach="all"):
    """
    Fit a calibration method on validation logits and score it on the test logits.

    Two approaches are implemented, "all" (one multiclass calibrator) and "1-vs-K"
    (one calibrator per class); the approach should match the calibration class used.

    Params:
        fn (class): class of the calibration method used. It must contain methods "fit" and "predict",
                    where first fits the models and second outputs calibrated probabilities.
        name (string): model name; accepted for the caller's convenience, not used here.
        logits_val: validation logits (shape [samples, classes])
        logits_test: test logits (shape [samples, classes])
        y_val: validation labels, integer labels or one-hot
        y_test: test labels, passed to evaluate() unchanged
        m_kwargs (dictionary): keyword arguments for the calibration class initialization
                               (only unpacked, never modified)
        approach (string): "all" for multiclass calibration and "1-vs-K" for 1-vs-K approach.

    Returns:
        ((error, ece, mce, loss, brier), (error2, ece2, mce2, loss2, brier2)):
        test-set scores before and after calibration. The calibrated test and
        validation scores are also printed.
    """

    def _softmax_probs(logits):
        # keras_softmax is fed a tensor; hand back a writable NumPy copy so single
        # columns can be overwritten below.
        return np.array(keras_softmax(tf.convert_to_tensor(logits), axis=-1))

    # Labels as a flat vector of class indices; flatten() copies, so the caller's
    # y_val is never touched.
    y_val = np.asarray(y_val)
    if y_val.ndim > 1 and y_val.shape[1] > 1:  # If 1-hot representation, get back to numeric
        y_val = np.argmax(y_val, axis=1)
    y_val = y_val.flatten()

    if approach == "all":
        model = fn(**m_kwargs)

        model.fit(logits_val, y_val)

        probs_val = model.predict(logits_val)
        probs_test = model.predict(logits_test)
        logits_tensor = tf.convert_to_tensor(logits_test)  # Convert NumPy array to TensorFlow tensor
        error, ece, mce, loss, brier = evaluate(keras_softmax(logits_tensor, axis=-1), y_test, verbose=True)  # Test before scaling
        error2, ece2, mce2, loss2, brier2 = evaluate(probs_test, y_test, verbose=False)

        print("Test: Error %f; ece %f; mce %f; loss %f, brier %f" % (error2, ece2, mce2, loss2, brier2))
        print("Val: Error %f; ece %f; mce %f; loss %f, brier %f" % evaluate(probs_val, y_val, verbose=False,
                                                                        normalize=True))

    else:  # 1-vs-k models
        probs_val = _softmax_probs(logits_val)  # Softmax logits
        probs_test = _softmax_probs(logits_test)
        probs_test_uncal = probs_test.copy()  # untouched copy for the "before" scores
        n_classes = probs_test.shape[1]

        # Go through all the classes
        for k in range(n_classes):
            # Prep class labels (1 fixed true class, 0 other classes)
            y_cal = np.array(y_val == k, dtype="int")

            # Train model
            model = fn(**m_kwargs)
            model.fit(probs_val[:, k], y_cal)  # Get only one column with probs for given class "k"

            probs_val[:, k] = model.predict(probs_val[:, k])  # Predict new values based on the fitting
            probs_test[:, k] = model.predict(probs_test[:, k])

            # Replace NaN with 0, as it should be close to zero  # TODO is it needed?
            probs_test[np.isnan(probs_test)] = 0
            probs_val[np.isnan(probs_val)] = 0

        # evaluate() is fed tensors everywhere else in this function; do the same here.
        error, ece, mce, loss, brier = evaluate(tf.convert_to_tensor(probs_test_uncal), y_test,
                                                verbose=True, normalize=False)
        error2, ece2, mce2, loss2, brier2 = evaluate(tf.convert_to_tensor(probs_test), y_test,
                                                     verbose=False, normalize=True)

        print("Test: Error %f; ece %f; mce %f; loss %f, brier %f" % (error2, ece2, mce2, loss2, brier2))
        print("Val: Error %f; ece %f; mce %f; loss %f, brier %f" % evaluate(tf.convert_to_tensor(probs_val), y_val,
                                                                        verbose=False, normalize=True))

    return (error, ece, mce, loss, brier),(error2, ece2, mce2, loss2, brier2)

In [6]:
#Method for applying temperature scaling to the model
def temp_scaling(model,weights_file,name,x_val,x_test,y_val,y_test):
    """
    Evaluate one weights file before and after temperature scaling.

    The model's last layer is switched to a linear activation so the network
    emits logits, the weights are loaded, and validation/test logits are passed
    to cal_results with TemperatureScaling. Uncalibrated test metrics are appended
    to 'uncal.csv', calibrated ones to 'temp.csv'.

    Returns:
        (error, ece, mce, loss, brier) of the uncalibrated test predictions.
    """
    # Take the final layer and make it output raw logits.
    # NOTE(review): whether layers.pop() also removes the layer from `model`
    # depends on the Keras version in use — confirm before changing the [-2] below.
    head = model.layers.pop()
    head.activation = keras.activations.linear
    logits_out = head(model.layers[-2].output)

    logit_model = keras.models.Model(inputs=model.input, outputs=[logits_out])

    # Load the trained weights, then compile so predict() can be used
    logit_model.load_weights(weights_file)
    logit_model.compile(optimizer="sgd", loss="categorical_crossentropy")

    logits_val = logit_model.predict(x_val, verbose=1)
    logits_test = logit_model.predict(x_test, verbose=1)

    uncalibrated, calibrated = cal_results(TemperatureScaling,name,logits_val,logits_test,y_val,y_test, approach = "all")
    error, ece, mce, loss, brier = uncalibrated
    error2, ece2, mce2, loss2, brier2 = calibrated

    write_metrics_to_csv('uncal.csv', error, ece, mce, loss, brier)
    write_metrics_to_csv('temp.csv', error2, ece2, mce2, loss2, brier2)

    return error, ece, mce, loss, brier

In [7]:
import densenet
import wrn
import resnet_sd

In [20]:
# Settings for the ResNet experiments.
stack_n = 18  # blocks per stage handed to residual_network(); 18 * 3 * 2 + 2 = 110 layers by that function's own formula
num_classes10 = 10
num_classes100 = 100
img_rows, img_cols = 32, 32
img_channels = 3
# batch_size, epochs and iterations are not referenced by any cell shown in this
# notebook — presumably carried over from the training script.
batch_size = 128
epochs = 200
iterations = 45000 // batch_size
weight_decay = 0.0001  # L2 coefficient, read as a global inside residual_network()
mean = [125.307, 122.95, 113.865]  # Mean (per-pixel mean?) - let it be atm
std = [62.9932, 62.0887, 66.7048]
seed = 333  # random_state for train_test_split below

def residual_network(img_input, classes_num=10, stack_n=5):
    """
    Build the output tensor of a three-stage residual network on top of `img_input`.

    Layout: one 3x3 convolution with 16 filters, then three stages of residual
    blocks with 16, 32 and 64 filters (the 32- and 64-filter stages start with a
    stride-2 block), followed by BN, ReLU, global average pooling and a softmax
    Dense layer with `classes_num` units.

    total layers = stack_n * 3 * 2 + 2 (stack_n = 5 by default, total layers = 32)

    Reads the module-level `weight_decay` for the L2 regularisation of every
    convolution and of the final Dense layer.

    Params:
        img_input: Keras input tensor
        classes_num (int): number of output classes
        stack_n (int): number of residual blocks per stage

    Returns:
        the softmax output tensor
    """
    def conv(filters, kernel, strides):
        # All convolutions share padding, initialiser and L2 penalty.
        return Conv2D(filters, kernel_size=kernel, strides=strides, padding='same',
                      kernel_initializer="he_normal",
                      kernel_regularizer=regularizers.l2(weight_decay))

    def residual_block(intput, out_channel, increase=False):
        # `increase` marks a block that halves the resolution (stride 2) and
        # therefore needs a 1x1 projection on the shortcut path.
        first_stride = (2, 2) if increase else (1, 1)

        # BN -> ReLU -> conv, twice
        branch = BatchNormalization()(intput)
        branch = Activation('relu')(branch)
        branch = conv(out_channel, (3, 3), first_stride)(branch)
        branch = BatchNormalization()(branch)
        branch = Activation('relu')(branch)
        branch = conv(out_channel, (3, 3), (1, 1))(branch)

        if not increase:
            return add([intput, branch])

        shortcut = conv(out_channel, (1, 1), (2, 2))(intput)
        return add([branch, shortcut])

    # input: 32x32x3 output: 32x32x16
    x = conv(16, (3, 3), (1, 1))(img_input)

    # input: 32x32x16 output: 32x32x16
    for _ in range(stack_n):
        x = residual_block(x, 16, False)

    # 32 filters: 32x32x16 -> 16x16x32, then 64 filters: 16x16x32 -> 8x8x64
    for width in (32, 64):
        x = residual_block(x, width, True)
        for _ in range(1, stack_n):
            x = residual_block(x, width, False)

    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = GlobalAveragePooling2D()(x)

    # input: 64 output: classes_num
    classifier = Dense(classes_num, activation='softmax',
                       kernel_initializer="he_normal",
                       kernel_regularizer=regularizers.l2(weight_decay))
    return classifier(x)

# Load CIFAR-10. Only the test labels are one-hot encoded here; y_val keeps the
# integer labels returned by load_data().
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
y_test = keras.utils.to_categorical(y_test, num_classes10)

# Hold out 10% of the training data as the validation (calibration) split
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1,
                                                      random_state=seed)  # random_state = seed

# Standardise all splits with statistics computed from the training split
img_mean = x_train45.mean(axis=0)  # per-pixel mean
img_std = x_train45.std(axis=0)
x_train45 = (x_train45 - img_mean) / img_std
x_val = (x_val - img_mean) / img_std
x_test = (x_test - img_mean) / img_std

# Build the ResNet graph once; each temp_scaling call loads a different weights file
img_input = Input(shape=(img_rows, img_cols, img_channels))
output = residual_network(img_input, num_classes10, stack_n)
model = Model(img_input, output)

# The value shown as the cell output is the return value of the last call
# (the uncalibrated test metrics).
temp_scaling(model,'resnet_cifar10.h5','resnet_cifar10',x_val,x_test,y_val,y_test)
temp_scaling(model,'resnet_cifar10_2.h5','resnet_cifar10_2',x_val,x_test,y_val,y_test)

Test: Error 7.150000; ece 0.012143; mce 0.808834; loss 0.225950, brier 0.061908
Val: Error 6.060000; ece 0.006799; mce 0.249597; loss 0.193776, brier 0.053334
Test: Error 6.930000; ece 0.007459; mce 0.236941; loss 0.210127, brier 0.059982
Val: Error 6.320000; ece 0.009264; mce 0.261208; loss 0.204094, brier 0.055949


(6.930000000000007,
 0.04833229484856122,
 0.32229046523571014,
 0.32020646658444546,
 0.06022057435291904)

In [21]:
# Same evaluation for the "best" ResNet CIFAR-10 checkpoints. Relies on model,
# x_val, x_test, y_val and y_test left in the kernel by the cell above.
temp_scaling(model,'model_resnet_c10_best.hdf5','resnet_cifar10_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_resnet_c10_best_2.hdf5','resnet_cifar10_2_best',x_val,x_test,y_val,y_test)

Test: Error 7.170000; ece 0.011899; mce 0.093680; loss 0.224852, brier 0.062412
Val: Error 5.980000; ece 0.008525; mce 0.264836; loss 0.189279, brier 0.052737
Test: Error 7.000000; ece 0.006109; mce 0.231387; loss 0.207924, brier 0.060495
Val: Error 6.300000; ece 0.010049; mce 0.238677; loss 0.202768, brier 0.056649


(7.0,
 0.046890796139836356,
 0.34358163606161357,
 0.3065435069459051,
 0.06024696970467692)

In [25]:
# ResNet on CIFAR-100: same settings as the CIFAR-10 run, with 100 classes.
stack_n = 18
num_classes = 100
img_rows, img_cols = 32, 32
img_channels = 3
batch_size = 128
epochs = 200
iterations = 45000 // batch_size
weight_decay = 0.0001
seed = 333
(x_train, y_train), (x_test, y_test) = cifar100.load_data()
# NOTE: num_classes100 comes from the earlier ResNet settings cell, not from this
# cell (which defines num_classes); both are 100.
y_test = keras.utils.to_categorical(y_test, num_classes100)

# Hold out 10% of the training data as the validation (calibration) split
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1,
                                                      random_state=seed)  # random_state = seed

# Standardise all splits with statistics computed from the training split
img_mean = x_train45.mean(axis=0)  # per-pixel mean
img_std = x_train45.std(axis=0)
x_train45 = (x_train45 - img_mean) / img_std
x_val = (x_val - img_mean) / img_std
x_test = (x_test - img_mean) / img_std
# Rebuild the network with a 100-way output layer
img_input = Input(shape=(img_rows, img_cols, img_channels))
output = residual_network(img_input, num_classes, stack_n)
model = Model(img_input, output)

temp_scaling(model,'resnet_cifar100.h5','resnet_cifar100',x_val,x_test,y_val,y_test)
temp_scaling(model,'resnet_cifar100_2.h5','resnet_cifar100_2',x_val,x_test,y_val,y_test)

Test: Error 29.980000; ece 0.017710; mce 0.062538; loss 1.138919, brier 0.304810
Val: Error 28.500000; ece 0.023259; mce 0.121306; loss 1.098821, brier 0.293941
Test: Error 30.320000; ece 0.025545; mce 0.061441; loss 1.164974, brier 0.308856
Val: Error 29.300000; ece 0.022722; mce 0.075312; loss 1.103952, brier 0.299339


(30.320000000000007,
 0.18645093108713628,
 0.3648860052113643,
 1.7314717884921924,
 0.280367236719004)

In [26]:
# Same evaluation for the "best" ResNet CIFAR-100 checkpoints. Relies on model
# and the data splits left in the kernel by the cell above.
temp_scaling(model,'model_resnet_c100_best.hdf5','resnet_cifar100_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_resnet_c100_best_2.hdf5','resnet_cifar100_2_best',x_val,x_test,y_val,y_test)

Test: Error 29.910000; ece 0.016236; mce 0.051957; loss 1.062137, brier 0.296532
Val: Error 28.220000; ece 0.019273; mce 0.080173; loss 1.022614, brier 0.285813
Test: Error 30.310000; ece 0.017048; mce 0.044367; loss 1.079424, brier 0.298648
Val: Error 29.500000; ece 0.015818; mce 0.071135; loss 1.033385, brier 0.292662


(30.310000000000002,
 0.1237496825732291,
 0.22596183901061562,
 1.2479673907868696,
 0.27430659409144625)

In [28]:
# Settings for the DenseNet CIFAR-10 experiment.
batch_size = 64
nb_classes10 = 10
nb_classes100 = 100

nb_epoch = 300

img_rows, img_cols = 32, 32
img_channels = 3

# Input shape in whichever channel order the Keras backend is configured for
img_dim = (img_channels, img_rows, img_cols) if K.image_data_format() == 'channels_first' else (img_rows, img_cols, img_channels)
# Architecture arguments forwarded to densenet.DenseNet below
depth = 40
nb_dense_block = 3
growth_rate = 12
nb_filter = -1  # NOTE(review): meaning of -1 is defined inside densenet.DenseNet, not visible here — confirm
dropout_rate = 0.0 # 0.0 for data augmentation
seed = 333  # random_state for train_test_split below
weight_decay = 0.0001
learning_rate = 0.1
def color_preprocessing(x_train,x_test):
    """
    Standardise the colour channels of both image sets with fixed statistics.

    Each input is copied to float32; for channels 0-2 of the last axis the
    hard-coded channel mean is subtracted and the result divided by the
    hard-coded channel standard deviation. The inputs themselves are not modified.

    Returns:
        (x_train, x_test): the normalised float32 copies
    """
    channel_mean = [125.307, 122.95, 113.865]
    channel_std  = [62.9932, 62.0887, 66.7048]

    normalised = []
    for images in (x_train, x_test):
        images = images.astype('float32')
        for ch, (mu, sigma) in enumerate(zip(channel_mean, channel_std)):
            images[:,:,:,ch] = (images[:,:,:,ch] - mu) / sigma
        normalised.append(images)

    return normalised[0], normalised[1]

# Build the DenseNet with a 10-way output; no pretrained weights are loaded here
# (weights=None) — temp_scaling loads them from file.
model = densenet.DenseNet(img_dim, classes=nb_classes10, depth=depth, nb_dense_block=nb_dense_block,
                          growth_rate=growth_rate, nb_filter=nb_filter, dropout_rate=dropout_rate, weights=None, weight_decay=1e-4)

                          
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
#For data preprocessing, we normalize the data using the channel means and standard deviations (https://arxiv.org/pdf/1608.06993v3.pdf)
x_train, x_test = color_preprocessing(x_train, x_test)


# Hold out 10% of the (already normalised) training data as the validation split
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=seed)  # random_state = seed


y_train45 = np_utils.to_categorical(y_train45, nb_classes10)  # 1-hot vector
y_val = np_utils.to_categorical(y_val, nb_classes10)
y_test = np_utils.to_categorical(y_test, nb_classes10)

# The value shown as the cell output is the return value of the last call
# (the uncalibrated test metrics).
temp_scaling(model,'dense_cifar10.h5','dense_cifar10',x_val,x_test,y_val,y_test)
temp_scaling(model,'dense_cifar10_2.h5','dense_cifar10_2',x_val,x_test,y_val,y_test)

Test: Error 7.570000; ece 0.009323; mce 0.355586; loss 0.224925, brier 0.064755
Val: Error 6.400000; ece 0.013225; mce 0.307475; loss 0.195476, brier 0.056430
Test: Error 7.990000; ece 0.006296; mce 0.254074; loss 0.239117, brier 0.068429
Val: Error 6.940000; ece 0.006648; mce 0.246471; loss 0.210352, brier 0.060541


(7.989999999999995,
 0.05804203577041622,
 0.3539295910048658,
 0.4450498906669829,
 0.06945803569195254)

In [30]:
# Same evaluation for the "best" DenseNet CIFAR-10 checkpoints. Relies on model
# and the data splits left in the kernel by the DenseNet CIFAR-10 cell above.
temp_scaling(model,'model_dense_c10_best.hdf5','dense_cifar10_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_dense_c10_best_2.hdf5','dense_cifar10_2_best',x_val,x_test,y_val,y_test)

Test: Error 7.490000; ece 0.008717; mce 0.257427; loss 0.225905, brier 0.064981
Val: Error 6.360000; ece 0.007015; mce 0.239123; loss 0.194102, brier 0.055814
Test: Error 7.370000; ece 0.006262; mce 0.246989; loss 0.218735, brier 0.062979
Val: Error 6.520000; ece 0.003578; mce 0.415141; loss 0.192373, brier 0.055955


(7.3700000000000045,
 0.05018369305431836,
 0.3286578632415609,
 0.35682083705536327,
 0.06372033332182321)

In [31]:
# One more DenseNet CIFAR-10 checkpoint ("_cont" weights file), evaluated with the
# model and data splits still in the kernel from the cells above.
temp_scaling(model,'model_dense_c10_best_2_cont.hdf5','dense_cifar10_2_best_cont',x_val,x_test,y_val,y_test)

Test: Error 8.050000; ece 0.007991; mce 0.755234; loss 0.244564, brier 0.070114
Val: Error 7.120000; ece 0.006689; mce 0.104014; loss 0.214130, brier 0.061756


(8.049999999999997,
 0.053611510443687366,
 0.28426332821448647,
 0.38701807015427775,
 0.06993229177593825)

In [35]:
# Settings for the DenseNet CIFAR-100 experiment.
batch_size = 64
nb_classes10 = 10
nb_classes100 = 100

nb_epoch = 300

img_rows, img_cols = 32, 32
img_channels = 3

# Input shape in whichever channel order the Keras backend is configured for
img_dim = (img_channels, img_rows, img_cols) if K.image_data_format() == 'channels_first' else (img_rows, img_cols, img_channels)
depth = 40
nb_dense_block = 3
growth_rate = 12
nb_filter = 12  # NOTE(review): the CIFAR-10 DenseNet cell uses -1 here — confirm the difference is intended
dropout_rate = 0.0 # 0.0 for data augmentation
seed = 333
weight_decay = 0.0001
learning_rate = 0.1

# Build the DenseNet with a 100-way output; weights are loaded later by temp_scaling
model = densenet.DenseNet(img_dim, classes=nb_classes100, depth=depth, nb_dense_block=nb_dense_block,
                          growth_rate=growth_rate, nb_filter=nb_filter, dropout_rate=dropout_rate, weights=None, weight_decay=1e-4)

                          
(x_train, y_train), (x_test, y_test) = cifar100.load_data()
#For data preprocessing, we normalize the data using the channel means and standard deviations (https://arxiv.org/pdf/1608.06993v3.pdf)
# NOTE: this call needs the two-argument color_preprocessing from the DenseNet
# CIFAR-10 cell; later cells redefine the name with three parameters.
x_train, x_test = color_preprocessing(x_train, x_test)

# Hold out 10% of the (already normalised) training data as the validation split
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=seed)  # random_state = seed

y_train45 = np_utils.to_categorical(y_train45, nb_classes100)  # 1-hot vector
y_val = np_utils.to_categorical(y_val, nb_classes100)
y_test = np_utils.to_categorical(y_test, nb_classes100)

temp_scaling(model,'dense_cifar100.h5','dense_cifar100',x_val,x_test,y_val,y_test)
temp_scaling(model,'dense_cifar100_2.h5','dense_cifar100_2',x_val,x_test,y_val,y_test)

Test: Error 30.020000; ece 0.009139; mce 0.101549; loss 1.070529, brier 0.297455
Val: Error 28.920000; ece 0.013151; mce 0.063356; loss 1.057021, brier 0.294576
Test: Error 29.870000; ece 0.013654; mce 0.047472; loss 1.062526, brier 0.297955
Val: Error 28.680000; ece 0.017694; mce 0.080801; loss 1.045124, brier 0.289957


(29.86999999999999,
 0.21113098706901073,
 0.4941648625272899,
 1.9960649609469205,
 0.2746932379026984)

In [39]:
# Same evaluation for the "best" DenseNet CIFAR-100 checkpoints. Relies on model
# and the data splits left in the kernel by the cell above.
temp_scaling(model,'model_dense_c100_best.hdf5','dense_cifar100_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_dense_c100_best_2.hdf5','dense_cifar100_2_best',x_val,x_test,y_val,y_test)

Test: Error 41.000000; ece 0.013354; mce 0.038356; loss 1.464651, brier 0.410153
Val: Error 39.460000; ece 0.022350; mce 0.074995; loss 1.426372, brier 0.402228
Test: Error 36.880000; ece 0.010857; mce 0.029601; loss 1.318632, brier 0.373211
Val: Error 36.840000; ece 0.014180; mce 0.074884; loss 1.309039, brier 0.372017


(36.88,
 0.1185962333671749,
 0.23064466863870625,
 1.4556572277420063,
 0.3414685977690999)

In [40]:
# Settings for the ResNet-SD CIFAR-100 experiment.
img_rows, img_cols = 32, 32
img_channels = 3
nb_epochs = 500  # not referenced by any cell shown in this notebook
batch_size = 128
nb_classes = 100
seed = 333  # random_state for train_test_split below
def color_preprocessing(x_train, x_val, x_test):
    """
    Standardise three image sets with per-channel statistics of the training set.

    All inputs are cast to float32; mean and standard deviation are taken over
    the first three axes of the training set (one value per entry of the last
    axis) and applied to every set.

    Returns:
        (x_train, x_val, x_test): the standardised arrays
    """
    train32, val32, test32 = [images.astype('float32') for images in (x_train, x_val, x_test)]

    reduce_axes = (0,1,2)
    channel_mean = np.mean(train32, axis=reduce_axes)  # Per channel mean
    channel_std = np.std(train32, axis=reduce_axes)

    def standardise(images):
        return (images - channel_mean) / channel_std

    return standardise(train32), standardise(val32), standardise(test32)
    
# data
(x_train, y_train), (x_test, y_test) = cifar100.load_data()

# Data splitting (get additional 5k validation set)
# Sklearn to split
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=seed)  # random_state = seed
# Normalise after splitting so the statistics come from the 45k training part only
x_train45, x_val, x_test = color_preprocessing(x_train45, x_val, x_test)  # Mean per channel    

y_train45 = np_utils.to_categorical(y_train45, nb_classes)  # 1-hot vector
y_val = np_utils.to_categorical(y_val, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)

    
# Build the 110-layer ResNet-SD graph; weights are loaded inside temp_scaling
model = resnet_sd.resnet_sd_model(img_shape = (32,32), img_channels = 3, 
                        layers = 110, nb_classes = nb_classes, verbose = False)

# The value shown as the cell output is the return value of the last call
# (the uncalibrated test metrics).
temp_scaling(model,'resnet_sd_cifar100.h5','resnet_sd_cifar100',x_val,x_test,y_val,y_test)
temp_scaling(model,'resnet_sd_cifar100_2.h5','resnet_sd_cifar100_2',x_val,x_test,y_val,y_test)

Test: Error 24.890000; ece 0.011509; mce 0.046618; loss 0.874497, brier 0.247391
Val: Error 25.000000; ece 0.012717; mce 0.067875; loss 0.875597, brier 0.246840
Test: Error 25.540000; ece 0.009618; mce 0.060505; loss 0.892248, brier 0.252746
Val: Error 25.080000; ece 0.014532; mce 0.135358; loss 0.886396, brier 0.248326


(25.539999999999992,
 0.1425576159015298,
 0.30673319780698394,
 1.2519651670233336,
 0.22999444853856102)

In [41]:
# Same evaluation for the "best" ResNet-SD CIFAR-100 checkpoints. Relies on model
# and the data splits left in the kernel by the cell above.
temp_scaling(model,'model_resnet110SD_c100_best.hdf5','resnet_sd_cifar100_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_resnet110SD_c100_best_2.hdf5','resnet_sd_cifar100_2_best',x_val,x_test,y_val,y_test)

Test: Error 25.400000; ece 0.015537; mce 0.057541; loss 0.891047, brier 0.253329
Val: Error 25.720000; ece 0.009878; mce 0.091173; loss 0.895839, brier 0.253629
Test: Error 26.280000; ece 0.009767; mce 0.074996; loss 0.910788, brier 0.257595
Val: Error 25.340000; ece 0.016301; mce 0.062810; loss 0.896131, brier 0.252719


(26.28,
 0.13251221177279948,
 0.3038850770138277,
 1.1787124481166609,
 0.23442392355329608)

In [42]:
import numpy as np
import scipy.io

# Split train data into train and validation gettting certain number of labels from each class
def train_val_split_count(x_train, y_train, size, seed, replace=True):
    """
    Split off a validation set by drawing `size` samples from every class.

    Params:
        x_train: training inputs, indexed along the first axis
        y_train: training labels; row i labels x_train[i]
        size (int): number of draws per class
        seed: seed for NumPy's global random generator; None leaves it untouched
        replace (bool): draw with replacement. The default (True) keeps the
            historical behaviour: the same sample can be drawn more than once, so
            the validation set may contain duplicates and fewer than
            size * n_classes rows are removed from the training data. Pass False
            for a duplicate-free validation set.

    Returns:
        (x_train, x_val, y_train, y_val): the training data without the drawn
        samples, and the drawn validation samples.

    Raises:
        ValueError: if replace is False and a class has fewer than `size` samples.
    """
    if seed is not None:
        np.random.seed(seed)  # Set seed if it is stated.

    picked = []
    # np.unique gives the labels in sorted order, so the draw order (and hence
    # the result for a given seed) does not depend on set iteration order.
    for label in np.unique(y_train):
        candidates = np.where(y_train == label)[0]  # row indices of this class
        if not replace and len(candidates) < size:
            raise ValueError("class %r has only %d samples, cannot draw %d without replacement"
                             % (label, len(candidates), size))
        picked.append(np.random.choice(candidates, size, replace=replace))

    split = np.concatenate(picked) if picked else np.array([], dtype=int)

    x_val = np.array(x_train[split])
    y_val = np.array(y_train[split])

    x_train = np.delete(x_train, split, axis=0)
    y_train = np.delete(y_train, split, axis=0)

    return (x_train, x_val, y_train, y_val)

    
 



def load_data_svhn(seed = None):
    """
    Load SVHN from the three .mat files in the working directory and build
    train / validation / test sets.

    The validation set is made of 400 draws per class from the train file and
    200 draws per class from the extra file (see train_val_split_count); the
    remaining train and extra samples are concatenated into the training set.
    All labels are shifted down by one.

    NB! As noted in the original code, the shifted labels are not the actual
    digits, because label 10 in the files indicates the digit 0.

    Params:
        seed: forwarded to train_val_split_count for both splits

    Returns:
        ((x_train_all, y_train_all), (x_val_all, y_val_all), (x_test, y_test))
    """
    def read_split(mat_file):
        # Load one MatLab file and reorder the images:
        # [h,w,channels,samples] -> [samples,h,w,channels]
        mat = scipy.io.loadmat(mat_file)
        return np.transpose(mat.get('X'), axes=(3,0,1,2)), mat.get('y')

    x_test, y_test = read_split('test_32x32.mat')
    x_train, y_train = read_split('train_32x32.mat')
    x_extra, y_extra = read_split('extra_32x32.mat')

    # Split off the validation samples from both sources
    x_train_rest, x_val_train, y_train_rest, y_val_train = train_val_split_count(x_train, y_train, size = 400, seed = seed)
    x_extra_rest, x_val_extra, y_extra_rest, y_val_extra = train_val_split_count(x_extra, y_extra, size = 200, seed = seed)

    # Add together train and extra data
    x_train_all = np.concatenate([x_train_rest, x_extra_rest])
    y_train_all = np.concatenate([y_train_rest, y_extra_rest])

    x_val_all = np.concatenate([x_val_train, x_val_extra])
    y_val_all = np.concatenate([y_val_train, y_val_extra])

    # Shift labels so 0 would be smallest label and 9 biggest
    y_train_all -= 1
    y_val_all -= 1
    y_test -= 1

    return ((x_train_all, y_train_all), (x_val_all, y_val_all), (x_test, y_test))

In [44]:
def color_preprocessing(x_train, x_val, x_test):
    """
    # Standardizes three image sets with the per-channel statistics of the training set

    Args:
        x_train (numpy.ndarray): training images, channels on the last of 4 axes
        x_val (numpy.ndarray): validation images
        x_test (numpy.ndarray): test images

    Returns:
        (x_train, x_val, x_test): float32 copies, each with the training mean
        subtracted and divided by the training standard deviation (per channel).
    """
    train_f32, val_f32, test_f32 = [
        images.astype('float32') for images in (x_train, x_val, x_test)
    ]

    # Statistics come from the training images only; reducing over axes 0,1,2
    # leaves one value per channel.
    channel_mean = np.mean(train_f32, axis=(0,1,2))
    channel_std = np.std(train_f32, axis=(0,1,2))

    def _standardize(images):
        return (images - channel_mean) / channel_std

    return _standardize(train_f32), _standardize(val_f32), _standardize(test_f32)
  
# --- SVHN evaluation setup ---
# Of the settings below, only nb_classes and seed are read in this cell and
# layers in the model-building cell that follows; learning_rate, nb_epochs and
# batch_size are not referenced here.
# NOTE(review): they look carried over from the training script - confirm before removing.
learning_rate = 0.1
nb_epochs = 50
batch_size = 128
nb_classes = 10  # width of the one-hot label vectors built at the end of this cell
seed = 333  # forwarded to load_data_svhn for the train/validation split
layers = 152 # n = 25 (152-2)/6


# data
print("Loading data, may take some time and memory!")
(x_train, y_train), (x_val, y_val), (x_test, y_test) = load_data_svhn(seed = seed)
print(x_train.shape)
print("Data loaded")

# Standardize all three sets with the per-channel mean and std of the training images
# (this rebinds x_train / x_val / x_test to float32 arrays).
x_train, x_val, x_test = color_preprocessing(x_train, x_val, x_test)  # Per channel mean


# Try with ImageDataGenerator, otherwise it takes massive amount of memory
# NOTE(review): img_gen is not used by any of the evaluation cells visible below, and
# no featurewise option is set on it, so this fit() looks like a leftover from the
# training script - confirm before removing.
img_gen = ImageDataGenerator(
    data_format="channels_last"
)

img_gen.fit(x_train)


# Integer labels -> one-hot rows of width nb_classes (rebinds the y_* names)
y_train = np_utils.to_categorical(y_train, nb_classes)  # 1-hot vector
y_val = np_utils.to_categorical(y_val, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)

Loading data, may take some time and memory!
(598526, 32, 32, 3)
Data loaded


In [45]:
# Build the network only - no training happens in this cell.
# NOTE(review): resnet_sd is not defined in the visible part of this file; presumably a
# project module imported in an earlier cell.
model = resnet_sd.resnet_sd_model(img_shape = (32,32), img_channels = 3, 
                        layers = layers, nb_classes = nb_classes, verbose = False)

# Run temp_scaling once per weights file, reusing the same model object.
# Arguments as passed: model, weights file name, a name string (presumably used to tag
# the run), then the validation/test images and their one-hot labels.
# The tuple displayed under the cell is the return value of the last call only.
temp_scaling(model,'resnet_sd_svhn.h5','resnet_sd_svhn',x_val,x_test,y_val,y_test)
temp_scaling(model,'resnet_sd_svhn_2.h5','resnet_sd_svhn_2',x_val,x_test,y_val,y_test)

Test: Error 2.301014; ece 0.008141; mce 0.316229; loss 0.099291, brier 0.022560
Val: Error 3.183333; ece 0.004195; mce 0.157624; loss 0.117429, brier 0.029506
Test: Error 2.769668; ece 0.006915; mce 0.197043; loss 0.112414, brier 0.026258
Val: Error 3.466667; ece 0.004958; mce 0.167990; loss 0.125080, brier 0.031592


(2.7696681007990094,
 0.019507523587164725,
 0.16301905051717225,
 0.11796248997539165,
 0.028432896422701158)

In [46]:
# Same evaluation on the "best" weights files of both SVHN runs; relies on model,
# x_val, x_test, y_val and y_test still being the SVHN objects from the cells above.
# Only the return value of the last call is displayed.
temp_scaling(model,'model_svhn_best.hdf5','resnet_sd_svhn_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_svhn_best_2.hdf5','resnet_sd_svhn_2_best',x_val,x_test,y_val,y_test)

Test: Error 4.701905; ece 0.009102; mce 0.320366; loss 0.176724, brier 0.045495
Val: Error 5.033333; ece 0.007897; mce 0.803988; loss 0.171608, brier 0.047293
Test: Error 4.310080; ece 0.012458; mce 0.147075; loss 0.169260, brier 0.042970
Val: Error 4.866667; ece 0.005398; mce 0.110209; loss 0.169226, brier 0.045257


(4.310079901659492,
 0.027316260308979833,
 0.1752937728608096,
 0.17610916402339824,
 0.04678971112771387)

In [47]:
# Same evaluation on the "*_cont" weights files (presumably checkpoints from continued
# training - confirm); relies on the SVHN model and data from the cells above.
# Only the return value of the last call is displayed.
temp_scaling(model,'model_svhn_best_cont.hdf5','resnet_sd_svhn_best_cont',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_svhn_best_2_cont.hdf5','resnet_sd_svhn_2_best_cont',x_val,x_test,y_val,y_test)

Test: Error 2.377843; ece 0.007606; mce 0.256758; loss 0.100569, brier 0.023051
Val: Error 3.083333; ece 0.003926; mce 0.161668; loss 0.117169, brier 0.029404
Test: Error 2.769668; ece 0.007038; mce 0.187466; loss 0.112593, brier 0.026373
Val: Error 3.416667; ece 0.005528; mce 0.234899; loss 0.125653, brier 0.031935


(2.7696681007990094,
 0.02332488790534544,
 0.21633289485092622,
 0.12061749820046395,
 0.029346950305191098)

In [49]:
# --- Wide ResNet on CIFAR-10: settings, data and evaluation ---
# This cell rebinds x_train, y_train, x_val, y_val, x_test, y_test, model and seed,
# replacing the SVHN objects used by the cells above.
# batch_size, epochs, iterations and weight_decay are not referenced in this cell.
# NOTE(review): they look carried over from the training script - confirm before removing.
depth              = 34  # 32, if ignoring conv layers carrying residuals, which are needed for increasing filter size.
growth_rate        = 10  # Growth factor; passed as k to create_wide_residual_network below
n                  = (depth-4)//6  # passed as N to create_wide_residual_network below
num_classes        = 10
img_rows, img_cols = 32, 32
img_channels       = 3
batch_size         = 128
epochs             = 200
iterations         = 45000 // batch_size
weight_decay       = 0.0005
seed = 333

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Hold out 10% of the training data for validation (seeded split), then standardize
# all three sets with the per-channel mean/std of the remaining training images.
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=seed)  # random_state = seed
x_train45, x_val, x_test = color_preprocessing(x_train45, x_val, x_test)    

# Integer labels -> one-hot rows of width num_classes
y_train45 = keras.utils.to_categorical(y_train45, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# build network
# NOTE(review): wrn is not defined in the visible part of this file; presumably a
# project module imported in an earlier cell.
img_input = Input(shape=(img_rows,img_cols,img_channels))    
model = wrn.create_wide_residual_network(img_input, nb_classes=num_classes, N=n, k=growth_rate, dropout=0.0)

# Run temp_scaling once per weights file on the same model object; only the return
# value of the last call is displayed under the cell.
temp_scaling(model,'resnet_wide_cifar10.h5','resnet_wide_cifar10',x_val,x_test,y_val,y_test)
temp_scaling(model,'resnet_wide_cifar10_2.h5','resnet_wide_cifar10_2',x_val,x_test,y_val,y_test)

Wide Residual Network-34-10 created.
Test: Error 5.930000; ece 0.005796; mce 0.242373; loss 0.185372, brier 0.052319
Val: Error 5.540000; ece 0.004356; mce 0.218576; loss 0.171936, brier 0.048427
Test: Error 6.040000; ece 0.005165; mce 0.187059; loss 0.182160, brier 0.052785
Val: Error 5.560000; ece 0.007672; mce 0.251901; loss 0.168101, brier 0.048308


(6.040000000000006,
 0.04464357871413235,
 0.3518637418746948,
 0.3407603477393879,
 0.053745594327139955)

In [50]:
# Same evaluation on the "best" CIFAR-10 weights files; relies on the CIFAR-10 model
# and data from the cell above still being in the kernel.
# NOTE(review): the file names say 28_10 while the model above was built with
# depth = 34 - confirm the names are just historical.
temp_scaling(model,'model_wide_28_10_c10_best.hdf5','wide_cifar10_best',x_val,x_test,y_val,y_test)
temp_scaling(model,'model_wide_28_10_c10_best_2.hdf5','wide_cifar10_2_best',x_val,x_test,y_val,y_test)

Test: Error 9.090000; ece 0.008489; mce 0.107472; loss 0.265302, brier 0.079169
Val: Error 8.200000; ece 0.010146; mce 0.130692; loss 0.247654, brier 0.073278
Test: Error 6.510000; ece 0.005866; mce 0.752218; loss 0.191964, brier 0.056043
Val: Error 5.940000; ece 0.007977; mce 0.251501; loss 0.172944, brier 0.050115


(6.510000000000005,
 0.044923282471299175,
 0.7162050306797028,
 0.3091214689021283,
 0.05643797845024942)

In [51]:
# --- Wide ResNet on CIFAR-100: settings, data and evaluation ---
# Same steps as the CIFAR-10 cell, with num_classes = 100 and cifar100 as the data source.
# This cell rebinds x_train, y_train, x_val, y_val, x_test, y_test and model again.
# batch_size, epochs, iterations and weight_decay are not referenced in this cell.
# NOTE(review): they look carried over from the training script - confirm before removing.
depth              = 34  # 32, if ignoring conv layers carrying residuals, which are needed for increasing filter size.
growth_rate        = 10  # Growth factor; passed as k to create_wide_residual_network below
n                  = (depth-4)//6  # passed as N to create_wide_residual_network below
num_classes        = 100
img_rows, img_cols = 32, 32
img_channels       = 3
batch_size         = 128
epochs             = 200
iterations         = 45000 // batch_size
weight_decay       = 0.0005
seed = 333

(x_train, y_train), (x_test, y_test) = cifar100.load_data()

# Hold out 10% of the training data for validation (seeded split), then standardize
# all three sets with the per-channel mean/std of the remaining training images.
x_train45, x_val, y_train45, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=seed)  # random_state = seed
x_train45, x_val, x_test = color_preprocessing(x_train45, x_val, x_test)    

# Integer labels -> one-hot rows of width num_classes
y_train45 = keras.utils.to_categorical(y_train45, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# build network
# NOTE(review): wrn is not defined in the visible part of this file; presumably a
# project module imported in an earlier cell.
img_input = Input(shape=(img_rows,img_cols,img_channels))    
model = wrn.create_wide_residual_network(img_input, nb_classes=num_classes, N=n, k=growth_rate, dropout=0.0)

# Run temp_scaling once per weights file on the same model object; only the return
# value of the last call is displayed under the cell.
temp_scaling(model,'resnet_wide_cifar100.h5','resnet_wide_cifar100',x_val,x_test,y_val,y_test)
temp_scaling(model,'resnet_wide_cifar100_2.h5','resnet_wide_cifar100_2',x_val,x_test,y_val,y_test)

Wide Residual Network-34-10 created.
Test: Error 25.720000; ece 0.015987; mce 0.066368; loss 0.946586, brier 0.258849
Val: Error 25.660000; ece 0.015559; mce 0.055260; loss 0.915589, brier 0.254198
Test: Error 25.830000; ece 0.019773; mce 0.064194; loss 0.942272, brier 0.258776
Val: Error 24.820000; ece 0.011945; mce 0.059819; loss 0.915681, brier 0.251552


(25.83,
 0.18215142931938166,
 0.45353270836772286,
 1.7764153852039823,
 0.23724542406314614)

In [52]:
# Evaluate the first "best" CIFAR-100 weights file with the 100-class model built above.
# NOTE(review): the stored output of this cell ends in a ValueError (variable shape
# (640, 100) vs assigned value shape (640, 10)), which suggests a 10-class weights file
# was loaded into this 100-class model at some point. That does not match the single
# call below, so the saved output is probably stale - re-run on a fresh kernel to confirm.
temp_scaling(model,'model_wide_28_10_c100_best.hdf5','wide_cifar100_best',x_val,x_test,y_val,y_test)

Test: Error 32.800000; ece 0.011115; mce 0.032339; loss 1.155700, brier 0.324982
Val: Error 31.300000; ece 0.014966; mce 0.089322; loss 1.129613, brier 0.317525


ValueError: Cannot assign value to variable ' dense_15/kernel:0': Shape mismatch.The variable shape (640, 100), and the assigned value shape (640, 10) are incompatible.

In [55]:
# Evaluate the second "best" CIFAR-100 weights file; relies on the CIFAR-100 model and
# data from the setup cell above. The displayed tuple is this call's return value.
temp_scaling(model,'model_wide_28_10_c100_best_2.hdf5','wide_cifar100_2_best',x_val,x_test,y_val,y_test)

Test: Error 32.480000; ece 0.017687; mce 0.077031; loss 1.131136, brier 0.323099
Val: Error 31.540000; ece 0.017440; mce 0.084228; loss 1.109766, brier 0.316546


(32.480000000000004,
 0.0976238109499216,
 0.17100721828941734,
 1.216549081005753,
 0.2995340276126886)