In [1]:
import numpy as np

In [2]:
def brier_score(confidence: list, true_labels: list) -> float:
    """Return the Brier score: the mean squared gap between confidence and outcome.

    Args:
        confidence: Predicted confidence for each sample.
        true_labels: Observed outcome for each sample, given as numbers (or
            booleans) on the same scale as ``confidence``.

    Returns:
        The mean of ``(confidence - true_labels) ** 2`` over all samples, or
        ``nan`` when there are no samples.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to float so boolean inputs can be subtracted (NumPy refuses
    # ``bool - bool``) and integer inputs behave like their float equivalents.
    confidence = np.asarray(confidence, dtype=float)
    true_labels = np.asarray(true_labels, dtype=float)

    # Without this check NumPy would silently broadcast, e.g. one confidence
    # against many labels, and return a meaningless score.
    if confidence.shape != true_labels.shape:
        raise ValueError(
            "confidence and true_labels must have the same shape, got "
            f"{confidence.shape} and {true_labels.shape}"
        )

    # The score is undefined without samples; return nan explicitly rather
    # than letting np.mean emit a RuntimeWarning on an empty array.
    if confidence.size == 0:
        return np.float64(np.nan)

    return np.mean((confidence - true_labels) ** 2)

In [4]:
def ece(conf, preds, labels, num_bins=2):
    """Expected calibration error over equal-size confidence bins.

    Samples are sorted by descending confidence and split with
    ``np.array_split`` into ``num_bins`` contiguous groups of (nearly) equal
    size. Each non-empty group contributes ``|accuracy - mean confidence|``
    weighted by the fraction of samples it holds.

    Args:
        conf: Confidence score for each sample.
        preds: Predicted label for each sample.
        labels: True label for each sample.
        num_bins: Number of groups to split the sorted samples into.

    Returns:
        The weighted sum of per-bin calibration gaps, or ``nan`` when there
        are no samples.

    Raises:
        ValueError: If the three inputs do not have the same shape.
    """
    conf, preds, labels = map(np.asarray, (conf, preds, labels))
    if not (conf.shape == preds.shape == labels.shape):
        raise ValueError(
            "conf, preds and labels must have the same shape, got "
            f"{conf.shape}, {preds.shape} and {labels.shape}"
        )

    n = len(conf)
    if n == 0:
        # Calibration error is undefined without samples.
        return np.float64(np.nan)

    # A stable sort keeps tied confidences in input order, so samples whose
    # ties straddle a bin boundary are always assigned to the same bin.
    order = np.argsort(-conf, kind="stable")
    conf, preds, labels = conf[order], preds[order], labels[order]

    total = 0.0
    for idxs in np.array_split(np.arange(n), num_bins):
        # array_split yields empty groups when num_bins > n; their mean would
        # be nan and would poison the whole sum, so they are skipped.
        if len(idxs) == 0:
            continue
        bin_conf = np.mean(conf[idxs])
        bin_acc = np.mean(preds[idxs] == labels[idxs])
        total += (len(idxs) / n) * abs(bin_acc - bin_conf)
    return total


In [5]:
def macro_ce(conf, preds, labels, num_bins=2):
    """Macro-averaged calibration error over equal-size confidence bins.

    Samples are sorted by descending confidence and split with
    ``np.array_split`` into ``num_bins`` contiguous groups of (nearly) equal
    size. The result is the unweighted mean of ``|accuracy - mean confidence|``
    over the non-empty groups.

    Args:
        conf: Confidence score for each sample.
        preds: Predicted label for each sample.
        labels: True label for each sample.
        num_bins: Number of groups to split the sorted samples into.

    Returns:
        The mean per-bin calibration gap, or ``nan`` when there are no
        samples.

    Raises:
        ValueError: If the three inputs do not have the same shape.
    """
    conf, preds, labels = map(np.asarray, (conf, preds, labels))
    if not (conf.shape == preds.shape == labels.shape):
        raise ValueError(
            "conf, preds and labels must have the same shape, got "
            f"{conf.shape}, {preds.shape} and {labels.shape}"
        )

    n = len(conf)
    if n == 0:
        # Calibration error is undefined without samples.
        return np.float64(np.nan)

    # A stable sort keeps tied confidences in input order, so samples whose
    # ties straddle a bin boundary are always assigned to the same bin.
    order = np.argsort(-conf, kind="stable")
    conf, preds, labels = conf[order], preds[order], labels[order]

    diffs = []
    for idxs in np.array_split(np.arange(n), num_bins):
        # array_split yields empty groups when num_bins > n; averaging a nan
        # gap for them would turn the whole result into nan, so skip them.
        if len(idxs) == 0:
            continue
        bin_conf = np.mean(conf[idxs])
        bin_acc = np.mean(preds[idxs] == labels[idxs])
        diffs.append(abs(bin_acc - bin_conf))
    return np.mean(diffs)

In [6]:
# Hand-built examples: per-sample confidence, predicted label, and true label.
sample_cases = [
    dict(
        conf=[0.9, 0.9, 0.8, 0.8, 0.7, 0.7],
        pred=[1, 1, 1, 1, 1, 1],
        true=[1, 0, 1, 0, 1, 1],
    ),
    dict(
        conf=[0.9, 0.9, 0.9, 0.8, 0.8, 0.7],
        pred=[1, 1, 1, 1, 1, 1],
        true=[1, 1, 1, 0, 0, 0],
    ),
    dict(
        conf=[0.9, 0.9, 0.8, 0.8, 0.6, 0.6],
        pred=[1, 1, 1, 1, 1, 1],
        true=[1, 1, 1, 1, 1, 0],
    ),
]

# Print the three calibration metrics for each example, with a blank line
# between examples.
for case in sample_cases:
    conf = case["conf"]
    pred = case["pred"]
    true = case["true"]
    print("ECE:", ece(conf, pred, true, num_bins=2))
    print("Macro CE:", macro_ce(conf, pred, true, num_bins=2))
    print("Brier score:", brier_score(conf, true))
    print()

ECE: 0.13333333333333341
Macro CE: 0.13333333333333341
Brier score: 0.2800000000000001

ECE: 0.4333333333333333
Macro CE: 0.4333333333333333
Brier score: 0.30000000000000004

ECE: 0.06666666666666665
Macro CE: 0.06666666666666665
Brier score: 0.10333333333333333

