In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, Normalize
from matplotlib.ticker import FixedLocator

# Root folder of the model data. Paths are built by plain string
# concatenation, so the trailing slash is required.
d = '../../RISC/'

k = 25  # number of ensembles run
sigma = 3  # drivers scoring below this implausibility are kept by wave()
COLS = ('n small', 'n medium', 'n large')  # outputs compared with the observations
observations = pd.read_csv(f'{d}farm-size-year.csv')

# drivers are in the order given in fig 10 of the paper
drivers = (
    (False, False, False, False),
    (False, False, False, True),
    (False, False, True, True),
    (False, False, True, False),
    (False, True, False, True),
    (False, True, False, False),
    (False, True, True, True),
    (False, True, True, False),
    (True, False, False, True),
    (True, False, False, False),
    (True, False, True, True),
    (True, False, True, False),
    (True, True, False, True),
    (True, True, False, False),
    (True, True, True, True),
    (True, True, True, False),
)


# Placeholders only: wave() overwrites these globals before scoring drivers.
v_ens = 0
m_ens = 0


def get_fp(driver):
    """ Build the path of the first ensemble run's results for a driver.

        driver: sequence whose first four entries are used as flags; each
                adds 'true-' or 'false-' to the file name, in order.

        Returns the path of the '...-1.csv' file under d + 'results/'.
    """
    flag_parts = ['true-' if driver[pos] else 'false-' for pos in range(4)]
    return d + 'results/farm-ensembles-' + ''.join(flag_parts) + '1.csv'

First, create functions to calculate the variance of each model output across 25 ensemble runs.
The variance of a given output is calculated by measuring the correlation of each ensemble run compared with each other run (for a given driver). The variance across these comparisons is then calculated. The average variance across all drivers is then used for history matching.
From the literature, it seems that you generally have a separate evaluation of ensemble variance per output.


The correlation c gives a result such that 0 <= |c| <= 1. I've found this was not ideal as you start dividing small numbers (i.e. <1) by other small numbers, ending up with tiny implausibility scores making everything "plausible". I've converted all measures of correlation in the model to a measure of error in the range [0, 100], **but, of course, this may need to be reconsidered.** The error is e = (1 - c) * 50, such that 0 indicates no error (perfect positive correlation), 50 indicates no correlation, and 100 indicates perfect negative correlation.

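**I notice that if I instead normalise error to the range [0, 1], then I need to use an implausibility threshold of 0.3 instead of 3. All papers I've seen use a threshold of 3, so I find this curious.** A likely explanation: the model discrepancy term added to the ensemble variance later in this notebook is an error, not a squared error (variance). Rescaling the error by 1/100 shrinks the numerator by 1/100 but the square root of the discrepancy term by only 1/10, so when that term dominates the denominator the implausibility (and therefore the threshold) shrinks by a factor of 10.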

In [2]:
def v_ens_X(driver_indexes):
    """ Average the ensemble variance of each output over a set of drivers.

        driver_indexes: sized iterable of positions into `drivers`.

        Returns a dict mapping every column in COLS to the mean of the
        per-driver variances returned by v_ens_x_corr.
    """
    per_driver = [v_ens_x_corr(drivers[idx]) for idx in driver_indexes]
    n_drivers = len(driver_indexes)
    return {
        col: sum(variances[col] for variances in per_driver) / n_drivers
        for col in COLS
    }


def v_ens_x_corr(driver):
    """ Variance of the pairwise rank correlations between ensemble runs.

        driver: parameter tuple accepted by get_fp.

        For every column in COLS, the Spearman correlation (multiplied by
        50) is computed for each pair of the k ensemble runs, and the sample
        variance (ddof=1) of those values is stored under that column.

        Each run's CSV is read once and shared by all columns rather than
        being re-read for every column.

        Returns a dict mapping column name to variance.
    """
    # get_fp returns the path of run 1 ('...-1.csv'): drop the trailing
    # '1.csv' and append each run number in turn.
    prefix = get_fp(driver)[:-5]
    runs = [pd.read_csv(prefix + str(run + 1) + '.csv') for run in range(k)]

    variances = {}
    for col in COLS:
        series = [run[col].tolist() for run in runs]
        # NOTE(review): range(a, k) also pairs every run with itself. These
        # self-pairs are kept so the results are unchanged - confirm whether
        # they should be excluded (range(a + 1, k)).
        scaled = [
            stats.spearmanr(series[a], series[b])[0] * 50
            for a in range(k)
            for b in range(a, k)
        ]
        variances[col] = np.var(scaled, ddof=1)
    return variances


In [3]:
# Ensemble variance of each output, averaged over every driver.
print(v_ens_X(range(len(drivers))))

{'n small': 0.002090608439150047, 'n medium': 1.3517347526384984, 'n large': 0.5342640874016333}


The variances are quite small.

Next are the functions to calculate the implausibility of a given driver and to run waves of history matching.

In [4]:
def implaus(driver):
    """ Implausibility of a driver using ensemble variance only.

        driver: parameter tuple accepted by get_fp.

        For every output in COLS, the error between the observed and the
        estimated series, (1 - Spearman correlation) * 50, is divided by the
        square root of that output's entry in the global v_ens.

        Returns the largest of those ratios, rounded to 2 decimal places.
    """
    model_output = pd.read_csv(get_fp(driver))
    worst = 0
    for col in COLS:
        observed = observations[col].tolist()
        estimated = model_output[col].tolist()
        rho = stats.spearmanr(observed, estimated).correlation
        error = (1 - rho) * 50
        worst = max(worst, error / np.sqrt(v_ens[col]))
    return round(worst, 2)


def wave(plaus_space):
    """ Run a wave of history matching.

        plaus_space: index of plausible drivers to test.

        Recomputes the global ensemble variance v_ens over plaus_space,
        scores every driver in it with implaus, and keeps the drivers whose
        score is below sigma.

        Returns (new_plaus_space, implaus_scores): the surviving indexes and
        the score of every driver tested, both in plaus_space order.
    """
    # implaus reads v_ens from module scope, so refresh it before scoring.
    global v_ens
    v_ens = v_ens_X(plaus_space)

    implaus_scores = [implaus(drivers[i]) for i in plaus_space]
    new_plaus_space = [
        i for i, score in zip(plaus_space, implaus_scores) if score < sigma
    ]
    return new_plaus_space, implaus_scores


def run_history_matching():
    """ Repeat waves of history matching, printing the scores and the
        surviving drivers of each wave, until a wave leaves nothing
        plausible or removes nothing from the previous plausible space.
    """
    previous = []
    current = range(len(drivers))
    # Each wave can only remove drivers, so equal sizes mean nothing changed.
    while not (len(current) == 0 or len(previous) == len(current)):
        previous = current
        current, scores = wave(previous)
        print('implausiblity scores: ', scores)
        print('new plausible space:', current)


In [5]:
# Run the waves with ensemble variance as the only uncertainty term.
run_history_matching()

implausiblity scores:  [2145.02, 2145.02, 2145.02, 2145.02, 2163.04, 2163.04, 2145.02, 2163.04, 85.91, 82.73, 80.65, 69.91, 42.06, 42.06, 42.06, 42.06]
new plausible space: []


An implausibility of less than 3 indicates that the run had a plausible set of parameters.
These implausibility scores are all large because there is very little variance in the ensembles and ensemble variance is the only uncertainty we've accounted for.
The model isn't perfect - there is some uncertainty in its ability to correctly model the empirical data.
Therefore, we must quantify the uncertainty in the model. This uncertainty is referred to as *model discrepancy* or *model inadequacy*.

How do we quantify model discrepancy? My ideas are, we could calculate
1. One score across all plausible models and outputs (averaging their model discrepancies).
2. One score per output, averaging the uncertainty of predicting each output across all plausible models.
3. One score per model, averaging the uncertainty of the model's ability to predict all outputs.
4. One score per model and per output - quantifying each model's ability to predict each output.

The first two average model discrepancy across all models, the third and fourth have a separate score per model.
The first and third average over outputs, whilst second and fourth have separate scores per output.

Currently, ensemble variance is averaged across models but there's a separate ensemble variance measurement per output. This would fit with idea 2.

We'll start with the first in the list.
We measure model discrepancy by calculating the error in correlation between the model results and the empirical data.

### 1. One score across all plausible models (averaging their model discrepancies)

For a given model, we calculate this error (complement of correlation) for all three outputs and take the largest result to represent the uncertainty of that model.
We then average the model uncertainty across all plausible models.

We must also update the implaus and wave functions to include the new model discrepancy term m_ens.

In [6]:
def m_ens_X(plaus_space):
    """ Calculate a single model discrepancy score.

        plaus_space: sized iterable of positions into `drivers`.

        Each model's uncertainty is its largest error over the outputs in
        COLS, where the error is (1 - Spearman correlation with the
        observations) * 50.

        Returns the mean of those per-model values over plaus_space.
    """
    total = 0
    for di in plaus_space:
        model_output = pd.read_csv(get_fp(drivers[di]))
        errors = [
            (1 - stats.spearmanr(model_output[col].tolist(),
                                 observations[col].tolist()).correlation) * 50
            for col in COLS
        ]
        # Pessimistic choice: the worst-predicted output stands for the
        # whole model (never less than 0).
        total += max([0] + errors)
    return total / len(plaus_space)

def implaus(driver):
    """ Implausibility of a driver with a single model discrepancy term.

        driver: parameter tuple accepted by get_fp.

        For every output in COLS, the error (1 - Spearman correlation with
        the observations) * 50 is divided by the square root of that
        output's ensemble variance (global v_ens) plus the scalar global
        m_ens.

        Returns the largest of those ratios, rounded to 2 decimal places.
    """
    model_output = pd.read_csv(get_fp(driver))
    worst = 0
    for col in COLS:
        rho = stats.spearmanr(observations[col].tolist(),
                              model_output[col].tolist()).correlation
        uncertainty = np.sqrt(v_ens[col] + m_ens)
        worst = max(worst, (1 - rho) * 50 / uncertainty)
    return round(worst, 2)

def wave(plaus_space):
    """ Run a wave of history matching.

        plaus_space: index of plausible drivers to test.

        Recomputes the global ensemble variance v_ens and the global model
        discrepancy m_ens over plaus_space, scores every driver in it with
        implaus, and keeps the drivers whose score is below sigma.

        Returns (new_plaus_space, implaus_scores): the surviving indexes and
        the score of every driver tested, both in plaus_space order.
    """
    # implaus reads both terms from module scope, so refresh them first.
    global v_ens, m_ens
    v_ens = v_ens_X(plaus_space)
    m_ens = m_ens_X(plaus_space)

    implaus_scores = [implaus(drivers[i]) for i in plaus_space]
    new_plaus_space = [
        i for i, score in zip(plaus_space, implaus_scores) if score < sigma
    ]
    return new_plaus_space, implaus_scores


# Re-run the waves using the m_ens_X, implaus and wave definitions from this cell.
run_history_matching()

implausiblity scores:  [11.853961294015207, 11.903215662968012, 11.903215662968012, 11.903215662968012, 11.953574414132982, 11.953574414132982, 11.853961294015207, 11.953574414132982, 7.559995629312552, 7.280194617899933, 7.097775162781711, 6.152510713532746, 2.2108599716477055, 2.1309909075174063, 2.1114952225889168, 2.0618128480595215]
new plausible space: [12, 13, 14, 15]
implausiblity scores:  [4.364117158681839, 4.206459976549616, 4.1679765564290445, 4.069906255302645]
new plausible space: []


In the first wave of history matching, we find that only the final four drivers are plausible. This matches the results in the paper.
In the second wave, none of the space is considered plausible.

### 2. One score per output, averaging the uncertainty of predicting each output across all plausible models

This method matches with how we measure ensemble variance.

In [9]:
def m_ens_X(plaus_space):
    """ Calculate one model discrepancy score per output.

        plaus_space: sized iterable of positions into `drivers`.

        Returns a dict mapping every column in COLS to the error
        (1 - Spearman correlation with the observations) * 50 averaged over
        the models in plaus_space.
    """
    n_models = len(plaus_space)
    discrepancy = {col: 0 for col in COLS}
    for di in plaus_space:
        model_output = pd.read_csv(get_fp(drivers[di]))
        for col in COLS:
            rho = stats.spearmanr(model_output[col].tolist(),
                                  observations[col].tolist()).correlation
            # add this model's share of the per-output average
            discrepancy[col] += (1 - rho) * 50 / n_models
    return discrepancy

def implaus(driver):
    """ Implausibility of a driver with one model discrepancy term per output.

        driver: parameter tuple accepted by get_fp.

        For every output in COLS, the error (1 - Spearman correlation with
        the observations) * 50 is divided by the square root of that
        output's entries in the globals v_ens and m_ens added together.

        Returns the largest of those ratios, rounded to 2 decimal places.
    """
    model_output = pd.read_csv(get_fp(driver))
    worst = 0
    for col in COLS:
        rho = stats.spearmanr(observations[col].tolist(),
                              model_output[col].tolist()).correlation
        uncertainty = np.sqrt(v_ens[col] + m_ens[col])
        worst = max(worst, (1 - rho) * 50 / uncertainty)
    return round(worst, 2)


# Re-run the waves using this cell's m_ens_X and implaus. wave is not
# redefined here, so the most recently executed definition is used.
run_history_matching()

implausiblity scores:  [14.008170770452926, 14.324739601423616, 14.324739601423616, 14.324739601423616, 13.964872359354736, 13.964872359354736, 13.84849842302678, 13.964872359354736, 12.201440336914766, 11.749856035235757, 11.455440508156956, 9.929832776930446, 3.568213178951998, 3.439308657238712, 3.407843634222041, 3.327658861857061]
new plausible space: []


Having a model discrepancy term per output (averaged across models) has resulted in an empty plausibility space.

### 3. One score per model, averaging the uncertainty of the model's ability to predict all outputs

In [7]:
def m_ens_X(plaus_space):
    """ Calculate one model discrepancy score per model.

        plaus_space: iterable of positions into `drivers`.

        Returns a dict mapping each index in plaus_space to that model's
        error, (1 - Spearman correlation with the observations) * 50,
        averaged over the outputs in COLS.
    """
    # Divide by the real number of outputs rather than a hard-coded 3, so
    # the mean stays correct if COLS changes.
    n_outputs = len(COLS)
    results = dict((di, 0) for di in plaus_space)
    for di in plaus_space:
        df = pd.read_csv(get_fp(drivers[di]))
        for col in COLS:
            est = df[col].tolist()
            obs = observations[col].tolist()
            c = (1 - stats.spearmanr(est, obs).correlation) * 50
            # add this output's share of the per-model average
            results[di] += (c / n_outputs)
    return results

def implaus(driver):
    """ Implausibility of a driver with one model discrepancy term per model.

        driver: parameter tuple accepted by get_fp; it must be an element of
                `drivers`, because its position there is the key into m_ens.

        For every output in COLS, the error (1 - Spearman correlation with
        the observations) * 50 is divided by the square root of that
        output's ensemble variance (global v_ens) plus this model's entry in
        the global m_ens.

        Returns the largest of those ratios, rounded to 2 decimal places.
    """
    estimates = pd.read_csv(get_fp(driver))
    # The model's discrepancy does not depend on the output, so look it up
    # once instead of searching `drivers` again for every column.
    model_discrepancy = m_ens[drivers.index(driver)]
    worst = 0
    for col in COLS:
        this_obs = observations[col].tolist()
        this_est = estimates[col].tolist()
        diff = (1 - stats.spearmanr(this_obs, this_est).correlation) * 50
        worst = max(worst,
                    diff / np.sqrt(v_ens[col] + model_discrepancy))
    return round(worst, 2)


# Re-run the waves using this cell's m_ens_X and implaus. wave is not
# redefined here, so the most recently executed definition is used.
run_history_matching()

implausiblity scores:  [11.729980875343529, 11.73649275418363, 11.73649275418363, 11.73649275418363, 12.030172878357602, 11.98721104857536, 12.2731288808095, 12.013883469192224, 13.32763548309953, 13.064357199025919, 12.889850810825871, 11.945429394165693, 6.755937697519662, 6.611571253941545, 6.5758941235716115, 6.484170343445727]
new plausible space: []


Having a model discrepancy term per model (averaging uncertainty over outputs) has resulted in an empty plausibility space.

### 4. One score per model and per output - quantifying each model's ability to predict each output

In [8]:
def m_ens_X(plaus_space):
    """ Calculate one model discrepancy score per model and per output.

        plaus_space: iterable of positions into `drivers`.

        Returns a nested dict: results[index][column] is that model's error
        for that output, (1 - Spearman correlation with the observations)
        * 50. Nothing is averaged.
    """
    results = {}
    for di in plaus_space:
        model_output = pd.read_csv(get_fp(drivers[di]))
        results[di] = {
            col: (1 - stats.spearmanr(model_output[col].tolist(),
                                      observations[col].tolist()).correlation) * 50
            for col in COLS
        }
    return results

def implaus(driver):
    """ Implausibility of a driver with a discrepancy term per model and output.

        driver: parameter tuple accepted by get_fp; it must be an element of
                `drivers`, because its position there is the key into m_ens.

        For every output in COLS, the error (1 - Spearman correlation with
        the observations) * 50 is divided by the square root of that
        output's ensemble variance (global v_ens) plus this model's entry
        for that output in the global m_ens.

        Returns the largest of those ratios, rounded to 2 decimal places.
    """
    estimates = pd.read_csv(get_fp(driver))
    # The model's row of discrepancies is the same for every output, so
    # look it up once instead of searching `drivers` again per column.
    model_discrepancy = m_ens[drivers.index(driver)]
    worst = 0
    for col in COLS:
        this_obs = observations[col].tolist()
        this_est = estimates[col].tolist()
        diff = (1 - stats.spearmanr(this_obs, this_est).correlation) * 50
        worst = max(worst,
                    diff / np.sqrt(v_ens[col] + model_discrepancy[col]))
    return round(worst, 2)


# Re-run the waves using this cell's m_ens_X and implaus. wave is not
# redefined here, so the most recently executed definition is used.
run_history_matching()

implausiblity scores:  [9.903273828035172, 9.905399595207578, 9.905399595207578, 9.905399595207578, 9.94479805410099, 9.94479805410099, 9.903273828035172, 9.94479805410099, 7.890662362852677, 7.742011016977813, 7.643539903044827, 7.1114831193218215, 4.224209607349607, 4.145010809292585, 4.125448621692247, 4.075173649066611]
new plausible space: []


Having a model discrepancy term per model and per output has resulted in an empty plausibility space.

Having tested four different methods of quantifying the uncertainty in the model, only having a single value of uncertainty (averaged across all models and all outputs) has produced promising results.
**Of course, this does not mean this is definitely the best approach. There may be something else that has been missed or should be considered.**