# Benchmark accuracy notebook.

The results in this notebook represent the benchmark accuracy of the modelling.

The results are calculated using the 7 subregion time series in the South West.  

> **The notebook generates Tables 3 and 4 in the paper.**

* Table 3 file name: ./paper/tables/table3.tex  (horizon summary)
* Table 4 file name: ./paper/tables/table4.tex  (forecast distribution by region)

In [None]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

from scipy.stats import norm, t
import numpy as np

In [None]:
# Show the current working directory. The relative paths used in this
# notebook ('./results/...', './paper/...') assume it is run from the
# top-level directory.
os.getcwd()

In [None]:
# Directory that the generated LaTeX tables are written to.
TABLE_PATH = './paper/tables/'

# Read in individual results files.

In [None]:
# Directory holding the benchmark result files.
mypath = './results/benchmark/'

# Keep regular files only; sub-directories are skipped.
result_files = [entry for entry in listdir(mypath)
                if isfile(join(mypath, entry))]

In [None]:
# Display the discovered result file names as a quick sanity check.
result_files

In [None]:
# Summary tables (mean/std, median/IQR and a formatted 'mean (std)' string);
# each starts empty and gains columns as the result files are read.
results_mean, results_med, results_mean_std = (pd.DataFrame() for _ in range(3))

# Hold the results from all splits by horizon (columns) for MASE and for
# the 95% / 80% coverage; each starts empty.
results_mase, results_cover_95, results_cover_80 = (pd.DataFrame() for _ in range(3))

In [None]:
# Error measures to load: three accuracy measures followed by the five
# prediction-interval coverage levels.
error_measures = ['smape', 'rmse', 'mase']
error_measures += [f'coverage_{level}' for level in (60, 70, 80, 90, 95)]

In [None]:
# Load the result files one error metric at a time.
two_dp = '{:,.2f}'.format

for metric in error_measures:
    # Files for this metric are those whose name contains the metric; the
    # model name is the part of the file name before the first underscore.
    to_read = [fname for fname in result_files if metric in fname]
    model_names = [fname[:fname.index('_')] for fname in to_read]

    for filename, model_name in zip(to_read, model_names):
        df = pd.read_csv(mypath + filename, index_col=0)
        prefix = model_name + '_' + metric

        # Column-wise location and spread of the file's values.
        results_mean[prefix + '_mean'] = df.mean()
        results_mean[prefix + '_std'] = df.std()
        results_med[prefix + '_med'] = df.median()
        results_med[prefix + '_iqr'] = df.quantile(0.75) - df.quantile(0.25)

        # Formatted "mean (std)" string, read back from results_mean.
        results_mean_std[prefix] = (
            results_mean[prefix + '_mean'].map(two_dp)
            + ' (' + results_mean[prefix + '_std'].map(two_dp) + ')'
        )

        # Pool the raw values of every file except the Trust-level ones.
        if filename.startswith('Trust'):
            continue

        if metric == 'mase':
            results_mase = pd.concat([results_mase, df.copy()])
        elif metric == 'coverage_80':
            results_cover_80 = pd.concat([results_cover_80, df.copy()])
        elif metric == 'coverage_95':
            results_cover_95 = pd.concat([results_cover_95, df.copy()])

In [None]:
def remove_invalid(df, coverage):
    '''Return only the sub-region coverage columns of df.

    The Trust-level column is not part of the selection.

    df: DataFrame containing a column named
        '<region>-fbp-arima_coverage_<coverage>_mean' for each sub-region.
    coverage: str, the coverage level as used in the column names, e.g. '80'.

    Raises KeyError if any of the expected columns is missing.
    '''
    sub_regions = ('Cornwall', 'Devon', 'Dorset', 'Wiltshire', 'BNSSG',
                   'Gloucestershire', 'Somerset')
    suffix = '-fbp-arima_coverage_' + coverage + '_mean'

    wanted = [region + suffix for region in sub_regions]
    return df[wanted]

In [None]:
## Is this section still needed?

In [None]:
#Variability of PI Coverage across regions in the South West of England

# 7, 14, ..., 84
days = list(range(7, 85, 7))

# For every coverage level: take the mean-coverage columns of the 'fbp-'
# models, then let remove_invalid limit them to the sub-regions (the trust
# is excluded).
summary_60, summary_70, summary_80, summary_90, summary_95 = (
    remove_invalid(
        results_mean.filter(like=f"coverage_{level}")
                    .filter(like="fbp-")
                    .filter(like='mean'),
        level,
    )
    for level in ('60', '70', '80', '90', '95')
)


# Table 4: Prediction interval coverage by region (full forecast distribution)

In [None]:
def coverage_summary(data, alpha=0.05):
    '''Mean and t-based confidence interval for each column of data.

    data: DataFrame; every column is summarised over its rows.  Column
        names are expected to look like
        '<region>-fbp-arima_coverage_<NN>_mean'; the last 7 characters of
        the first column name are used to rebuild that suffix.
    alpha: significance level of the interval (0.05 gives a 95% interval).

    Returns a DataFrame with the columns 'mean', 'lci' and 'uci', indexed
    by the column names of data with the suffix removed.
    '''
    n_obs = len(data)
    centre = data.mean()

    # half width = t critical value (n - 1 degrees of freedom) * std error
    std_err = data.std(ddof=1) / np.sqrt(n_obs)
    t_crit = np.abs(t.ppf(alpha / 2, n_obs - 1))
    half_width = t_crit * std_err

    ci = pd.DataFrame({'mean': centre,
                       'lci': centre - half_width,
                       'uci': centre + half_width})

    # e.g. '80_mean' -> strip '-fbp-arima_coverage_80_mean' from each label
    tail = data.columns[0][-7:]
    suffix = '-fbp-arima_coverage_' + tail
    ci.index = [name.replace(suffix, '') for name in ci.index]
    return ci

In [None]:
def convert_to_single_columns(summary_frame, coverage):
    '''Collapse the mean and interval columns into one formatted column.

    summary_frame: DataFrame with the numeric columns 'mean', 'lci', 'uci'.
    coverage: label of the coverage level; the new column is named
        '<coverage>%'.

    Returns a new DataFrame, sorted by index, in which 'mean', 'lci' and
    'uci' are replaced by a single string column formatted as
    'mean (lci - uci)' with three decimal places.  The frame passed in is
    left unchanged (earlier versions added the formatted column to it as a
    side effect).
    '''
    three_dp = '{:,.3f}'.format
    combined = (summary_frame['mean'].map(three_dp)
                + ' (' + summary_frame['lci'].map(three_dp)
                + ' - ' + summary_frame['uci'].map(three_dp) + ')')

    # drop() and assign() both return new frames, so the caller's data is
    # never modified.
    result = summary_frame.drop(['mean', 'lci', 'uci'], axis=1)
    result = result.assign(**{f'{coverage}%': combined})
    return result.sort_index()

In [None]:
def single_data_frame(coverage_frames):
    '''Join the frames in coverage_frames side by side.

    The first frame is the starting point and each further frame is
    concatenated to its right.  Because ignore_index=True is used, the
    column labels of a joined result are replaced by 0, 1, 2, ...
    A sequence holding a single frame returns that frame as it is.
    '''
    combined = coverage_frames[0]

    for extra in coverage_frames[1:]:
        combined = pd.concat([combined, extra], ignore_index=True, axis=1)

    return combined

In [None]:
# One formatted 'mean (lci - uci)' column per coverage level.
targets = ['60', '70', '80', '90', '95']
labels = [level + '%' for level in targets]

coverage_frames = []
for target in targets:
    # mean coverage columns of the 'fbp-' models at this level ...
    raw = (results_mean
           .filter(like=f"coverage_{target}")
           .filter(like="fbp-")
           .filter(like='mean'))
    # ... limited to the sub-regions by remove_invalid
    raw = remove_invalid(raw, target)

    summary_frame = coverage_summary(raw)
    coverage_frame = convert_to_single_columns(summary_frame, target)
    coverage_frames.append(coverage_frame)

# Place the per-level columns side by side and label them '60%' ... '95%'.
summary_frame = pd.concat(coverage_frames, ignore_index=True, axis=1)
summary_frame.columns = labels

summary_frame

In [None]:
# Write the coverage-by-region summary to TABLE_PATH as table4.tex.
summary_frame.to_latex(f'{TABLE_PATH}table4.tex')

In [None]:
#is this section still needed?

In [None]:
def remove_invalid(df, metric):
    '''Return only the sub-region columns of df for one error metric.

    The Trust-level column is not part of the selection.

    df: DataFrame containing a column named
        '<region>-fbp-arima_<metric>_mean' for each sub-region.
    metric: str, the metric part of the column names, e.g. 'mase' or
        'coverage_80'.  A bare coverage level such as '80' is also accepted
        and treated as 'coverage_80'.  This definition replaces the earlier
        remove_invalid(df, coverage) in this file, so accepting the bare
        level keeps cells written against that version working when they
        are re-run.

    Raises KeyError if any of the expected columns is missing.
    '''
    valid = ['Cornwall', 'Devon', 'Dorset', 'Wiltshire', 'BNSSG',
             'Gloucestershire', 'Somerset']

    # Digits only means a coverage level in the style of the earlier
    # definition.
    if metric.isdigit():
        metric = 'coverage_' + metric

    columns = [region + '-fbp-arima_' + metric + '_mean' for region in valid]
    return df[columns]

In [None]:
#Variability of MASE across regions in the South West of England

fig, ax = plt.subplots(1, 1, figsize=(12,4), sharey='row')

# tick labels, one per box
days = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84]

summary_mase = results_mean.filter(like="mase").filter(like="fbp-").filter(like='mean')

# limit the results to the sub-regions (the trust is excluded)
summary_mase = remove_invalid(summary_mase, 'mase')

# One box per row of summary_mase (presumably the 12 forecast horizons, to
# match `days`), each built from the regional values in that row.
# Matplotlib draws one box per *column* of a 2-D array, so the transposed
# values are passed explicitly.  Handing over the DataFrame itself relies
# on how the installed Matplotlib unpacks pandas objects and can give one
# box per region, which does not match the 12 labels.
ax.boxplot(x=summary_mase.T.to_numpy(), labels=days);

ax.xaxis.grid(False)
ax.set_xlabel('horizon (days)')


# Uncomment to save the figure:
#fig.savefig('mase_cv_by_region.png', dpi=300, bbox_inches='tight')

In [None]:
#summary of mase by forecast horizon - pool the regions (trust level excluded)
#NOTE THESE ARE a summary of MEAN MASE for each forecast horizon.

alpha = 0.05

# Row-wise statistics: every row of summary_mase is summarised across its
# columns.
mean = summary_mase.mean(axis=1)
std = summary_mase.std(axis=1, ddof=1)
median = summary_mase.quantile(0.5, axis=1)
lowerq = summary_mase.quantile(0.25, axis=1)
upperq = summary_mase.quantile(0.75, axis=1)
maximum = summary_mase.max(axis=1)
minimum = summary_mase.min(axis=1)
per_5 = summary_mase.quantile(0.05, axis=1)
per_95 = summary_mase.quantile(0.95, axis=1)

# Sample size behind each row statistic = number of columns pooled.
# Derived from the data instead of being hard-coded to 7.
n = summary_mase.shape[1]

#Confidence interval calculation (t distribution, n - 1 degrees of freedom)
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower = mean - hw
upper = mean + hw

horizon_results = pd.DataFrame(mean, columns=['mean'])
horizon_results['lower'] = lower
horizon_results['upper'] = upper
horizon_results['median'] = median
horizon_results['lowerq'] = lowerq
horizon_results['upperq'] = upperq
horizon_results[r'$P_5$'] = per_5.map('{:,.2f}'.format)
horizon_results[r'$P_{95}$'] = per_95.map('{:,.2f}'.format)

horizon_results['mean (95% CI)'] = (
    horizon_results['mean'].map('{:,.2f}'.format)
    + ' (' + horizon_results['lower'].map('{:,.2f}'.format)
    + ' - ' + horizon_results['upper'].map('{:,.2f}'.format) + ')'
)

horizon_results['median (IQR)'] = (
    horizon_results['median'].map('{:,.2f}'.format)
    + ' (' + horizon_results['lowerq'].map('{:,.2f}'.format)
    + ' - ' + horizon_results['upperq'].map('{:,.2f}'.format) + ')'
)

# Show the formatted columns only, selected by name rather than position.
columns = ['mean (95% CI)', 'median (IQR)', r'$P_5$', r'$P_{95}$']
horizon_results[columns]

# Table 3: A summary of forecast accuracy by horizon (all regions pooled)

In [None]:
# Summary of MASE using the data from ALL splits for all regions, pooled
# (trust level excluded).  Statistics are taken down each column of
# results_mase.
alpha = 0.05
two_dp = '{:,.2f}'.format

summary_mase = results_mase
n = results_mase.shape[0]

mean = summary_mase.mean(axis=0)
std = summary_mase.std(axis=0, ddof=1)
minimum = summary_mase.min(axis=0)
maximum = summary_mase.max(axis=0)
per_5 = summary_mase.quantile(0.05, axis=0)
lowerq = summary_mase.quantile(0.25, axis=0)
median = summary_mase.quantile(0.5, axis=0)
upperq = summary_mase.quantile(0.75, axis=0)
per_95 = summary_mase.quantile(0.95, axis=0)

# Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

# Numeric columns first, then their formatted string versions.
horizon_results = pd.DataFrame(mean, columns=['mean'])
horizon_results['lower'] = lower
horizon_results['upper'] = upper
horizon_results['median'] = median
horizon_results['lowerq'] = lowerq
horizon_results['upperq'] = upperq
horizon_results[r'$Q_5$'] = per_5.map(two_dp)
horizon_results[r'$Q_{95}$'] = per_95.map(two_dp)

horizon_results['mean (95% CI)'] = (
    horizon_results['mean'].map(two_dp)
    + ' (' + horizon_results['lower'].map(two_dp)
    + ' - ' + horizon_results['upper'].map(two_dp) + ')'
)
horizon_results['median (IQR)'] = (
    horizon_results['median'].map(two_dp)
    + ' (' + horizon_results['lowerq'].map(two_dp)
    + ' - ' + horizon_results['upperq'].map(two_dp) + ')'
)

horizon_results.index.name = 'h'

# Keep the formatted columns only.
columns = ['mean (95% CI)', 'median (IQR)', r'$Q_5$', r'$Q_{95}$']
horizon_mase = horizon_results[columns]

In [None]:
# Summary of the 80% coverage using the data pooled in results_cover_80
# (trust level excluded).  Statistics are taken down each column.
alpha = 0.05
two_dp = '{:,.2f}'.format
three_dp = '{:,.3f}'.format

n = results_cover_80.shape[0]

mean = results_cover_80.mean(axis=0)
std = results_cover_80.std(axis=0, ddof=1)
minimum = results_cover_80.min(axis=0)
maximum = results_cover_80.max(axis=0)
per_5 = results_cover_80.quantile(0.05, axis=0)
lowerq = results_cover_80.quantile(0.25, axis=0)
median = results_cover_80.quantile(0.5, axis=0)
upperq = results_cover_80.quantile(0.75, axis=0)
per_95 = results_cover_80.quantile(0.95, axis=0)

# Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

# Numeric columns first, then their formatted string versions.
horizon_results = pd.DataFrame(mean, columns=['mean'])
horizon_results['lower'] = lower
horizon_results['upper'] = upper
horizon_results['median'] = median
horizon_results['lowerq'] = lowerq
horizon_results['upperq'] = upperq
horizon_results[r'$Q_5$'] = per_5.map(two_dp)
horizon_results[r'$Q_{95}$'] = per_95.map(two_dp)

horizon_results['mean (95% CI)'] = (
    horizon_results['mean'].map(three_dp)
    + ' (' + horizon_results['lower'].map(three_dp)
    + ' - ' + horizon_results['upper'].map(three_dp) + ')'
)
# NOTE(review): the median uses 2 decimal places here but 3 in the 95%
# coverage cell - confirm which is intended.
horizon_results['median (IQR)'] = (
    horizon_results['median'].map(two_dp)
    + ' (' + horizon_results['lowerq'].map(two_dp)
    + ' - ' + horizon_results['upperq'].map(two_dp) + ')'
)

horizon_results.index.name = 'h'

# Keep the formatted columns only.
columns = ['mean (95% CI)', 'median (IQR)', r'$Q_5$', r'$Q_{95}$']
horizon_80 = horizon_results[columns]

In [None]:
# Summary of the 95% coverage using the data pooled in results_cover_95
# (trust level excluded).  Statistics are taken down each column.
alpha = 0.05
two_dp = '{:,.2f}'.format
three_dp = '{:,.3f}'.format

n = results_cover_95.shape[0]

mean = results_cover_95.mean(axis=0)
std = results_cover_95.std(axis=0, ddof=1)
minimum = results_cover_95.min(axis=0)
maximum = results_cover_95.max(axis=0)
per_5 = results_cover_95.quantile(0.05, axis=0)
lowerq = results_cover_95.quantile(0.25, axis=0)
median = results_cover_95.quantile(0.5, axis=0)
upperq = results_cover_95.quantile(0.75, axis=0)
per_95 = results_cover_95.quantile(0.95, axis=0)

# Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

# Numeric columns first, then their formatted string versions.
horizon_results = pd.DataFrame(mean, columns=['mean'])
horizon_results['lower'] = lower
horizon_results['upper'] = upper
horizon_results['median'] = median
horizon_results['lowerq'] = lowerq
horizon_results['upperq'] = upperq
horizon_results[r'$Q_5$'] = per_5.map(two_dp)
horizon_results[r'$Q_{95}$'] = per_95.map(two_dp)

horizon_results['mean (95% CI)'] = (
    horizon_results['mean'].map(three_dp)
    + ' (' + horizon_results['lower'].map(three_dp)
    + ' - ' + horizon_results['upper'].map(three_dp) + ')'
)
horizon_results['median (IQR)'] = (
    horizon_results['median'].map(three_dp)
    + ' (' + horizon_results['lowerq'].map(three_dp)
    + ' - ' + horizon_results['upperq'].map(three_dp) + ')'
)

horizon_results.index.name = 'h'

# Keep the formatted columns only.
columns = ['mean (95% CI)', 'median (IQR)', r'$Q_5$', r'$Q_{95}$']
horizon_95 = horizon_results[columns]

In [None]:
# Table 3: the formatted 'mean (95% CI)' column of each summary, side by side.
horizon_results = pd.concat(
    [frame['mean (95% CI)'] for frame in (horizon_mase, horizon_80, horizon_95)],
    axis=1,
)
horizon_results.columns = ['MASE', 'Coverage 80%', 'Coverage 95%']

# Save as LaTeX, then display.
horizon_results.to_latex(f'{TABLE_PATH}table3.tex')
horizon_results

# Produce an overall mean and 'rule of thumb' benchmark for forecasters

## MASE

In [None]:
alpha = 0.05

# Every value held in summary_mase, as one flat sample.
mase_values = summary_mase.to_numpy().flatten()
n = mase_values.shape[0]

#overall MASE
mean = mase_values.mean()
print(mean)

#sample std
std = mase_values.std(ddof=1)

#Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

#lower and upper 95% CI
print(lower, upper)

#median and IQR
print(np.percentile(mase_values, 50))
print(np.percentile(mase_values, 75) - np.percentile(mase_values, 25))

#middle 90% of data lies between
print(np.percentile(mase_values, 5))
print(np.percentile(mase_values, 95))

plt.hist(mase_values);

## sMAPE

In [None]:
#Variability of sMAPE across regions in the South West of England

fig, ax = plt.subplots(1, 1, figsize=(12,4), sharey='row')

# tick labels, one per box
days = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84]

# NOTE(review): unlike the MASE figure, this selection is not limited to
# the sub-regions, so a Trust-level column is kept if one exists - confirm
# that is intended.
summary_smape = results_mean.filter(like="smape").filter(like="fbp-").filter(like='mean')

# One box per row of summary_smape (presumably the 12 forecast horizons, to
# match `days`).  Matplotlib draws one box per *column* of a 2-D array, so
# the transposed values are passed explicitly.  Handing over the DataFrame
# itself relies on how the installed Matplotlib unpacks pandas objects and
# can give one box per region, which does not match the 12 labels.
ax.boxplot(x=summary_smape.T.to_numpy(), labels=days);

ax.xaxis.grid(False)
ax.set_xlabel('horizon (days)')

# Uncomment to save the figure (file name no longer clashes with the MASE plot):
#fig.savefig('smape_cv_by_region.png', dpi=300, bbox_inches='tight')

In [None]:
alpha = 0.05

# Every value held in summary_smape, as one flat sample.
smape_values = summary_smape.to_numpy().flatten()
n = smape_values.shape[0]

#overall sMAPE
mean = smape_values.mean()
print(mean)

#sample std
std = smape_values.std(ddof=1)
print(std)

#Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

#lower and upper 95% CI
print(lower, upper)

#median and IQR
print(np.percentile(smape_values, 50))
print(np.percentile(smape_values, 75) - np.percentile(smape_values, 25))

#middle 90% of data lies between
print(np.percentile(smape_values, 5))
print(np.percentile(smape_values, 95))

In [None]:
#Variability of coverage 80 across regions in the South West of England

fig, ax = plt.subplots(1, 1, figsize=(12,4), sharey='row')

# tick labels, one per box
days = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84]

# NOTE(review): unlike the MASE figure, this selection is not limited to
# the sub-regions, so a Trust-level column is kept if one exists - confirm
# that is intended.
summary_coverage_80 = results_mean.filter(like="coverage_80").filter(like="fbp-").filter(like='mean')

# One box per row of summary_coverage_80 (presumably the 12 forecast
# horizons, to match `days`).  Matplotlib draws one box per *column* of a
# 2-D array, so the transposed values are passed explicitly.  Handing over
# the DataFrame itself relies on how the installed Matplotlib unpacks
# pandas objects and can give one box per region, which does not match the
# 12 labels.
ax.boxplot(x=summary_coverage_80.T.to_numpy(), labels=days);

ax.xaxis.grid(False)
ax.set_xlabel('horizon (days)')

# Uncomment to save the figure (file name no longer clashes with the MASE plot):
#fig.savefig('coverage_80_cv_by_region.png', dpi=300, bbox_inches='tight')

In [None]:
alpha = 0.05

# Every value held in summary_coverage_80, as one flat sample.
cover_80_values = summary_coverage_80.to_numpy().flatten()
n = cover_80_values.shape[0]

#overall cover 80
mean = cover_80_values.mean()
print(mean)

#sample std
std = cover_80_values.std(ddof=1)

#Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

#lower and upper 95% CI
print(lower, upper)

#median and IQR
print(np.percentile(cover_80_values, 50))
print(np.percentile(cover_80_values, 75) - np.percentile(cover_80_values, 25))

#middle 90% of data lies between
print(np.percentile(cover_80_values, 5))
print(np.percentile(cover_80_values, 95))

In [None]:
#Variability of coverage 95 across regions in the South West of England

fig, ax = plt.subplots(1, 1, figsize=(12,4), sharey='row')

# tick labels, one per box
days = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84]

# NOTE(review): unlike the MASE figure, this selection is not limited to
# the sub-regions, so a Trust-level column is kept if one exists - confirm
# that is intended.
summary_coverage_95 = results_mean.filter(like="coverage_95").filter(like="fbp-").filter(like='mean')

# One box per row of summary_coverage_95 (presumably the 12 forecast
# horizons, to match `days`).  Matplotlib draws one box per *column* of a
# 2-D array, so the transposed values are passed explicitly.  Handing over
# the DataFrame itself relies on how the installed Matplotlib unpacks
# pandas objects and can give one box per region, which does not match the
# 12 labels.
ax.boxplot(x=summary_coverage_95.T.to_numpy(), labels=days);

ax.xaxis.grid(False)
ax.set_xlabel('horizon (days)')

# Uncomment to save the figure (file name no longer clashes with the MASE plot):
#fig.savefig('coverage_95_cv_by_region.png', dpi=300, bbox_inches='tight')

In [None]:
alpha = 0.05

# Every value held in summary_coverage_95, as one flat sample.
cover_95_values = summary_coverage_95.to_numpy().flatten()
n = cover_95_values.shape[0]

#overall cover 95
mean = cover_95_values.mean()
print(mean)

#sample std
std = cover_95_values.std(ddof=1)

#Confidence interval: mean +/- t critical value * standard error
se = std / np.sqrt(n)
z = np.abs(t.ppf(alpha / 2, n - 1))
hw = z * se
lower, upper = mean - hw, mean + hw

#lower and upper 95% CI
print(lower, upper)

#median and IQR
print(np.percentile(cover_95_values, 50))
print(np.percentile(cover_95_values, 75) - np.percentile(cover_95_values, 25))

#middle 90% of data lies between
print(np.percentile(cover_95_values, 5))
print(np.percentile(cover_95_values, 95))

# Overall accuracy summary by region (not in paper)

In [None]:
# Split results_mean into its '..._mean' and '..._std' columns once, then
# average each selected column over its rows.  sort_index() orders every
# result by column name.
mean_columns = results_mean.filter(like='mean')
std_columns = results_mean.filter(like='std')

region_means = mean_columns.filter(like="mase").mean().sort_index()
region_std = std_columns.filter(like="mase").mean().sort_index()

region_80_mean = mean_columns.filter(like="coverage_80").mean().sort_index()
region_80_std = std_columns.filter(like="coverage_80").mean().sort_index()

region_95_mean = mean_columns.filter(like="coverage_95").mean().sort_index()
region_95_std = std_columns.filter(like="coverage_95").mean().sort_index()

In [None]:
# Row labels: the entries of region_means' index without '_mase_mean'.
comparisons = region_means.index.to_list()
idx = [name.replace('_mase_mean', '') for name in comparisons]

In [None]:
two_dp = '{:,.2f}'.format

# Numeric inputs, combined by position (the index of each series is dropped
# by to_numpy()).
df_regions = pd.DataFrame({
    'mean': region_means.to_numpy(),
    'std': region_std.to_numpy(),
    'mean_80': region_80_mean.to_numpy(),
    'std_80': region_80_std.to_numpy(),
    'mean_95': region_95_mean.to_numpy(),
    'std_95': region_95_std.to_numpy(),
})

# Formatted "mean (std)" strings for each measure.
df_regions['MASE'] = (df_regions['mean'].map(two_dp)
                      + ' (' + df_regions['std'].map(two_dp) + ')')
df_regions['Coverage 80'] = (df_regions['mean_80'].map(two_dp)
                             + ' (' + df_regions['std_80'].map(two_dp) + ')')
df_regions['Coverage 95'] = (df_regions['mean_95'].map(two_dp)
                             + ' (' + df_regions['std_95'].map(two_dp) + ')')

# Label the rows and keep the formatted columns only.
df_regions.index = idx
df_regions = df_regions.drop(['mean', 'std', 'mean_80', 'std_80',
                              'mean_95', 'std_95'], axis=1)

# Shorten the row labels by removing the '-fbp-arima' part.
idx = df_regions.index
labels = [name.replace('-fbp-arima', '') for name in idx]
df_regions.index = labels
df_regions