# Singular Spectrum Analysis

Note: the actual analysis is done via the Rssa package in R.  This notebook reads in the cross-validation csv files and calculates scores.

> See the R script `06_ssa-tscv.R` for SSA code that generates the CV folds

In [1]:
import numpy as np
import pandas as pd

import glob
import os

#error measures
from forecast_tools.metrics import (mean_absolute_scaled_error, 
                                    root_mean_squared_error,
                                    symmetric_mean_absolute_percentage_error)

# Data Input

The constants `TOP_LEVEL`, `STAGE`, `REGION` and `METHOD` are used to control data selection and the directory for outputting results.  

> Output file is `f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv'`, where metric will be smape, rmse, mase, coverage_80 or coverage_95. Note: `REGION` is also used to select the correct data from the input dataframe.

In [2]:
#results are written to {TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv
TOP_LEVEL = '../../../results/model_selection'
STAGE = 'stage1'
#REGION is also the column of the wide data frame used as the MASE training series
REGION = 'Trust'
METHOD = 'ssa'

#name of the raw daily data file (read from ../../../data/)
FILE_NAME = 'Daily_Responses_5_Years_2019_full.csv'

#split training and test data.
TEST_SPLIT_DATE = '2019-01-01'

#second subdivide: train and val
VAL_SPLIT_DATE = '2017-07-01'

#discard data after 2020 due to coronavirus
#this is the subject of a separate study.
DISCARD_DATE = '2020-01-01'

In [3]:
#relative path to the raw daily data file named in FILE_NAME
path = f'../../../data/{FILE_NAME}'

In [4]:
def pre_process_daily_data(path, index_col, by_col, 
                           values, dayfirst=False):
    '''
    Daily data is stored in long format.  Read in 
    and pivot to wide format so that there is a single 
    column for each region's time series.

    Column names and the index name are lower-cased after reading, so
    the returned frame is labelled in lower case whatever capitalisation
    the file uses.

    Parameters:
    -------
    path - str - location of the csv file
    index_col - str - name (as in the file) of the date column; it is read
                as the index and parsed as dates
    by_col - str - name of the column whose distinct values become the
             columns of the wide table
    values - str - name of the column holding the observations
    dayfirst - bool - forwarded to pandas.read_csv for date parsing

    Returns:
    --------
    pandas.DataFrame - one row per date and one column per value of
    by_col.  Rows that share a date and a by_col value are summed.

    Notes:
    ------
    The index frequency is set to 'D'.  pandas validates this assignment,
    so a file with missing or irregular dates is expected to be rejected
    with a ValueError rather than silently accepted.
    '''
    df = pd.read_csv(path, index_col=index_col, parse_dates=True,
                     dayfirst=dayfirst)
    df.columns = df.columns.str.lower()
    df.index.name = str(df.index.name).lower()

    # 'sum' is the string alias for the aggregation that np.sum was mapped
    # to; passing the numpy callable itself is deprecated in recent pandas.
    clean_table = pd.pivot_table(df, values=values.lower(),
                                 index=[index_col.lower()],
                                 columns=[by_col.lower()], aggfunc='sum')

    clean_table.index.freq = 'D'

    return clean_table

In [5]:
#read the long-format csv and pivot to one column per value of 'ORA'
clean = pre_process_daily_data(path, 'Actual_dt', 'ORA', 'Actual_Value', 
                               dayfirst=False)
clean.head()

ora,BNSSG,Cornwall,Devon,Dorset,Gloucestershire,OOA,Somerset,Trust,Wiltshire
actual_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-12-30,415.0,220.0,502.0,336.0,129.0,,183.0,2042.0,255.0
2013-12-31,420.0,236.0,468.0,302.0,128.0,,180.0,1996.0,260.0
2014-01-01,549.0,341.0,566.0,392.0,157.0,,213.0,2570.0,351.0
2014-01-02,450.0,218.0,499.0,301.0,115.0,,167.0,2013.0,258.0
2014-01-03,419.0,229.0,503.0,304.0,135.0,,195.0,2056.0,269.0


## Train Test Split

In [6]:
def ts_train_test_split(data, split_date):
    '''
    Partition a time series into the part before a date and the
    part from that date onwards.
    
    Parameters:
    -------
    data - pd.DataFrame - time series data.  Index expected as DatetimeIndex
    split_date - the date on which to split the time series.  Rows dated
                 before it go to training; rows dated on or after it go
                 to test.
    
    Returns:
    --------
    tuple (len=2) 
    0. pandas.DataFrame - training dataset (index < split_date)
    1. pandas.DataFrame - test dataset (index >= split_date)
    '''
    before_split = data.index < split_date
    from_split = data.index >= split_date
    return data.loc[before_split], data.loc[from_split]

In [7]:
#hold out everything from TEST_SPLIT_DATE onwards as the test set
train, test = ts_train_test_split(clean, split_date=TEST_SPLIT_DATE)

#exclude data after 2020 due to coronavirus.
test, discard = ts_train_test_split(test, split_date=DISCARD_DATE)

#train split into train and validation
#(train[REGION] is used later as the training series when computing MASE)
train, val = ts_train_test_split(train, split_date=VAL_SPLIT_DATE)

## SSA read in data generated by Rssa R package

# 80% Prediction Interval

In [8]:
#read in file names
#glob returns paths in arbitrary (file-system dependent) order.  Sort them
#(lexicographically) so that the folds, and therefore the rows of the saved
#score files, come out in the same order on every run and machine.
files = sorted(glob.glob(f'{os.getcwd()}/ssa/80_PI/*.csv'))

In [9]:
def read_ssa_folds(files):
    '''
    Load every TSCV fold file into its own data frame.

    Columns 1 to 4 (zero-based) of each csv are read and relabelled
    'mean', 'lower', 'upper' and 'actual'; column 0 is ignored.

    Returns a list of data frames in the same order as `files`.
    '''
    fold_columns = ['mean', 'lower', 'upper', 'actual']

    def _load_fold(csv_path):
        fold = pd.read_csv(csv_path, usecols=[1, 2, 3, 4])
        fold.columns = fold_columns
        return fold

    return [_load_fold(csv_path) for csv_path in files]

In [10]:
#one data frame per cross-validation fold file
cv_data = read_ssa_folds(files)

In [11]:
#first 7 rows of the first fold that was read
cv_data[0].head(7)

Unnamed: 0,mean,lower,upper,actual
0,2302.310259,2175.491727,2429.887828,2439
1,2326.161722,2209.291161,2464.374813,2363
2,2228.114358,2109.076665,2362.795965,2363
3,2155.571521,2043.872139,2285.992524,2320
4,2145.501888,2032.17643,2290.127476,2313
5,2139.926978,2020.446248,2284.139318,2223
6,2190.261035,2068.255134,2317.499263,2198


In [12]:
def preprocess_r_output(cv_data, horizons=None):
    '''transform the cv data so that it works with existing scoring code.

    For every fold, and for every forecast horizon h, the first h rows of
    the 'mean', 'lower', 'upper' and 'actual' columns are extracted.

    Parameters:
    -------
    cv_data - list of pd.DataFrame - one frame per fold with the columns
              'mean', 'lower', 'upper' and 'actual'
    horizons - sequence of int, optional - forecast horizons measured in
               rows.  Defaults to 7, 14, ..., 84 and 365.
    
    Returns:
    --------
    tuple
        predictions, intervals, test data.
        
        Each of the above is a list of lists indexed [fold][horizon]:
        predictions and test data hold 1d arrays of length h, intervals
        hold 2d arrays of shape (h, 2) with columns [lower, upper].

    Notes:
    ------
    Slicing is positional and does not raise, so a fold with fewer than
    h rows yields arrays shorter than h for that horizon.
    '''
    if horizons is None:
        horizons = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 365]
    else:
        # materialise once: the horizons are re-used for every fold
        horizons = list(horizons)

    cv_preds = []
    cv_intervals = []
    cv_test = []
    for fold in cv_data:
        fold_preds = []
        fold_intervals = []
        fold_test = []
        for h in horizons:
            first_h = fold.iloc[:h]
            fold_preds.append(first_h['mean'].to_numpy())
            lower = first_h['lower'].to_numpy()
            upper = first_h['upper'].to_numpy()
            fold_intervals.append(np.vstack([lower, upper]).T)
            fold_test.append(first_h['actual'].to_numpy())
        cv_preds.append(fold_preds)
        cv_intervals.append(fold_intervals)
        cv_test.append(fold_test)
        
    return cv_preds, cv_intervals, cv_test

In [13]:
#run preprocessing
#each returned object is a list of lists indexed [fold][horizon]
cv_preds, cv_intervals, cv_test = preprocess_r_output(cv_data)

In [14]:
#point forecasts of the first fold for the third horizon (21 steps)
cv_preds[0][2]

array([2302.31025897, 2326.16172193, 2228.11435825, 2155.5715213 ,
       2145.5018879 , 2139.92697807, 2190.26103453, 2303.8770358 ,
       2328.83859822, 2231.69212475, 2159.64976046, 2149.84516168,
       2144.28229201, 2194.32080223, 2307.65311315, 2332.52228202,
       2235.16477668, 2162.63709078, 2152.26440484, 2146.05345523,
       2195.32591389])

In [15]:
#[lower, upper] rows of the first fold for the first horizon (7 steps)
cv_intervals[0][0]

array([[2175.49172707, 2429.88782811],
       [2209.2911606 , 2464.3748128 ],
       [2109.07666543, 2362.79596461],
       [2043.87213866, 2285.99252408],
       [2032.17642953, 2290.12747579],
       [2020.44624783, 2284.13931758],
       [2068.25513376, 2317.49926347]])

In [16]:
#actual values of the first fold for the first horizon (7 steps)
cv_test[0][0]

array([2439, 2363, 2363, 2320, 2313, 2223, 2198])

## Custom functions for calculating CV scores for point predictions and coverage.

These functions have been written to work with the output stored in `./ssa`

In [17]:
def split_cv_error(cv_preds, cv_test, error_func):
    '''
    Score every forecast belonging to a single cross-validation fold.

    Parameters:
    -------
    cv_preds - sequence of array-like - point forecasts, one vector per
               forecast horizon
    cv_test - sequence of array-like - observed values matching cv_preds
    error_func - callable(y_true, y_pred) - returns the error of one forecast

    Returns:
    --------
    np.ndarray - one error per entry of cv_preds
    '''
    return np.array([error_func(cv_test[i], cv_preds[i])
                     for i in range(len(cv_preds))])

def forecast_errors_cv(cv_preds, cv_test, error_func):
    '''
    Forecast error for every fold and horizon of a cross-validation.

    Parameters:
    -------
    cv_preds - nested sequence indexed [fold][horizon] of point forecasts
    cv_test - nested sequence indexed [fold][horizon] of observed values
    error_func - callable(y_true, y_pred) - returns the error of one forecast

    Returns:
    --------
    np.ndarray - shape (n_folds, n_horizons)

    Notes:
    ------
    The nested inputs are walked directly instead of being converted with
    np.array.  The vectors for different horizons have different lengths,
    and building an array from such ragged sequences is deprecated and
    raises ValueError from NumPy 1.24 onwards.
    '''
    fold_errors = []
    for fold in range(len(cv_test)):
        # convert vector by vector so error_func always receives ndarrays
        fold_preds = [np.asarray(y_pred) for y_pred in cv_preds[fold]]
        fold_test = [np.asarray(y_true) for y_true in cv_test[fold]]
        fold_errors.append(split_cv_error(fold_preds, fold_test, error_func))

    return np.array(fold_errors)

def split_coverage(cv_test, cv_intervals):
    '''
    Empirical prediction interval coverage of every forecast belonging
    to a single cross-validation fold.

    Parameters:
    -------
    cv_test - sequence of array-like - observed values, one vector per
              forecast horizon
    cv_intervals - sequence of array-like - matching arrays of shape (h, 2)
                   with columns [lower, upper]

    Returns:
    --------
    np.ndarray - for each entry of cv_test, the fraction of observations
    lying strictly between the lower and upper limits
    '''
    coverages = []

    for i in range(len(cv_test)):
        val = np.asarray(cv_test[i])
        limits = np.asarray(cv_intervals[i])
        lower, upper = limits[:, 0], limits[:, 1]

        # limits are exclusive: a value equal to a limit is not covered
        inside = (val > lower) & (val < upper)
        coverages.append(np.count_nonzero(inside) / len(val))

    return np.array(coverages)
    
    
def prediction_int_coverage_cv(cv_test, cv_intervals):
    '''
    Prediction interval coverage for every fold and horizon of a
    cross-validation.

    Parameters:
    -------
    cv_test - nested sequence indexed [fold][horizon] of observed values
    cv_intervals - nested sequence indexed [fold][horizon] of (h, 2)
                   arrays with columns [lower, upper]

    Returns:
    --------
    np.ndarray - shape (n_folds, n_horizons), values between 0 and 1

    Notes:
    ------
    The nested inputs are walked directly instead of being converted with
    np.array.  The arrays for different horizons have different lengths,
    and building an array from such ragged sequences is deprecated and
    raises ValueError from NumPy 1.24 onwards.
    '''
    fold_coverage = [split_coverage(cv_test[fold], cv_intervals[fold])
                     for fold in range(len(cv_test))]

    return np.array(fold_coverage)

In [18]:
def split_cv_error_scaled(cv_preds, cv_test, y_train):
    '''
    Mean absolute scaled error of every forecast in one group of forecasts.

    Each pair (cv_test[i], cv_preds[i]) is passed to
    mean_absolute_scaled_error together with y_train and period=7.

    Returns:
    --------
    np.ndarray - one score per entry of cv_preds
    '''
    scores = [mean_absolute_scaled_error(cv_test[i], cv_preds[i],
                                         y_train, period=7)
              for i in range(len(cv_preds))]

    return np.array(scores)

def forecast_errors_cv_scaled(cv_preds, cv_test, y_train):
    '''
    Scaled forecast error for every fold and horizon of a cross-validation.

    Parameters:
    -------
    cv_preds - nested sequence indexed [fold][horizon] of point forecasts
    cv_test - nested sequence indexed [fold][horizon] of observed values
    y_train - training series handed to split_cv_error_scaled for scaling

    Returns:
    --------
    np.ndarray - shape (n_folds, n_horizons)

    Notes:
    ------
    The nested inputs are walked directly instead of being converted with
    np.array.  The vectors for different horizons have different lengths,
    and building an array from such ragged sequences is deprecated and
    raises ValueError from NumPy 1.24 onwards.
    '''
    fold_errors = []
    for fold in range(len(cv_test)):
        # convert vector by vector so the scorer always receives ndarrays
        fold_preds = [np.asarray(y_pred) for y_pred in cv_preds[fold]]
        fold_test = [np.asarray(y_true) for y_true in cv_test[fold]]
        fold_errors.append(split_cv_error_scaled(fold_preds, fold_test, y_train))
        
    return np.array(fold_errors)

# Symmetric MAPE

In [19]:
#CV point predictions smape
#forecast horizons used to label the columns of every results table below;
#must stay in step with the horizons used by preprocess_r_output
horizons = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 365]
cv_errors = forecast_errors_cv(cv_preds, cv_test, 
                               symmetric_mean_absolute_percentage_error)
#rows are cv folds, columns are horizons
df = pd.DataFrame(cv_errors)
df.columns = horizons
df.describe()

  if sys.path[0] == '':
  del sys.path[0]


Unnamed: 0,7,14,21,28,35,42,49,56,63,70,77,84,365
count,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0
mean,3.72541,3.96066,4.164466,4.292791,4.397097,4.465015,4.519785,4.588981,4.655683,4.729382,4.804787,4.886505,5.377868
std,1.568147,1.443953,1.379994,1.382847,1.404128,1.440922,1.443909,1.411132,1.354075,1.268173,1.191394,1.149706,0.475045
min,1.595326,1.927548,2.228979,2.187023,2.124894,2.448548,2.591903,2.682713,2.786986,2.919733,2.912273,2.935712,4.891125
25%,2.642173,2.795661,3.050193,3.028766,3.084837,3.176356,3.236244,3.610493,3.475287,3.695778,3.913295,3.840616,5.044613
50%,3.386148,3.835094,4.130232,4.103696,4.459435,4.352537,4.208597,4.456425,4.396806,4.666418,4.902772,4.930933,5.266817
75%,4.557438,4.852336,5.342542,5.416858,5.431045,5.359962,5.480141,5.612974,5.710979,5.828236,5.862317,5.822548,5.361403
max,7.413788,7.366776,6.782167,6.95027,6.953364,7.580049,7.796957,7.270392,6.98244,6.710924,6.531414,6.615741,6.785872


In [20]:
#output sMAPE results to file
#(the path is printed first so the destination is visible in the notebook)
metric = 'smape'
print(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')
df.to_csv(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')

../../../results/model_selection/stage1/Trust-ssa_smape.csv


# RMSE

In [21]:
#CV point predictions rmse
#note: `df` is overwritten for each metric, so the matching output cell
#must be run before the next metric is calculated
cv_errors = forecast_errors_cv(cv_preds, cv_test, root_mean_squared_error)
df = pd.DataFrame(cv_errors)
df.columns = horizons
df.describe()

  if sys.path[0] == '':
  del sys.path[0]


Unnamed: 0,7,14,21,28,35,42,49,56,63,70,77,84,365
count,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0
mean,99.676125,107.774594,114.162541,118.57711,122.188705,124.999033,127.364807,129.834569,132.144175,134.654197,137.075707,139.544821,152.252874
std,45.747106,43.491708,41.936722,41.224579,40.947988,41.136901,41.050456,40.526525,39.435625,37.865134,36.667751,35.916661,12.133522
min,43.82592,47.406783,62.230486,58.84052,57.067498,69.08898,71.329174,75.013923,77.075001,81.923956,81.633492,82.36934,142.287477
25%,74.050024,79.138004,84.995686,83.207536,85.478669,87.798257,88.285987,98.069838,101.083589,107.588179,109.074489,111.877753,145.253107
50%,89.844712,101.006886,106.885286,107.501433,117.119285,129.411792,126.587878,124.799975,131.128779,134.222666,139.480754,135.403104,146.528734
75%,116.755832,122.917685,134.403082,144.615905,147.02499,147.945434,151.516922,154.989191,159.81532,158.039152,163.736717,173.101273,151.576287
max,257.647852,242.022708,219.947056,216.246161,211.325733,220.41341,221.319791,210.657112,204.333349,197.582329,194.064791,194.825658,189.195254


In [22]:
#output rmse results to file
metric = 'rmse'
print(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')
df.to_csv(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')

../../../results/model_selection/stage1/Trust-ssa_rmse.csv


# MASE

In [23]:
#CV point predictions mase
#train[REGION] is the training series passed on for scaling the errors
cv_errors = forecast_errors_cv_scaled(cv_preds, cv_test, train[REGION])
df = pd.DataFrame(cv_errors)
df.columns = horizons
df.describe()

  
  from ipykernel import kernelapp as app


Unnamed: 0,7,14,21,28,35,42,49,56,63,70,77,84,365
count,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0
mean,1.008992,1.073372,1.12908,1.16442,1.193104,1.211986,1.22734,1.246677,1.265072,1.285481,1.306374,1.328724,1.436433
std,0.455575,0.420118,0.397445,0.393284,0.396263,0.404458,0.404507,0.395619,0.380908,0.358998,0.340463,0.330876,0.120017
min,0.417306,0.508451,0.60215,0.58847,0.5693,0.659647,0.69572,0.719112,0.739665,0.770712,0.769028,0.776051,1.31674
25%,0.727895,0.745654,0.810598,0.809199,0.821889,0.841855,0.863609,0.953944,0.945794,1.01639,1.058088,1.042499,1.355344
50%,0.930638,1.034546,1.086805,1.101291,1.202826,1.176388,1.145264,1.197343,1.195146,1.279747,1.285691,1.335821,1.40002
75%,1.193866,1.2879,1.420073,1.503568,1.486838,1.487416,1.499175,1.525981,1.523275,1.560368,1.578043,1.617122,1.434246
max,2.291219,2.219709,1.940758,1.97419,1.961463,2.152005,2.193587,2.040114,1.950708,1.869115,1.813198,1.830568,1.7997


In [24]:
#output mase results to file
metric = 'mase'
print(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')
df.to_csv(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')

../../../results/model_selection/stage1/Trust-ssa_mase.csv


# 80% Prediction Intervals Coverage

In [25]:
#coverage of the 80% PIs: fraction of actual values inside the interval,
#for every fold (rows) and horizon (columns)
cv_coverage = prediction_int_coverage_cv(cv_test, cv_intervals)
df = pd.DataFrame(cv_coverage)
df.columns = horizons
df.describe()



Unnamed: 0,7,14,21,28,35,42,49,56,63,70,77,84,365
count,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0
mean,0.698413,0.674603,0.66843,0.662698,0.661376,0.664021,0.661376,0.653439,0.646091,0.637037,0.628187,0.618607,0.591476
std,0.266771,0.249279,0.243342,0.252991,0.256373,0.256295,0.250413,0.242543,0.236331,0.22658,0.216301,0.210914,0.106148
min,0.142857,0.142857,0.142857,0.107143,0.085714,0.095238,0.122449,0.107143,0.095238,0.128571,0.116883,0.119048,0.320548
25%,0.428571,0.5,0.547619,0.5,0.514286,0.52381,0.55102,0.5625,0.563492,0.55,0.551948,0.547619,0.558904
50%,0.714286,0.714286,0.714286,0.714286,0.771429,0.761905,0.734694,0.696429,0.666667,0.657143,0.662338,0.642857,0.624658
75%,0.857143,0.892857,0.880952,0.857143,0.857143,0.857143,0.846939,0.830357,0.825397,0.807143,0.766234,0.732143,0.654795
max,1.0,1.0,0.952381,0.964286,0.971429,0.952381,0.959184,0.946429,0.952381,0.942857,0.948052,0.952381,0.753425


In [26]:
#output 80% PI coverage
metric = 'coverage_80'
print(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')
df.to_csv(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')

../../../results/model_selection/stage1/Trust-ssa_coverage_80.csv


# 95% Prediction Intervals

Note: these are stored in a separate directory.

In [27]:
#read in file names
#glob returns paths in arbitrary (file-system dependent) order.  Sort them
#(lexicographically) so that the folds, and therefore the rows of the saved
#score files, come out in the same order on every run and machine.
files = sorted(glob.glob(f'{os.getcwd()}/ssa/95_PI/*.csv'))

In [28]:
#replaces the 80% PI folds held in cv_data with the 95% PI folds
cv_data = read_ssa_folds(files)

In [29]:
#run preprocessing
#cv_preds, cv_intervals and cv_test now refer to the 95% PI files
cv_preds, cv_intervals, cv_test = preprocess_r_output(cv_data)

In [30]:
#coverage of the 95% PIs: fraction of actual values inside the interval,
#for every fold (rows) and horizon (columns)
cv_coverage = prediction_int_coverage_cv(cv_test, cv_intervals)
df = pd.DataFrame(cv_coverage)
df.columns = horizons
df.describe()



Unnamed: 0,7,14,21,28,35,42,49,56,63,70,77,84,365
count,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0
mean,0.904762,0.907407,0.906526,0.90873,0.906878,0.905644,0.906274,0.907407,0.910053,0.910582,0.909572,0.907407,0.871436
std,0.190018,0.157011,0.145567,0.142628,0.144733,0.143235,0.14478,0.138235,0.120885,0.107534,0.097684,0.089107,0.051787
min,0.285714,0.428571,0.428571,0.392857,0.4,0.428571,0.367347,0.357143,0.428571,0.485714,0.532468,0.571429,0.712329
25%,0.857143,0.857143,0.857143,0.857143,0.857143,0.857143,0.867347,0.875,0.888889,0.885714,0.88961,0.869048,0.871233
50%,1.0,1.0,1.0,1.0,0.971429,0.97619,0.959184,0.946429,0.936508,0.942857,0.909091,0.904762,0.893151
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.984127,0.985714,0.980519,0.964286,0.90137
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.920548


In [31]:
#output 95% PI coverage results to file
metric = 'coverage_95'
print(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')
df.to_csv(f'{TOP_LEVEL}/{STAGE}/{REGION}-{METHOD}_{metric}.csv')

../../../results/model_selection/stage1/Trust-ssa_coverage_95.csv


# End