In [1]:
import time
from statsmodels.tsa.vector_ar.var_model import VAR
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt


import warnings
warnings.filterwarnings("ignore")

In [2]:
def restore_differenced(periods, df_original, df_differences):
    '''
    Undo a ``periods``-lag differencing by rebuilding the rows one at a time.

    from: https://stackoverflow.com/questions/34918013/undo-a-series-diff

    Parameters
    ----------
    periods : int
        Lag of the differencing to invert. The first ``periods`` rows of
        ``df_original`` are kept as seed values.
    df_original : pandas.DataFrame
        Frame whose first ``periods`` rows hold known, undifferenced values.
        It is copied; the caller's frame is not modified.
    df_differences : pandas.DataFrame
        Differenced values. Row ``i`` (by position) is added to restored row
        ``i - periods``. Index labels of this frame are ignored, so it does
        not need a 0..n-1 integer index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_original`` (same index and columns) in which every row
        from position ``periods`` onwards has been recomputed from the
        differences.
    '''
    restored = df_original.copy()
    # Everything past the seed rows is recomputed below.
    restored.iloc[periods:] = np.nan
    # Iterate by position: the previous version used the index *label* coming
    # out of iterrows() as an iloc offset, which only worked for a RangeIndex.
    for pos in range(periods, len(df_differences)):
        restored.iloc[pos] = restored.iloc[pos - periods] + df_differences.iloc[pos]
    return restored

In [15]:
def log_score_function(actual, predicted, var):
    '''
    Function that calculates the log scoring function:

    L = 0.5 * sum_j (ln(var_j) + (y - y_hat)^2 / var_j)

    Parameters
    ----------
    actual, predicted : array-like, 2-D
        Observed and predicted values with matching shapes.
    var : array-like, 2-D
        Predictive variances, same shape as ``actual``. Entries equal to 0
        are excluded from the sum. The input is not modified.

    Returns
    -------
    numpy.ndarray
        One score per row (the sum runs over axis 1).
    '''
    # Work on a float copy: the previous in-place assignment overwrote the
    # caller's array and could not store NaN in an integer array.
    var = np.array(var, dtype=float)
    # Zero variance would give log(0) and a division by zero; mark those
    # entries as NaN so that nansum skips them.
    var[var == 0] = np.nan
    L = 0.5 * np.nansum(np.log(var) + (actual - predicted)**2 / var, axis=1)
    return L

In [4]:
# Number of leading columns (PV systems) of the data file kept for the benchmarks.
n_sys_VAR = 400
# Number of leading rows (time steps) of the data file kept for the benchmarks.
n_data_VAR = 50000

In [5]:
# PV power readings indexed by 'datetime', one column per system id; the two
# systems flagged as faulty are removed.
data = pd.read_csv(
    '../../../Data/pv_power_df_5day_capacity_scaled.csv',
    index_col='datetime',
).drop(columns=['2657', '2828'])  # DROPPING FAULTY SYSTEMS
uk_pv = pd.read_csv('../../../Data/system_metadata_location_rounded.csv')
# Column labels of `data` are strings, so expose the system id as a string too.
uk_pv['ss_id_string'] = uk_pv['ss_id'].astype('str')

# First n_data_VAR rows and first n_sys_VAR columns, in a single positional slice.
data_multiple = data.iloc[:n_data_VAR, :n_sys_VAR]

# Per-system capacity in the SAME order as the columns of data_multiple.
# The previous isin() filter returned capacities in metadata row order, which
# is not guaranteed to match the column order they are later multiplied with.
# Only the first metadata row per system is used; a system with no metadata
# row gets NaN.
# NOTE(review): the factor 1000 presumably converts kWp to Wp - confirm.
kwp_by_system = uk_pv.drop_duplicates('ss_id_string').set_index('ss_id_string')['kwp']
capacities = kwp_by_system.reindex(data_multiple.columns).values * 1000


# Do the same for other benchmarks

In [None]:
models_list = ['persistence','yesterday','hourly_average' , 'VAR', 'SimpleExpSmoothing', 'ExponentialSmoothing',]

# One entry per model; a value stays None until that model has been evaluated.
# NLPD is only produced for the two exponential-smoothing models.
MAE_results = dict.fromkeys(models_list)
NLPD_results = dict.fromkeys(models_list)

for model_type in models_list:

    # t1 is taken once per model, so the time printed after every window is
    # cumulative for that model.
    t1 = time.time()
    print(f'Getting results for {model_type}')
    # Sliding window: 97 * 10 training rows followed by 24 test rows. The
    # window start moves forward by 1000 rows per evaluation.
    length_window = 97 * 10
    max_t = len(data_multiple) - length_window - 24
    window_errors = []  # one (24, n_sys_VAR) absolute-error array per window
    window_nlpd = []    # one length-24 log score per window (smoothing models only)

    for t in range(10000, max_t, 1000):
        data_multiple_iter = data_multiple.iloc[t:t+length_window + 24]
        data_multiple_train = data_multiple_iter.iloc[:length_window]
        data_multiple_test = data_multiple_iter[length_window:]

        if model_type == 'VAR':

            # Difference twice: lag 1, then lag 97.
            data_VAR = data_multiple_train.diff().diff(97).dropna()

            #CREATE MODEL AND PREDICT NEXT 24
            model = VAR(data_VAR)
            model_fit = model.fit()
            lag_order = model_fit.k_ar
            preds = model_fit.forecast(data_VAR.values[-lag_order:], 24)
            if len(preds[preds>10]) > 0:
                print('Careful, a prediction is higher than 10!')
            #evaluate forecast
            df_forecast = pd.DataFrame(preds, index=data_multiple_test.index, columns=data_VAR.columns)

            # Differenced training rows + differenced forecasts laid out on the
            # full window index (rows lost to differencing become NaN), then
            # switched to a positional index for restore_differenced.
            data_total = pd.concat([data_VAR,df_forecast], axis=0).reindex(data_multiple_iter.index).reset_index().drop(columns = ['datetime'])

            # Drop the first row so positions line up with the first-differenced
            # frame passed as df_original, which has lost its first row as well.
            data_reset = data_total.iloc[1:].reset_index().drop(columns=['index'])
            # Undo the lag-97 differencing, then the lag-1 differencing.
            restored = restore_differenced(97, data_multiple_iter.diff().dropna(), data_reset)
            restored = restored.reindex(data_multiple_iter.index).reset_index().drop(columns=['datetime'])
            restored_twice = restore_differenced(1, data_multiple_iter, restored)

            #CLIPPING PREDICTIONS BETWEEN 0 AND 1
            restored_twice = restored_twice.clip(0,1)
            predictions = restored_twice.iloc[-24:]

        elif model_type == 'persistence':
            # Last training row; broadcast against all 24 test rows below.
            predictions = data_multiple_train.iloc[-1].values

        elif model_type == 'yesterday':
            # The 24 rows that start 97 rows before the end of the training data.
            predictions = data_multiple_train.iloc[-97:].values[:24]

        elif model_type == 'hourly_average':
            previous_hour = data_multiple_train.iloc[-12:].values
            hourly_preds = []
            for i in range(24):
                pred = previous_hour.mean(axis=0)[np.newaxis, :]
                hourly_preds.append(pred)
                #HERE I append the latest prediction and remove the oldest observation
                previous_hour = np.concatenate((previous_hour, pred), axis=0)[1:]
            predictions = np.concatenate(hourly_preds)

        elif model_type in ('SimpleExpSmoothing', 'ExponentialSmoothing'):
            fcast_columns = []
            var_columns = []
            # One univariate model per system. `model_fit` is kept at cell
            # scope because a later cell reuses the last fitted model.
            for ts in range(data_multiple_train.shape[1]):
                series = data_multiple_train.iloc[:, ts]
                if model_type == 'SimpleExpSmoothing':
                    model = SimpleExpSmoothing(series, initialization_method="estimated")
                else:
                    model = ExponentialSmoothing(series,
                                                 seasonal_periods=97,
                                                 seasonal="add",
                                                 initialization_method="estimated")
                model_fit = model.fit()

                fcast = model_fit.forecast(24).values

                # Per-step variance across 1000 simulated future paths.
                var = model_fit.simulate(nsimulations = 24, anchor = 'end', repetitions = 1000 ).var(axis=1).values

                # Raise to the lower bound first, then cap at the upper bound
                # (so the upper bound wins when the two cross).
                var_low_bound = (fcast**2 / 4)
                var_upper_bound = ((1-fcast)**2 / 4)
                var = np.maximum(var_low_bound, var)
                var = np.minimum(var_upper_bound, var)

                fcast_columns.append(fcast)
                var_columns.append(var)

            predictions = np.column_stack(fcast_columns)
            variances = np.column_stack(var_columns)

            # Scored on the unclipped forecasts, as before. Every window is
            # collected; previously each window overwrote the last, so only
            # the final window was reported.
            nlpd = log_score_function(data_multiple_test.values, predictions, variances)
            window_nlpd.append(nlpd)


        #THIS WAY WE REDUCE THE ERROR FURTHER USING A SIMPLE TRICK OF CLIPPING PREDICTIONS OUTSIDE DOMAIN
        predictions = predictions.clip(0,1)
        #Get error
        error = abs((predictions - data_multiple_test).values)
        window_errors.append(error)

        t2 = time.time()
        print(f'{model_type} calculation time: %2.2f secs' % (t2-t1))

    # (n_windows, 24, n_sys_VAR) -> mean over windows, then over systems.
    errors = np.array(window_errors).reshape(-1, 24, n_sys_VAR)
    MAE_hsteps = np.mean(np.mean(errors, axis=0), axis=1)
    MAE_results[model_type] = MAE_hsteps

    if window_nlpd:
        # Average over windows, mirroring how the MAE is aggregated.
        NLPD_results[model_type] = np.mean(window_nlpd, axis=0)


    

Getting results for persistence
persistence calculation time: 0.00 secs
persistence calculation time: 0.00 secs
persistence calculation time: 0.00 secs
persistence calculation time: 0.00 secs
persistence calculation time: 0.00 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persistence calculation time: 0.01 secs
persiste

ExponentialSmoothing calculation time: 55.56 secs
ExponentialSmoothing calculation time: 120.29 secs
ExponentialSmoothing calculation time: 185.80 secs
ExponentialSmoothing calculation time: 287.42 secs
ExponentialSmoothing calculation time: 356.38 secs
ExponentialSmoothing calculation time: 430.20 secs


In [None]:
# Tabulate the log scores, one column per model.
NLPD_df = pd.DataFrame(NLPD_results)
# Line plot per model on its own default-sized figure.
NLPD_df.plot()
# Box plot per model on a separate, larger figure.
_, nlpd_box_ax = plt.subplots(figsize=(12, 8))
NLPD_df.boxplot(ax=nlpd_box_ax)

In [None]:
# Tabulate the MAE per forecast step, one column per model.
MAE_df = pd.DataFrame(MAE_results)
# Line plot per model on its own default-sized figure.
MAE_df.plot()
# Box plot per model on a separate, larger figure.
_, mae_box_ax = plt.subplots(figsize=(12, 8))
MAE_df.boxplot(ax=mae_box_ax)

## Uncertainty Intervals

By using a state space formulation, we can perform simulations of future values. The mathematical details are described in Hyndman and Athanasopoulos [2] and in the documentation of HoltWintersResults.simulate.

[2] Hyndman, Rob J., and George Athanasopoulos. *Forecasting: Principles and Practice*, 2nd edition. OTexts, 2018.

In [None]:
# NOTE(review): this cell relies on `model_fit` left over from the benchmark
# loop above - presumably the last ExponentialSmoothing fit; confirm, or refit
# the model explicitly here so the cell does not depend on loop state.
pred = model_fit.forecast(24).values.clip(0,1)

# Simulate once and derive both spread measures from the same draws. The
# simulation was previously run twice, so the std and the variance came from
# different random samples.
simulations = model_fit.simulate(nsimulations = 24, anchor = 'end', repetitions = 1000 )
two_std = (simulations.std(axis=1) * 2).values
var = simulations.var(axis=1).values
upper_conf = (pred + two_std).clip(0,1)
lower_conf = (pred - two_std).clip(0,1)

plt.plot(pred, label='forecast')
plt.plot(lower_conf, label='forecast - 2 std (clipped to [0, 1])')
plt.plot(upper_conf, label='forecast + 2 std (clipped to [0, 1])')
plt.legend()

# INCREASE THE WINDOW

In [None]:
models_list = ['persistence','yesterday','hourly_average' , 'VAR', 'SimpleExpSmoothing', 'ExponentialSmoothing',]

# One entry per model; a value stays None until that model has been evaluated.
MAE_results = dict.fromkeys(models_list)

for model_type in models_list:
    print(f'Getting results for {model_type}')
    # Sliding window: 97 * 30 training rows followed by 24 test rows. The
    # window start moves forward by 5000 rows per evaluation.
    length_window = 97 * 30
    max_t = len(data_multiple) - length_window - 24
    window_errors = []  # one (24, n_sys_VAR) absolute-error array per window

    for t in range(0, max_t, 5000):
        data_multiple_iter = data_multiple.iloc[t:t+length_window + 24]
        data_multiple_train = data_multiple_iter.iloc[:length_window]
        data_multiple_test = data_multiple_iter[length_window:]

        if model_type == 'VAR':

            # Difference twice: lag 1, then lag 97.
            data_VAR = data_multiple_train.diff().diff(97).dropna()

            #CREATE MODEL AND PREDICT NEXT 24
            model = VAR(data_VAR)
            model_fit = model.fit()
            lag_order = model_fit.k_ar
            preds = model_fit.forecast(data_VAR.values[-lag_order:], 24)
            if len(preds[preds>10]) > 0:
                print('Careful, a prediction is higher than 10!')
            #evaluate forecast
            df_forecast = pd.DataFrame(preds, index=data_multiple_test.index, columns=data_VAR.columns)

            # Differenced training rows + differenced forecasts laid out on the
            # full window index, then switched to a positional index.
            data_total = pd.concat([data_VAR,df_forecast], axis=0).reindex(data_multiple_iter.index).reset_index().drop(columns = ['datetime'])

            # Drop the first row so positions line up with the first-differenced
            # frame passed as df_original, which has lost its first row as well.
            data_reset = data_total.iloc[1:].reset_index().drop(columns=['index'])
            # Undo the lag-97 differencing, then the lag-1 differencing.
            restored = restore_differenced(97, data_multiple_iter.diff().dropna(), data_reset)
            restored = restored.reindex(data_multiple_iter.index).reset_index().drop(columns=['datetime'])
            restored_twice = restore_differenced(1, data_multiple_iter, restored)

            #CLIPPING PREDICTIONS BETWEEN 0 AND 1
            restored_twice = restored_twice.clip(0,1)
            predictions = restored_twice.iloc[-24:]

        elif model_type == 'persistence':
            # Last training row; broadcast against all 24 test rows below.
            predictions = data_multiple_train.iloc[-1].values

        elif model_type == 'yesterday':
            # The 24 rows that start 97 rows before the end of the training data.
            predictions = data_multiple_train.iloc[-97:].values[:24]

        elif model_type == 'hourly_average':
            previous_hour = data_multiple_train.iloc[-12:].values
            hourly_preds = []
            for i in range(24):
                pred = previous_hour.mean(axis=0)[np.newaxis, :]
                hourly_preds.append(pred)
                #HERE I append the latest prediction and remove the oldest observation
                previous_hour = np.concatenate((previous_hour, pred), axis=0)[1:]
            predictions = np.concatenate(hourly_preds)

        elif model_type in ('SimpleExpSmoothing', 'ExponentialSmoothing'):
            fcast_columns = []
            # One univariate model per system; forecasts become the columns of
            # the (24, n_systems) prediction array.
            for ts in range(data_multiple_train.shape[1]):
                series = data_multiple_train.iloc[:, ts]
                if model_type == 'SimpleExpSmoothing':
                    model = SimpleExpSmoothing(series, initialization_method="estimated")
                else:
                    model = ExponentialSmoothing(series,
                                                 seasonal_periods=97,
                                                 seasonal="add",
                                                 initialization_method="estimated")
                model_fit = model.fit()
                fcast = model_fit.forecast(24).values
                fcast_columns.append(fcast)
            predictions = np.column_stack(fcast_columns)

        #Get error
        error = abs((predictions - data_multiple_test).values)
        window_errors.append(error)

    # (n_windows, 24, n_sys_VAR) -> mean over windows, scale each system by its
    # capacity, then average over systems.
    errors = np.array(window_errors).reshape(-1, 24, n_sys_VAR)
    MAE_hsteps = np.mean(np.mean(errors, axis=0)* capacities, axis=1)
    MAE_results[model_type] = MAE_hsteps


In [None]:
# Tabulate the capacity-weighted MAE per forecast step, one column per model.
MAE_df = pd.DataFrame(MAE_results)
# Line plot per model on its own default-sized figure.
MAE_df.plot()
# Box plot per model on a separate, larger figure.
_, mae_long_box_ax = plt.subplots(figsize=(12, 8))
MAE_df.boxplot(ax=mae_long_box_ax)

In [None]:
# Display the MAE table built in the previous cell (one column per model).
MAE_df