# Evaluate performance (quality) of openstef models using Backtest Pipeline

This notebook generates a comparison of the performance of different models/hyperparameters/forecasting horizons for default data sets. This is convenient for comparing a new type of model to an existing model.
This is still a preliminary version.

In [6]:
import pandas as pd
import cufflinks
cufflinks.go_offline()
import plotly.express as px

from openstef.pipeline.train_create_forecast_backtest import train_model_and_forecast_back_test
from openstef.metrics import metrics
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
from openstef_dbc.services.prediction_job import PredictionJobDataClass # TODO, import from openstef when availavle

Set the variables below, which determine:
- *id's* for the standard datasets
- *id* for the new dataset (make sure it is unknown for training model on your computer)
- old model to compare ('xgb', 'lgb', 'xgb_quantile')
- new model
- hyperparameter settings for the old and new model, you can leave empty if no changes

In [2]:
# Prediction job ids of the standard datasets; each id selects the input file
# data/get_model_input_pid_<id>.csv.
# By default, only test these two for speed. Additional options: 435, 438
PJ_ID_STD = [
    287,
    307,
]

# Base id used for the new model; the position of the dataset in PJ_ID_STD is added to it.
PJ_ID_NEW = 12345

# Model types to compare.
PJ_MODEL_OLD = "xgb"
PJ_MODEL_NEW = "xgb_quantile"

# Hyper parameter overrides per model; leave empty if no changes are needed.
PJ_HYPER_PARAMS_OLD = dict()
PJ_HYPER_PARAMS_NEW = dict()

Comparing the models, using metrics

In [10]:
def pj_model_metrics(pj_id: int, pj_model: str, pj_hyper_params: dict, data: pd.DataFrame) -> pd.DataFrame:
    """
    Run the backtest pipeline for one prediction job and summarise the forecast per horizon.

    Args:
        pj_id (int): prediction job id
        pj_model (str): model type of the prediction job
        pj_hyper_params (dict): hyper parameters, passed to both the prediction job and the model specs
        data (pd.DataFrame): input data handed to the backtest pipeline
    Returns:
        (pandas.DataFrame): one column per forecast horizon (named ``forecast_<horizon>h_ahead``)
            and one row per metric
    """

    # Settings of the prediction job used for the backtest
    job_settings = dict(
        id=pj_id,
        model=pj_model,
        quantiles=[0.10, 0.30, 0.50, 0.70, 0.90],
        horizon_minutes=48 * 60,
        resolution_minutes=15,
        lat=1,  # should become optional
        lon=1,  # should become optional
        train_components=False,
        name='TestPrediction',
        model_type_group=None,  # Note, this should become optional
        hyper_params=pj_hyper_params,  # Note, this should become optional
        feature_names=None,  # Note, this should become optional
        forecast_type="demand",  # Note, this should become optional
    )
    pj = PredictionJobDataClass(**job_settings)

    modelspecs = ModelSpecificationDataClass(id=pj['id'], hyper_params=pj_hyper_params)

    # Perform the backtest; only the forecast frame is used below
    forecast, _model, _train_data, _validation_data, _test_data = train_model_and_forecast_back_test(
        pj,
        modelspecs,
        data,
        training_horizons=[0.25, 47.0],
    )

    # Drop the descriptive columns that are not needed for the metrics
    forecast = forecast.drop(columns=['pid', 'customer', 'description', 'type', 'algtype'])

    scores_per_horizon = {}

    for horizon in forecast.horizon.unique():
        # Rows belonging to this horizon only
        subset = forecast[forecast['horizon'] == horizon]
        predicted = subset['forecast']
        realised = subset['realised']

        scores = {
            'mae': metrics.mae(realised, predicted),
            'rmse': metrics.rmse(realised, predicted),
            'skill_score': metrics.skill_score(realised, predicted, realised.mean()),
            'r_mae_lowest': metrics.r_mae_lowest(realised, predicted),
            'r_mae_highest': metrics.r_mae_highest(realised, predicted),
        }

        # Percentage of forecast values that are neither above the 0.95 quantile
        # nor below the 0.05 quantile of the realised values.
        # NOTE(review): these bounds enclose the central 90% of the realised values —
        # confirm this is the intended band.
        n_rows = len(subset)
        n_above = int((predicted > realised.quantile(0.95)).sum())
        n_below = int((predicted < realised.quantile(0.05)).sum())
        scores['percentage_quantile'] = 100 - (n_above / n_rows) * 100 - (n_below / n_rows) * 100

        scores_per_horizon[f'forecast_{horizon}h_ahead'] = scores

    return pd.DataFrame.from_dict(scores_per_horizon)

In [11]:
def compare_models(type_forecast: float) -> pd.DataFrame:
    """
    Compare the new model to the older model on every standard dataset.

    For each id in PJ_ID_STD the input csv is read, both models are backtested via
    pj_model_metrics and the metrics of the selected horizon are compared.

    Args:
        type_forecast (float): forecast horizon in hours; 0.25 selects the first metrics
            column (15min), any other value selects the second one (47h)
    Returns:
        (pandas.DataFrame): one column per prediction job (``pj_id <id>``); rows hold the
            old-minus-new difference per metric plus the quantile percentage of each model
    """

    # Position of the wanted horizon in the columns returned by pj_model_metrics.
    # assumes the 0.25 h horizon is the first column — TODO confirm
    idx_forecast_ahead = 0 if type_forecast == 0.25 else 1

    # Metrics reported as a difference, in the order they appear in the result
    diff_metric_names = ('mae', 'rmse', 'r_mae_highest', 'r_mae_lowest', 'skill_score')

    metrics_dict = {}

    for offset, pj_id in enumerate(PJ_ID_STD):
        data = pd.read_csv(f'data/get_model_input_pid_{pj_id}.csv', index_col='index', parse_dates=True)
        metrics_old = pj_model_metrics(pj_id, PJ_MODEL_OLD, PJ_HYPER_PARAMS_OLD, data)
        metrics_new = pj_model_metrics(PJ_ID_NEW + offset, PJ_MODEL_NEW, PJ_HYPER_PARAMS_NEW, data)

        # Select the horizon column by position with .iloc; plain [] with an integer on a
        # string-labelled Series relies on a deprecated positional fallback.
        old_scores = metrics_old.iloc[:, idx_forecast_ahead]
        new_scores = metrics_new.iloc[:, idx_forecast_ahead]

        # Difference is old minus new, so a positive value means the new model scores lower
        metrics_diff = {
            f'{name} (old-new)': old_scores[name] - new_scores[name]
            for name in diff_metric_names
        }
        metrics_diff['percentage_quantile_old'] = old_scores['percentage_quantile']
        metrics_diff['percentage_quantile_new'] = new_scores['percentage_quantile']

        metrics_dict[f'pj_id {pj_id}'] = metrics_diff

    return pd.DataFrame.from_dict(metrics_dict)

In [12]:
# Backtest old and new model on every standard dataset and keep the 0.25 h (15min) metrics.
diff_compared_metrics_025 = compare_models(type_forecast=0.25)

2022-02-09 17:52.06 [info     ] Found 194 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.01683881607499349 num_values=194 pj_id=287
2022-02-09 17:52.06 [info     ] Removed 194 NaN values         num_removed_values=194
2022-02-09 17:52.13 [info     ] Postproces in preparation of storing
2022-02-09 17:52.14 [info     ] Found 194 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.01683881607499349 num_values=194 pj_id=12345
2022-02-09 17:52.14 [info     ] Removed 194 NaN values         num_removed_values=194
2022-02-09 17:52.39 [info     ] Postproces in preparation of storing
2022-02-09 17:52.39 [info     ] Found 194 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.01683881607499349 num_values=194 pj_id=307
2022-02-09 17:52.39 [info     ] Removed 194 NaN values         num_removed_values=194
2022-02-09 

In [13]:
# Same comparison for the 47h horizon. Note: compare_models reruns the backtests on every call.
diff_compared_metrics_47 = compare_models(type_forecast=47.)

2022-02-09 17:56.10 [info     ] Found 194 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.01683881607499349 num_values=194 pj_id=287
2022-02-09 17:56.10 [info     ] Removed 194 NaN values         num_removed_values=194
2022-02-09 17:56.16 [info     ] Postproces in preparation of storing
2022-02-09 17:56.17 [info     ] Found 194 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.01683881607499349 num_values=194 pj_id=12345
2022-02-09 17:56.17 [info     ] Removed 194 NaN values         num_removed_values=194
2022-02-09 17:56.40 [info     ] Postproces in preparation of storing
2022-02-09 17:56.41 [info     ] Found 194 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.01683881607499349 num_values=194 pj_id=307
2022-02-09 17:56.41 [info     ] Removed 194 NaN values         num_removed_values=194
2022-02-09 

The lower the score (range from 0 to 1), the better, so the difference between the metrics (old-new) should be positive if the new model is better.
Looking at the tables and figures, we can see the difference between the different metrics and the amount of data (percentage) within the 95% quantile.

In [14]:
# Comparing the models with a horizon of 15 min: metric differences (old-new) per prediction job
diff_compared_metrics_025

Unnamed: 0,pj_id 287,pj_id 307,pj_id 435,pj_id 438
mae (old-new),0.039131,0.211274,0.012584,0.363365
rmse (old-new),0.06944,0.242911,0.056817,0.490405
r_mae_highest (old-new),0.004245,0.007325,0.000722,0.013522
r_mae_lowest (old-new),0.02247,-0.013807,-0.001071,0.059657
skill_score (old-new),-0.029658,-0.035162,-0.00505,-0.058405
percentage_quantile_old,89.882353,92.882353,92.056906,94.470588
percentage_quantile_new,90.647059,94.882353,92.768228,95.705882


In [15]:
# Comparing the models with a horizon of 47 h: metric differences (old-new) per prediction job
diff_compared_metrics_47

Unnamed: 0,pj_id 287,pj_id 307,pj_id 435,pj_id 438
mae (old-new),-0.060152,-0.127328,0.1297,-0.156898
rmse (old-new),-0.061609,-0.003049,0.228014,-0.099493
r_mae_highest (old-new),-0.006692,-0.003444,0.010714,-0.005601
r_mae_lowest (old-new),-0.060924,-0.071408,-0.051149,-0.035497
skill_score (old-new),0.04559,0.021191,-0.052045,0.025219
percentage_quantile_old,91.647059,96.058824,95.969176,96.764706
percentage_quantile_new,92.294118,100.0,98.043865,100.0


In [16]:
# Grouped bar chart of the first five rows (the old-new metric differences), 15min horizon.
# The subtitle is wrapped in a properly closed <sup>...</sup> tag.
fig = px.bar(diff_compared_metrics_025.iloc[:5], barmode='group',
             title="Difference between metric values from the old and the new model, 15min. <br><sup>Blue dashed line at 0.1, to check whether difference between old and new model is larger than 0.1</sup>",
             labels={'index':'Metrics','value':'Difference'})
# Reference line at 0.1; as last expression of the cell its result is displayed
fig.add_hline(y=0.1, line_width=1, line_dash="dash", line_color="blue")

In [17]:
# Grouped bar chart of the first five rows (the old-new metric differences), 47h horizon.
# The subtitle is wrapped in a properly closed <sup>...</sup> tag.
fig = px.bar(diff_compared_metrics_47.iloc[:5], barmode='group',
             title="Difference between metric values from the old and the new model, 47h. <br><sup>Blue dashed line at 0.1, to check whether difference between old and new model is larger than 0.1</sup>",
             labels={'index':'Metrics','value':'Difference'})
# Reference line at 0.1; as last expression of the cell its result is displayed
fig.add_hline(y=0.1, line_width=1, line_dash="dash", line_color="blue")

In [18]:
# Grouped bar chart of rows 5 and 6 (percentage_quantile_old / _new), 15min horizon.
# The subtitle is wrapped in a properly closed <sup>...</sup> tag.
fig = px.bar(diff_compared_metrics_025.iloc[5:7], barmode='group',
             title="Percentage of data within the 95% confidence interval, 15min. <br><sup> Blue dashed line at 95% to check whether amount of data within confidence interval, is around the 95% </sup>",
             labels={'index':'','value':'Percentage %'}, range_y=[75,100])
# Reference line at 95%; as last expression of the cell its result is displayed
fig.add_hline(y=95, line_width=1, line_dash="dash", line_color="blue")

In [19]:
# Grouped bar chart of rows 5 and 6 (percentage_quantile_old / _new), 47h horizon.
# The subtitle is wrapped in a properly closed <sup>...</sup> tag.
fig = px.bar(diff_compared_metrics_47.iloc[5:7], barmode='group',
             title="Percentage of data within the 95% confidence interval, 47h. <br><sup>Blue dashed line at 95% to check whether amount of data within confidence interval, is around the 95%</sup>",
             labels={'index':'','value':'Percentage %'}, range_y=[75,100])
# Reference line at 95%; as last expression of the cell its result is displayed
fig.add_hline(y=95, line_width=1, line_dash="dash", line_color="blue")