In [1]:
%cd ../..

c:\Users\tacke\OneDrive\Documents\GitHub\Modern-Time-Series-Forecasting-with-Python-2E-1


In [2]:
import os
import time

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"

import warnings
from functools import partial
from pathlib import Path

import humanize
from sklearn.preprocessing import StandardScaler
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.ts_utils import metrics_adapter, forecast_bias,mae, mase, mse
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
import random

random.seed(42)
tqdm.pandas()

In [3]:
# Folder for figures saved by this notebook; created up front so later writes do not fail.
os.makedirs("imgs/chapter_9", exist_ok=True)
# Input (preprocessed data) and output (saved predictions/metrics) folders.
# Both are relative paths, so they resolve against the current working directory.
preprocessed = Path("data/london_smart_meters/preprocessed")
output = Path("data/london_smart_meters/output")

In [4]:
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's standard layout to a Plotly figure.

    Args:
        fig: Figure-like object exposing ``for_each_trace`` and ``update_layout``.
        legends: Optional iterable of legend names. Names are assigned to the
            traces in order and are reused cyclically when there are more
            traces than names.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title (centred).
        font_size: Font size used for the legend, axis titles and tick labels.

    Returns:
        The same ``fig`` object, after the layout has been updated.
    """
    # Imported locally: `cycle` is not imported anywhere else in the notebook,
    # so the legend-renaming branch would otherwise raise NameError.
    from itertools import cycle

    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    # NOTE(review): `titlefont` is the legacy spelling of `title.font`; confirm
    # it is still accepted by the installed plotly version.
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title_text=title,
        title={"x": 0.5, "xanchor": "center", "yanchor": "top"},
        titlefont={"size": 20},
        legend_title=None,
        # Horizontal legend anchored just above the top-right corner of the plot.
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title_text=ylabel,
            titlefont=dict(size=font_size),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title_text=xlabel,
            titlefont=dict(size=font_size),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

# Reading the Validation and Test Predictions and Metrics

In [5]:
try:
    # Read the missing-value-imputed, feature-engineered train and validation splits
    train_df = pd.read_parquet(
        preprocessed / "selected_blocks_train_missing_imputed_feature_engg.parquet"
    )
    # Keep only the timestamp, series id and target, indexed by (timestamp, LCLid)
    train_df = train_df.loc[:, ["timestamp", "LCLid", "energy_consumption"]].set_index(
        ["timestamp", "LCLid"]
    )
    val_df = pd.read_parquet(
        preprocessed / "selected_blocks_val_missing_imputed_feature_engg.parquet"
    )
    val_df = val_df.loc[:, ["timestamp", "LCLid", "energy_consumption"]].set_index(
        ["timestamp", "LCLid"]
    )

    # Train-only target, indexed by timestamp with LCLid as a regular column
    train_target = train_df.reset_index().set_index("timestamp")
    # Combine train and val into a single history; this is the `target_history`
    # later passed to evaluate_ensemble when scoring the test forecasts
    train_val_target = pd.concat([train_df, val_df]).reset_index().set_index("timestamp")

    # Free the intermediate frames; only the *_target frames are kept
    del val_df, train_df
except FileNotFoundError:
    # Only a warning is shown; nothing is re-raised, so later cells that use
    # train_val_target will fail with NameError if the files are missing.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """))

In [6]:
# NOTE: pd.read_pickle unpickles arbitrary objects; only load files that were
# produced by your own runs of the earlier notebooks.
try:
    # --- Test split: ML models, auto-stationary ML models, and baselines ---
    pred_test_df = pd.read_pickle(output / "ml_single_step_prediction_test_df.pkl")
    metrics_test_df = pd.read_pickle(output / "ml_single_step_metrics_test_df.pkl")
    pred_auto_stat_test_df = pd.read_pickle(
        output / "ml_single_step_prediction_auto_stationary_test_df.pkl"
    )
    metrics_auto_stat_test_df = pd.read_pickle(
        output / "ml_single_step_metrics_auto_stationary_test_df.pkl"
    )
    agg_metrics_auto_stat_test_df = pd.read_pickle(
        output / "ml_single_step_aggregate_metrics_auto_stationary_test.pkl"
    )
    pred_baselines_test_df = pd.read_pickle(output / "baseline_test_prediction_df.pkl")
    metrics_baselines_test_df = pd.read_pickle(output / "baseline_test_metrics_df.pkl")
    agg_metrics_baselines_test_df = pd.read_pickle(
        output / "baseline_test_aggregate_metrics.pkl"
    )


    # --- Validation split: same set of artefacts as for the test split ---
    pred_val_df = pd.read_pickle(output / "ml_single_step_prediction_val_df.pkl")
    metrics_val_df = pd.read_pickle(output / "ml_single_step_metrics_val_df.pkl")
    pred_auto_stat_val_df = pd.read_pickle(
        output / "ml_single_step_prediction_auto_stationary_val_df.pkl"
    )
    metrics_auto_stat_val_df = pd.read_pickle(
        output / "ml_single_step_metrics_auto_stationary_val_df.pkl"
    )
    agg_metrics_auto_stat_val_df = pd.read_pickle(
        output / "ml_single_step_aggregate_metrics_auto_stationary_val.pkl"
    )
    pred_baselines_val_df = pd.read_pickle(output / "baseline_val_prediction_df.pkl")
    metrics_baselines_val_df = pd.read_pickle(output / "baseline_val_metrics_df.pkl")
    agg_metrics_baselines_val_df = pd.read_pickle(
        output / "baseline_val_aggregate_metrics.pkl"
    )
except FileNotFoundError:
    # Only a warning is shown; frames that were not loaded before the failure
    # stay undefined, so the cells that use them will raise NameError.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run all notebooks in Chapter08 and 02-Baseline Forecasts using NIXTLA.ipynb in Chapter04
    </div>
    """))

In [7]:
pred_val_df

Unnamed: 0_level_0,predictions,LCLid,Algorithm,energy_consumption
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01 00:00:00,0.131003,MAC000061,Lasso Regression,0.165
2014-01-01 00:30:00,0.114578,MAC000061,Lasso Regression,0.167
2014-01-01 01:00:00,0.121943,MAC000061,Lasso Regression,0.150
2014-01-01 01:30:00,0.112456,MAC000061,Lasso Regression,0.091
2014-01-01 02:00:00,0.073510,MAC000061,Lasso Regression,0.047
...,...,...,...,...
2014-01-31 21:30:00,0.419882,MAC005529,LightGBM,0.431
2014-01-31 22:00:00,0.421487,MAC005529,LightGBM,0.407
2014-01-31 22:30:00,0.409758,MAC005529,LightGBM,0.395
2014-01-31 23:00:00,0.358733,MAC005529,LightGBM,0.398


In [8]:
pred_val_df.head(2)

Unnamed: 0_level_0,predictions,LCLid,Algorithm,energy_consumption
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01 00:00:00,0.131003,MAC000061,Lasso Regression,0.165
2014-01-01 00:30:00,0.114578,MAC000061,Lasso Regression,0.167


In [9]:
pred_auto_stat_val_df.head(2)

Unnamed: 0_level_0,predictions,LCLid,Algorithm,energy_consumption
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01 00:00:00,0.119527,MAC000061,Lasso Regression_auto_stat,0.165
2014-01-01 00:30:00,0.105027,MAC000061,Lasso Regression_auto_stat,0.167


In [10]:
help(pd.melt)

Help on function melt in module pandas.core.reshape.melt:

melt(frame: 'DataFrame', id_vars=None, value_vars=None, var_name=None, value_name: 'Hashable' = 'value', col_level=None, ignore_index: 'bool' = True) -> 'DataFrame'
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
    
    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.
    
    Parameters
    ----------
    id_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar, default None
        Name to use for the 'va

In [11]:
def _melt_baseline_predictions(baseline_df):
    """Reshape wide baseline forecasts into the long layout used by the ML predictions.

    The wide frame has one column per baseline model ("AutoETS", "TBATS").
    The result is indexed by timestamp and has the columns ``LCLid``,
    ``energy_consumption``, ``Algorithm`` and ``predictions``.

    A frame that already has the ``Algorithm`` and ``predictions`` columns is
    returned unchanged. This makes the cell safe to re-run: it overwrites its
    own inputs, and a second melt would fail because ``timestamp`` has already
    been moved to the index.
    """
    if {"Algorithm", "predictions"}.issubset(baseline_df.columns):
        return baseline_df
    return baseline_df.set_index("timestamp").melt(
        id_vars=["LCLid", "energy_consumption"],
        value_vars=["AutoETS", "TBATS"],
        var_name="Algorithm",
        value_name="predictions",
        # Keep the timestamp index instead of resetting to a RangeIndex.
        ignore_index=False,
    )


pred_baselines_val_df = _melt_baseline_predictions(pred_baselines_val_df)
pred_baselines_test_df = _melt_baseline_predictions(pred_baselines_test_df)

In [12]:
pred_baselines_test_df

Unnamed: 0_level_0,LCLid,energy_consumption,Algorithm,predictions
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-02-01 00:00:00,MAC000061,0.066,AutoETS,0.056101
2014-02-01 00:30:00,MAC000061,0.063,AutoETS,0.039241
2014-02-01 01:00:00,MAC000061,0.040,AutoETS,0.024739
2014-02-01 01:30:00,MAC000061,0.020,AutoETS,0.022753
2014-02-01 02:00:00,MAC000061,0.018,AutoETS,0.023229
...,...,...,...,...
2014-02-27 21:30:00,MAC005529,0.412,TBATS,0.568211
2014-02-27 22:00:00,MAC005529,0.389,TBATS,0.539700
2014-02-27 22:30:00,MAC005529,0.414,TBATS,0.496064
2014-02-27 23:00:00,MAC005529,0.404,TBATS,0.438176


In [13]:
# Stack the ML, auto-stationary and baseline validation forecasts (long format).
# Note: this overwrites pred_val_df, so re-running the cell stacks the frames again.
pred_val_df = pd.concat([pred_val_df, pred_auto_stat_val_df, pred_baselines_val_df])
pred_val_df.index.name = "timestamp"

# Wide layout: one row per (LCLid, timestamp), one column per algorithm.
val_wide_keys = ["LCLid", "timestamp"]
pred_wide_val = pred_val_df.reset_index().pivot(
    index=val_wide_keys, columns="Algorithm", values="predictions"
)

# The actuals are repeated for every algorithm in the long frame, so take them
# from a single algorithm's rows and attach them as one extra column.
is_lasso_val = pred_val_df.Algorithm == "Lasso Regression"
actuals_val = (
    pred_val_df.loc[is_lasso_val, ["LCLid", "energy_consumption"]]
    .reset_index()
    .set_index(val_wide_keys)
)
pred_wide_val = pred_wide_val.join(actuals_val)
pred_wide_val.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AutoETS,Lasso Regression,Lasso Regression_auto_stat,LightGBM,LightGBM_auto_stat,TBATS,XGB Random Forest,XGB Random Forest_auto_stat,energy_consumption
LCLid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MAC000061,2014-01-01 00:00:00,0.113623,0.131003,0.119527,0.113324,0.08689,0.179,0.11843,0.092483,0.165
MAC000061,2014-01-01 00:30:00,0.083686,0.114578,0.105027,0.092285,0.074833,0.179,0.106602,0.070679,0.167
MAC000061,2014-01-01 01:00:00,0.066643,0.121943,0.129575,0.098231,0.072314,0.179,0.107403,0.073081,0.15
MAC000061,2014-01-01 01:30:00,0.062717,0.112456,0.120934,0.080759,0.06852,0.179,0.101886,0.051811,0.091
MAC000061,2014-01-01 02:00:00,0.061485,0.07351,0.080307,0.059997,0.054993,0.179,0.070557,0.048696,0.047


In [14]:
# Stack the ML, auto-stationary and baseline test forecasts (long format).
# Note: this overwrites pred_test_df, so re-running the cell stacks the frames again.
pred_test_df = pd.concat([pred_test_df, pred_auto_stat_test_df, pred_baselines_test_df])
pred_test_df.index.name = "timestamp"

# Wide layout: one row per (LCLid, timestamp), one column per algorithm.
test_wide_keys = ["LCLid", "timestamp"]
pred_wide_test = pred_test_df.reset_index().pivot(
    index=test_wide_keys, columns="Algorithm", values="predictions"
)

# The actuals are repeated for every algorithm in the long frame, so take them
# from a single algorithm's rows and attach them as one extra column.
is_lasso_test = pred_test_df.Algorithm == "Lasso Regression"
actuals_test = (
    pred_test_df.loc[is_lasso_test, ["LCLid", "energy_consumption"]]
    .reset_index()
    .set_index(test_wide_keys)
)
pred_wide_test = pred_wide_test.join(actuals_test)
pred_wide_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AutoETS,Lasso Regression,Lasso Regression_auto_stat,LightGBM,LightGBM_auto_stat,TBATS,XGB Random Forest,XGB Random Forest_auto_stat,energy_consumption
LCLid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MAC000061,2014-02-01 00:00:00,0.056101,0.057977,0.069642,0.068784,0.078041,0.015,0.080642,0.063133,0.066
MAC000061,2014-02-01 00:30:00,0.039241,0.05288,0.05601,0.059099,0.057969,0.064,0.062398,0.060236,0.063
MAC000061,2014-02-01 01:00:00,0.024739,0.05557,0.065247,0.057262,0.058248,0.06,0.053866,0.051276,0.04
MAC000061,2014-02-01 01:30:00,0.022753,0.039004,0.045616,0.02567,0.026564,0.059,0.045861,0.031675,0.02
MAC000061,2014-02-01 02:00:00,0.023229,0.02624,0.030788,0.022769,0.018316,0.042,0.03508,0.02692,0.018


In [15]:
# Validation MAE per series: one row per LCLid, one column per algorithm
# (plain and auto-stationary ML models).
metrics_combined_df = pd.concat([metrics_val_df, metrics_auto_stat_val_df]).pivot(
    index="LCLid", columns="Algorithm", values="MAE"
)
metrics_combined_df.head()

Algorithm,Lasso Regression,Lasso Regression_auto_stat,LightGBM,LightGBM_auto_stat,XGB Random Forest,XGB Random Forest_auto_stat
LCLid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MAC000061,0.033292,0.036641,0.030601,0.032544,0.031758,0.035695
MAC000062,0.069267,0.068971,0.074536,0.071494,0.067982,0.071279
MAC000066,0.04144,0.04282,0.039021,0.040884,0.03848,0.042518
MAC000086,0.126435,0.122287,0.103212,0.106092,0.114925,0.114874
MAC000126,0.065666,0.064991,0.064833,0.063282,0.065486,0.063338


# Combining Forecasts

In [16]:
from src.forecasting.ml_forecasting import calculate_metrics
from src.utils import ts_utils

In [17]:
def evaluate_ensemble(pred_wide, target_history, model, target, unique_id):
    """Compute aggregate metrics for one forecast column of a wide prediction frame.

    Args:
        pred_wide: Wide frame of forecasts, one column per model plus the
            ``target`` column. The first index level is used to select each
            series via ``xs`` (assumes that level holds ``unique_id`` — as it
            does for the frames built in this notebook).
        target_history: Frame holding the historical target values, with
            ``unique_id`` and ``target`` as columns; passed to
            ``calculate_metrics`` as ``y_train``.
        model: Name of the forecast column to evaluate.
        target: Name of the column holding the actual values.
        unique_id: Name of the series identifier (e.g. ``"LCLid"``).

    Returns:
        dict with keys ``Algorithm``, ``MAE``, ``MSE``, ``meanMASE`` and
        ``Forecast Bias``. MAE, MSE and bias are computed over all rows at
        once; ``meanMASE`` is the mean of the per-series ``MASE`` values
        returned by ``calculate_metrics``.
    """
    metric_l = []
    for _id in tqdm(pred_wide.reset_index()[unique_id].unique()):
        # Cross-section for a single series (selects on the first index level).
        wide_df = pred_wide.xs(_id)
        test_target = wide_df.loc[:, target]
        y_pred = wide_df.loc[:, model]
        history = target_history.loc[target_history[unique_id] == _id, target]
        metric_l.append(
            calculate_metrics(test_target, y_pred, name=model, y_train=history)
        )
    eval_metrics_df = pd.DataFrame(metric_l)

    # Use the `target` argument for the pooled metrics as well; previously the
    # column name "energy_consumption" was hard-coded here, silently ignoring
    # the parameter.
    y_true_all = pred_wide.loc[:, target]
    y_pred_all = pred_wide.loc[:, model]
    return {
        "Algorithm": model,
        "MAE": ts_utils.mae(y_true_all, y_pred_all),
        "MSE": ts_utils.mse(y_true_all, y_pred_all),
        "meanMASE": eval_metrics_df.loc[:, "MASE"].mean(),
        "Forecast Bias": ts_utils.forecast_bias_aggregate(y_true_all, y_pred_all),
    }


def highlight_abs_min(s, props=""):
    """Styler helper: mark the entry (or entries) with the smallest absolute value.

    Args:
        s: Numeric pandas Series (one styled column or row).
        props: CSS string applied to the matching cells.

    Returns:
        Array of the same length as ``s`` containing ``props`` where
        ``|value|`` equals the minimum absolute value (NaNs ignored) and an
        empty string elsewhere.
    """
    # Compare magnitudes on both sides; comparing the signed values against
    # the minimum magnitude would never match a negative entry.
    abs_values = np.abs(s.values)
    return np.where(abs_values == np.nanmin(abs_values), props, "")

In [18]:
def display_metrics(agg_metrics_l):
    """Show the collected aggregate metrics as a styled table.

    Args:
        agg_metrics_l: List of metric records (dicts) with at least the keys
            ``MAE``, ``MSE``, ``meanMASE`` and ``Forecast Bias``.

    The lowest MAE, MSE and meanMASE are highlighted per column. For
    ``Forecast Bias`` the highlight is delegated to ``highlight_abs_min``.
    """
    number_formats = {
        "MAE": "{:.4f}",
        "MSE": "{:.4f}",
        "meanMASE": "{:.4f}",
        "Forecast Bias": "{:.2f}%",
    }
    error_columns = ["MAE", "MSE", "meanMASE"]

    styled = pd.DataFrame(agg_metrics_l).style.format(number_formats)
    styled = styled.highlight_min(color="lightgreen", subset=error_columns)
    styled = styled.apply(
        highlight_abs_min,
        props="color:black;background-color:lightgreen",
        axis=0,
        subset=["Forecast Bias"],
    )
    display(styled)

In [19]:
ensemble_forecasts = [
    "AutoETS",
    "Lasso Regression",
    "Lasso Regression_auto_stat",
    "LightGBM",
    "LightGBM_auto_stat",
    "TBATS",
    "XGB Random Forest",
    "XGB Random Forest_auto_stat",
]

In [20]:
# Picking LightGBM which is the best single model as the baseline
# NOTE(review): the row is selected by position (iloc 4), not by algorithm name;
# confirm row 4 is still LightGBM if the upstream metrics file changes.
# Later cells append to agg_metrics_l, so re-run this cell to reset the list.
agg_metrics_l = agg_metrics_auto_stat_test_df.iloc[[4]].to_dict(orient="records")

## "Best-Fit"

In [87]:
metrics_combined_df.idxmin(axis=1)

LCLid
MAC000061                      LightGBM
MAC000062             XGB Random Forest
MAC000066             XGB Random Forest
MAC000086                      LightGBM
MAC000126            LightGBM_auto_stat
                        ...            
MAC005336                      LightGBM
MAC005375             XGB Random Forest
MAC005463            LightGBM_auto_stat
MAC005521    Lasso Regression_auto_stat
MAC005529            LightGBM_auto_stat
Length: 150, dtype: object

In [21]:
# Finding the lowest metric for each LCLid
best_alg = metrics_combined_df.idxmin(axis=1)
best_alg.head()

LCLid
MAC000061              LightGBM
MAC000062     XGB Random Forest
MAC000066     XGB Random Forest
MAC000086              LightGBM
MAC000126    LightGBM_auto_stat
dtype: object

In [85]:
best_alg

LCLid
MAC000061                      LightGBM
MAC000062             XGB Random Forest
MAC000066             XGB Random Forest
MAC000086                      LightGBM
MAC000126            LightGBM_auto_stat
                        ...            
MAC005336                      LightGBM
MAC005375             XGB Random Forest
MAC005463            LightGBM_auto_stat
MAC005521    Lasso Regression_auto_stat
MAC005529            LightGBM_auto_stat
Length: 150, dtype: object

In [91]:
# Placeholder columns: the selected forecast and the name of the model it came from
pred_wide_test["best_fit"] = np.nan
pred_wide_test["best_fit_alg"] = ""

# Only series IDs that appear both in the test forecasts and in best_alg can be filled
test_series_ids = pred_wide_test.index.get_level_values(0).unique()
common_ids = test_series_ids.intersection(best_alg.index)

for series_id in tqdm(common_ids):
    # Model chosen for this series
    chosen_alg = best_alg[series_id]
    # Copy that model's test forecast into best_fit (by position, via .values)
    chosen_forecast = pred_wide_test.loc[series_id, chosen_alg].values
    pred_wide_test.loc[series_id, "best_fit"] = chosen_forecast
    # Record which model was used, for traceability
    pred_wide_test.loc[series_id, "best_fit_alg"] = chosen_alg

  0%|          | 0/150 [00:00<?, ?it/s]

In [92]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test, train_val_target, "best_fit", "energy_consumption", "LCLid"
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'best_fit', 'MAE': 0.07390431706406639, 'MSE': 0.02663263717913953, 'meanMASE': 0.8967652251901918, 'Forecast Bias': 0.2611879689477022}


In [93]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Average and Median Ensemble

In [94]:
# ensemble_forecasts is a list of column names(forecast) we want to combine
pred_wide_test["average_ensemble"] = pred_wide_test[ensemble_forecasts].mean(axis=1)
pred_wide_test["median_ensemble"] = pred_wide_test[ensemble_forecasts].median(axis=1)

In [95]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test, train_val_target, "median_ensemble", "energy_consumption", "LCLid"
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)
agg_metric_ = evaluate_ensemble(
    pred_wide_test, train_val_target, "average_ensemble", "energy_consumption", "LCLid"
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'median_ensemble', 'MAE': 0.0754427352736554, 'MSE': 0.027425460554646595, 'meanMASE': 0.9155360903144354, 'Forecast Bias': -0.9248336979884063}


  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'average_ensemble', 'MAE': 0.08210442485295744, 'MSE': 0.02866952346013503, 'meanMASE': 0.9958526643802431, 'Forecast Bias': 2.3660564595367677}


In [96]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Greedy Optimization

In [97]:
from src.forecasting.ensembling import calculate_performance, greedy_optimization

In [98]:
objective = partial(
    calculate_performance, pred_wide=pred_wide_val, target="energy_consumption"
)

In [99]:
solution, best_score = greedy_optimization(objective, ensemble_forecasts)

Solution: ['LightGBM', 'LightGBM_auto_stat'] | Best Score: 0.07594683306959224
Solution: ['LightGBM', 'LightGBM_auto_stat', 'Lasso Regression'] | Best Score: 0.07560064096642514
Solution cannot be improved further. Stopping optimization.


In [100]:
pred_wide_test["greedy_ensemble"] = pred_wide_test[solution].mean(axis=1)

In [101]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test, train_val_target, "greedy_ensemble", "energy_consumption", "LCLid"
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'greedy_ensemble', 'MAE': 0.07327659166265987, 'MSE': 0.02493734969104408, 'meanMASE': 0.8945876879845733, 'Forecast Bias': 0.8052543616063256}


In [102]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Stochastic Hill-climbing with Validation Forecasts

In [103]:
from src.forecasting.ensembling import stochastic_hillclimbing

In [104]:
objective = partial(
    calculate_performance, pred_wide=pred_wide_val, target="energy_consumption"
)

In [105]:
solution, best_score = stochastic_hillclimbing(
    objective, ensemble_forecasts, n_iterations=10, init="best", random_state=42
)

Iteration: 0: Iteration did not improve the score. Solution: ['LightGBM'] | Best Score: 0.07710654371377555
Iteration: 1: Iteration did not improve the score. Solution: ['LightGBM'] | Best Score: 0.07710654371377555
Iteration: 2: Iteration did not improve the score. Solution: ['LightGBM'] | Best Score: 0.07710654371377555
Iteration: 3: Iteration did not improve the score. Solution: ['LightGBM'] | Best Score: 0.07710654371377555
Iteration: 4: Solution: ['LightGBM', 'Lasso Regression_auto_stat'] | Best Score: 0.07643821232368593
Iteration: 5: Iteration did not improve the score. Solution: ['LightGBM', 'Lasso Regression_auto_stat'] | Best Score: 0.07643821232368593
Iteration: 6: Iteration did not improve the score. Solution: ['LightGBM', 'Lasso Regression_auto_stat'] | Best Score: 0.07643821232368593
Iteration: 7: Iteration did not improve the score. Solution: ['LightGBM', 'Lasso Regression_auto_stat'] | Best Score: 0.07643821232368593
Iteration: 8: Iteration did not improve the score. So

In [106]:
pred_wide_test["stochastic_hillclimb__ensemble"] = pred_wide_test[solution].mean(axis=1)

In [107]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "stochastic_hillclimb__ensemble",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'stochastic_hillclimb__ensemble', 'MAE': 0.07509971219218987, 'MSE': 0.02570545894996551, 'meanMASE': 0.9203012272812384, 'Forecast Bias': 1.2056450072988123}


In [108]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Simulated Annealing with Validation Forecasts

In [109]:
from src.forecasting.ensembling import simulated_annealing

In [110]:
objective = partial(
    calculate_performance, pred_wide=pred_wide_val, target="energy_consumption"
)

In [111]:
solution, best_score = simulated_annealing(
    objective,
    ensemble_forecasts,
    p_range=(0.5, 0.0001),
    n_iterations=50,
    init="best",
    temperature_decay="geometric",
    random_state=42,
)

Finding optimum temperature range


  0%|          | 0/100 [00:00<?, ?it/s]

Iteration: 0: Solution: ['LightGBM', 'LightGBM_auto_stat'] | Best Score: 0.07594683306959224
Iteration: 1: Solution: ['LightGBM', 'LightGBM_auto_stat', 'XGB Random Forest'] | Best Score: 0.07596037230411812
Iteration: 2: Solution: ['LightGBM', 'LightGBM_auto_stat', 'XGB Random Forest', 'XGB Random Forest_auto_stat'] | Best Score: 0.07671772971957826
Iteration: 3: Iteration did not improve the score. Solution: ['LightGBM', 'LightGBM_auto_stat', 'XGB Random Forest', 'XGB Random Forest_auto_stat'] | Best Score: 0.07671772971957826
Iteration: 4: Iteration did not improve the score. Solution: ['LightGBM', 'LightGBM_auto_stat', 'XGB Random Forest', 'XGB Random Forest_auto_stat'] | Best Score: 0.07671772971957826
Iteration: 5: Solution: ['LightGBM', 'LightGBM_auto_stat', 'XGB Random Forest', 'XGB Random Forest_auto_stat', 'Lasso Regression_auto_stat'] | Best Score: 0.07641013990755609
Iteration: 6: Iteration did not improve the score. Solution: ['LightGBM', 'LightGBM_auto_stat', 'XGB Random F

In [112]:
pred_wide_test["simulated_annealing_ensemble"] = pred_wide_test[solution].mean(axis=1)

In [113]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "simulated_annealing_ensemble",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'simulated_annealing_ensemble', 'MAE': 0.07399715104394704, 'MSE': 0.025210299344341623, 'meanMASE': 0.905629942031932, 'Forecast Bias': -0.43037081293861956}


In [114]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Optimal Weighted Ensemble

In [115]:
from src.forecasting.ensembling import find_optimal_combination

In [116]:
optimal_weights = find_optimal_combination(
    ensemble_forecasts, pred_wide_val, target="energy_consumption"
)

In [117]:
pd.DataFrame({"Forecast": ensemble_forecasts, "Weights": optimal_weights}).round(
    4
).sort_values("Weights", ascending=False)

Unnamed: 0,Forecast,Weights
3,LightGBM,0.4477
4,LightGBM_auto_stat,0.2789
2,Lasso Regression_auto_stat,0.1372
1,Lasso Regression,0.0861
6,XGB Random Forest,0.0502
0,AutoETS,0.0
5,TBATS,0.0
7,XGB Random Forest_auto_stat,0.0


In [118]:
pred_wide_test["optimal_combination_ensemble"] = np.sum(
    pred_wide_test[ensemble_forecasts].values * np.array(optimal_weights), axis=1
)

In [119]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "optimal_combination_ensemble",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'optimal_combination_ensemble', 'MAE': 0.07304722993078876, 'MSE': 0.024661927897451738, 'meanMASE': 0.8939961158248327, 'Forecast Bias': 0.8576388899422399}


In [120]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Stacking/Blending Model

In [121]:
from sklearn.linear_model import (
    HuberRegressor,
    LassoCV,
    LinearRegression,
    RidgeCV
)

### Linear Regression

In [122]:
stacking_model = LinearRegression(positive=True, fit_intercept=False)
stacking_model.fit(
    pred_wide_val[ensemble_forecasts], pred_wide_val["energy_consumption"]
)

In [123]:
pd.DataFrame({"Forecast": ensemble_forecasts, "Weights": stacking_model.coef_}).round(
    4
).sort_values("Weights", ascending=False)

Unnamed: 0,Forecast,Weights
3,LightGBM,0.4171
2,Lasso Regression_auto_stat,0.2714
1,Lasso Regression,0.2164
4,LightGBM_auto_stat,0.1278
0,AutoETS,0.0
5,TBATS,0.0
6,XGB Random Forest,0.0
7,XGB Random Forest_auto_stat,0.0


In [124]:
pred_wide_test["linear_reg_blending"] = stacking_model.predict(
    pred_wide_test[ensemble_forecasts]
)

In [125]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "linear_reg_blending",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'linear_reg_blending', 'MAE': 0.07548601559252129, 'MSE': 0.024466282247897257, 'meanMASE': 0.9255581907625409, 'Forecast Bias': 4.35782150639159}


In [126]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


### Ridge Regression

In [127]:
stacking_model = RidgeCV()
stacking_model.fit(
    pred_wide_val[ensemble_forecasts], pred_wide_val["energy_consumption"]
)

In [128]:
pd.DataFrame({"Forecast": ensemble_forecasts, "Weights": stacking_model.coef_}).round(
    4
).sort_values("Weights", ascending=False)

Unnamed: 0,Forecast,Weights
3,LightGBM,0.4757
1,Lasso Regression,0.3606
4,LightGBM_auto_stat,0.2618
2,Lasso Regression_auto_stat,0.2139
5,TBATS,0.0001
0,AutoETS,-0.0491
7,XGB Random Forest_auto_stat,-0.0787
6,XGB Random Forest,-0.1658


In [129]:
pred_wide_test["ridge_reg_blending"] = stacking_model.predict(
    pred_wide_test[ensemble_forecasts]
)

In [130]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "ridge_reg_blending",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'ridge_reg_blending', 'MAE': 0.07351889599695442, 'MSE': 0.02423381029684452, 'meanMASE': 0.9053411093968092, 'Forecast Bias': 1.8278486832664775}


In [131]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


In [132]:
# ts_utils.mae(pred_wide_val['energy_consumption'], stacking_model.predict(pred_wide_val[ensemble_forecasts]))

# ts_utils.mae(pred_wide_test['energy_consumption'], stacking_model.predict(pred_wide_test[ensemble_forecasts]))

### Lasso Regression

In [133]:
stacking_model = LassoCV()
stacking_model.fit(
    pred_wide_val[ensemble_forecasts], pred_wide_val["energy_consumption"]
)

In [134]:
pd.DataFrame({"Forecast": ensemble_forecasts, "Weights": stacking_model.coef_}).round(
    4
).sort_values("Weights", ascending=False)

Unnamed: 0,Forecast,Weights
3,LightGBM,0.4561
1,Lasso Regression,0.3251
2,Lasso Regression_auto_stat,0.2263
4,LightGBM_auto_stat,0.2248
5,TBATS,-0.0
0,AutoETS,-0.042
7,XGB Random Forest_auto_stat,-0.0539
6,XGB Random Forest,-0.1133


In [135]:
pred_wide_test["lasso_reg_blending"] = stacking_model.predict(
    pred_wide_test[ensemble_forecasts]
)

In [136]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "lasso_reg_blending",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'lasso_reg_blending', 'MAE': 0.07353886619816896, 'MSE': 0.024236818442988643, 'meanMASE': 0.9059530500565475, 'Forecast Bias': 1.9331469730905373}


In [137]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


### Huber Regression

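To get closer to optimizing MAE, we can also use the Huber Regressor. It minimizes the Huber loss, which is quadratic for small errors and linear for large ones, so it is less sensitive to outliers than the squared error.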

In [138]:
stacking_model = HuberRegressor()
stacking_model.fit(
    pred_wide_val[ensemble_forecasts], pred_wide_val["energy_consumption"]
)

In [139]:
pd.DataFrame({"Forecast": ensemble_forecasts, "Weights": stacking_model.coef_}).round(
    4
).sort_values("Weights", ascending=False)

Unnamed: 0,Forecast,Weights
3,LightGBM,0.4244
4,LightGBM_auto_stat,0.2838
1,Lasso Regression,0.1919
2,Lasso Regression_auto_stat,0.1405
6,XGB Random Forest,0.104
5,TBATS,-0.0017
0,AutoETS,-0.0682
7,XGB Random Forest_auto_stat,-0.109


In [140]:
pred_wide_test["huber_reg_blending"] = stacking_model.predict(
    pred_wide_test[ensemble_forecasts]
)

In [141]:
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "huber_reg_blending",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'huber_reg_blending', 'MAE': 0.07027530746677302, 'MSE': 0.02456886272943696, 'meanMASE': 0.8936010281666488, 'Forecast Bias': -6.401928328734429}


In [142]:
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%


## Bonus: Regularization through Variety

### Correlation as Variety

In [143]:
from src.utils.plotting_utils import plot_correlation_plot

In [144]:
# Pairwise correlation between the base forecasts on the validation set:
# highly correlated forecasts add little variety to an ensemble.
fig = plot_correlation_plot(
    pred_wide_val[ensemble_forecasts].corr(),
    title="Correlation of the forecasts",
    figsize=(800, 800),
)
# NOTE(review): assumes fig is a Plotly figure; static image export then needs
# the kaleido package installed - confirm in the environment setup.
fig.write_image("imgs/chapter_9/correlation.png")
fig.show()

### Using Variety as Regularization

In [145]:
from src.forecasting.ensembling import calculate_diversity

In [146]:
def calculate_diverse_objective(ens, pred_wide, target, diversity_matrix, alpha):
    """Score a candidate ensemble as performance plus a weighted diversity term.

    Args:
        ens: Candidate ensemble, forwarded unchanged to both
            ``calculate_performance`` and ``calculate_diversity``.
        pred_wide: Forecast table forwarded to ``calculate_performance``.
        target: Target identifier forwarded to ``calculate_performance``.
        diversity_matrix: Matrix forwarded to ``calculate_diversity``.
        alpha: Multiplier applied to the diversity term before it is added
            to the performance term.

    Returns:
        ``calculate_performance(ens, pred_wide, target)
        + alpha * calculate_diversity(ens, diversity_matrix)``.
    """
    performance_term = calculate_performance(ens, pred_wide, target)
    diversity_term = calculate_diversity(ens, diversity_matrix)
    return performance_term + alpha * diversity_term

In [147]:
# Bind everything except the candidate ensemble so the search routine can call
# objective(ens) directly.
objective = partial(
    calculate_diverse_objective,
    pred_wide=pred_wide_val,
    target="energy_consumption",
    # Correlation between the validation forecasts is what gets passed on to
    # calculate_diversity.
    diversity_matrix=pred_wide_val[ensemble_forecasts].corr(),
    # Weight of the diversity term relative to the performance term.
    alpha=0.05,
)

In [148]:
# Search over the candidate forecasts using the diversity-regularised objective;
# random_state is fixed so the search is repeatable.
solution, best_score = stochastic_hillclimbing(
    objective, ensemble_forecasts, n_iterations=10, random_state=42
)

Iteration: 0: Solution: ['LightGBM', 'XGB Random Forest'] | Best Score: 0.1264069931887088
Iteration: 1: Iteration did not improve the score. Solution: ['LightGBM', 'XGB Random Forest'] | Best Score: 0.1264069931887088
Iteration: 2: Iteration did not improve the score. Solution: ['LightGBM', 'XGB Random Forest'] | Best Score: 0.1264069931887088
Iteration: 3: Solution: ['LightGBM', 'XGB Random Forest', 'XGB Random Forest_auto_stat'] | Best Score: 0.12561898495208806
Iteration: 4: Solution: ['LightGBM', 'XGB Random Forest', 'XGB Random Forest_auto_stat', 'Lasso Regression_auto_stat'] | Best Score: 0.12407126279799473
Iteration: 5: Iteration did not improve the score. Solution: ['LightGBM', 'XGB Random Forest', 'XGB Random Forest_auto_stat', 'Lasso Regression_auto_stat'] | Best Score: 0.12407126279799473
Iteration: 6: Iteration did not improve the score. Solution: ['LightGBM', 'XGB Random Forest', 'XGB Random Forest_auto_stat', 'Lasso Regression_auto_stat'] | Best Score: 0.124071262797994

In [149]:
# ts_utils.mae(pred_wide_test['energy_consumption'], pred_wide_test[solution].mean(axis=1).values)

In [150]:
# Equal-weight (row-wise mean) combination of the forecasts listed in `solution`,
# stored as a new column (mutates pred_wide_test in place).
pred_wide_test["hillclimbing_w_reg_ensemble"] = pred_wide_test[solution].mean(axis=1)

In [151]:
# Score the diversity-regularised hill-climbing ensemble on the test set with
# the same evaluate_ensemble helper used for the other ensembles.
# NOTE(review): train_val_target is presumably the history needed for MASE
# scaling - confirm against evaluate_ensemble.
agg_metric_ = evaluate_ensemble(
    pred_wide_test,
    train_val_target,
    "hillclimbing_w_reg_ensemble",
    "energy_consumption",
    "LCLid",
)
print(agg_metric_)
# NOTE: append is not idempotent - re-running this cell adds a duplicate entry
# to agg_metrics_l.
agg_metrics_l.append(agg_metric_)

  0%|          | 0/150 [00:00<?, ?it/s]

{'Algorithm': 'hillclimbing_w_reg_ensemble', 'MAE': 0.07468848989291672, 'MSE': 0.025704818725043817, 'meanMASE': 0.9141177704439497, 'Forecast Bias': -0.5959692842252963}


In [152]:
# Show the comparison table built from the metrics collected in agg_metrics_l.
display_metrics(agg_metrics_l)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,LightGBM,0.075,0.0268,0.914,2.61%
1,best_fit,0.0739,0.0266,0.8968,0.26%
2,median_ensemble,0.0754,0.0274,0.9155,-0.92%
3,average_ensemble,0.0821,0.0287,0.9959,2.37%
4,greedy_ensemble,0.0733,0.0249,0.8946,0.81%
5,stochastic_hillclimb__ensemble,0.0751,0.0257,0.9203,1.21%
6,simulated_annealing_ensemble,0.074,0.0252,0.9056,-0.43%
7,optimal_combination_ensemble,0.073,0.0247,0.894,0.86%
8,linear_reg_blending,0.0755,0.0245,0.9256,4.36%
9,ridge_reg_blending,0.0735,0.0242,0.9053,1.83%
