In [1]:
# Move the working directory two levels up; presumably this is the repository root that the
# relative "data/..." paths and "src...." imports below expect — confirm for your checkout.
# NOTE: not idempotent — re-running this cell moves up another two levels.
%cd ../..

c:\Users\tacke\OneDrive\Documents\GitHub\Modern-Time-Series-Forecasting-with-Python-2E-1


In [2]:
import os
import time

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"

import warnings
from pathlib import Path

import humanize
import joblib
from sklearn.preprocessing import StandardScaler
from src.forecasting.ml_forecasting import (
    FeatureConfig,
    MissingValueConfig,
    MLForecast,
    ModelConfig,
    calculate_metrics,
)
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.ts_utils import metrics_adapter, forecast_bias,mae, mase, mse
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
# Seed NumPy's global random number generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)
# Register tqdm's pandas integration (progress-bar variants of pandas apply methods).
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [3]:
# Input location of the preprocessed parquet/pickle files and the location of saved results.
preprocessed = Path("data/london_smart_meters/preprocessed")
output = Path("data/london_smart_meters/output")
# Create the figure directory for this chapter if it does not exist yet.
Path("imgs/chapter_8").mkdir(parents=True, exist_ok=True)

In [4]:
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title=""):
    """Apply the notebook's standard size, title and axis-label layout to a figure.

    Args:
        fig: Figure object exposing ``for_each_trace`` and ``update_layout``
            (a Plotly figure in this notebook).
        legends: Optional sequence of trace names. Names are assigned to the
            traces in order and are cycled if there are fewer names than traces.
        xlabel: Text for the x-axis title.
        ylabel: Text for the y-axis title.
        title: Figure title, centred horizontally.

    Returns:
        The same ``fig`` object that was passed in, after its layout was updated.
    """
    if legends:
        # Local import: `cycle` is not imported anywhere else in this notebook, so the
        # original code raised NameError whenever `legends` was supplied.
        from itertools import cycle

        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # `title.font` / `axis.title.font` replace the deprecated `titlefont` attributes.
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        yaxis=dict(title=dict(text=ylabel, font=dict(size=12))),
        xaxis=dict(title=dict(text=xlabel, font=dict(size=12))),
    )
    return fig

In [5]:
def highlight_abs_min(s, props=""):
    """Return per-cell CSS that marks the entry with the smallest absolute value.

    Intended for ``Styler.apply``: the result has one string per element of ``s``.

    Args:
        s: pandas Series of numeric values (one styled column or row).
        props: CSS string to emit for the entry (or tied entries) whose
            magnitude is the smallest, ignoring NaNs.

    Returns:
        NumPy array of strings: ``props`` where ``|value|`` equals the minimum
        absolute value, ``""`` everywhere else.
    """
    magnitudes = np.abs(s.values)
    # Compare magnitudes with the minimum magnitude. Comparing the signed values instead
    # would never match when the closest-to-zero entry is negative (e.g. a bias of -0.2).
    return np.where(magnitudes == np.nanmin(magnitudes), props, "")

In [6]:
try:
    # Read the missing-value-imputed, feature-engineered train and validation splits
    train_df = pd.read_parquet(
        preprocessed / "selected_blocks_train_missing_imputed_feature_engg.parquet"
    )
    val_df = pd.read_parquet(
        preprocessed / "selected_blocks_val_missing_imputed_feature_engg.parquet"
    )
    # Combine train and val into the new training set (val rows appended after train rows)
    train_df = pd.concat([train_df, val_df])
    del val_df
    # Read the test split
    test_df = pd.read_parquet(
        preprocessed / "selected_blocks_test_missing_imputed_feature_engg.parquet"
    )

    # Transformed target for train+val; joined onto train_df below
    auto_stat_target = pd.read_parquet(
        preprocessed / "selected_blocks_train_val_auto_stat_target.parquet"
    )
    # Object indexed by LCLid further down (transformer_pipelines[lcl_id]).
    # NOTE: joblib.load unpickles arbitrary objects — only load files you produced yourself.
    transformer_pipelines = joblib.load(
        preprocessed / "auto_transformer_pipelines_train_val.pkl"
    )

    # Join the transformed target onto the training rows by (LCLid, timestamp);
    # test_df is not joined with it.
    train_df = (
        train_df.set_index(["LCLid", "timestamp"]).join(auto_stat_target).reset_index()
    )
    # #Renaming energy
    # test_df.rename(columns={"energy_consumption":"energy_consumption_auto_stat"}, inplace=True)
except FileNotFoundError:
    # Only a warning is shown; the variables above stay undefined (or partially defined),
    # so the following cells will fail until the prerequisite notebooks have been run.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb, 02-Dealing with Non-Stationarity.ipynb, and 02a-Dealing with Non-Stationarity-Train+Val.ipynb in Chapter06 and Chapter07
    </div>
    """))

In [7]:
# Number of distinct LCLid values in the combined training frame (a missing id would count once).
train_df["LCLid"].nunique(dropna=False)

150

In [8]:
try:
    # Metrics saved earlier for the non-stationarized ML models; the aggregate frame is the
    # starting point for the comparison table built later in this notebook.
    # NOTE: read_pickle unpickles arbitrary objects — only load files you produced yourself.
    baseline_metrics_df = pd.read_pickle(output / "ml_single_step_metrics_test_df.pkl")
    baseline_aggregate_metrics_df = pd.read_pickle(
        output / "ml_single_step_aggregate_metrics_test.pkl"
    )
except FileNotFoundError:
    # Only a warning is shown; later cells that use the baseline frames will fail.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01a-Forecasting with ML for Test Dataset.ipynb in Chapter08
    </div>
    """))

# Feature Definition

In [9]:
# Declares which columns play which role for the models. How FeatureConfig uses each
# argument is defined in src.forecasting.ml_forecasting — TODO confirm details there.
feat_config = FeatureConfig(
    date="timestamp",
    # The model target is the "_auto_stat" column; the original series is kept separately.
    target="energy_consumption_auto_stat",
    original_target="energy_consumption",
    continuous_features=[
        # Weather columns (also listed under exogenous_features below)
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "humidity",
        # Lag columns: 1-5, 46-50 and 334-338
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        # Rolling mean/std columns (windows 3, 6, 12, 48)
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        # Seasonal rolling mean/std columns (periods 48 and 336)
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        # EWMA columns (spans 2880, 336, 48)
        "energy_consumption_ewma_span_2880",
        "energy_consumption_ewma_span_336",
        "energy_consumption_ewma_span_48",
        # Time-derived columns: elapsed time plus sin/cos terms for month, hour and minute
        "timestamp_Elapsed",
        "timestamp_Month_sin_1",
        "timestamp_Month_sin_2",
        "timestamp_Month_sin_3",
        "timestamp_Month_sin_4",
        "timestamp_Month_sin_5",
        "timestamp_Month_cos_1",
        "timestamp_Month_cos_2",
        "timestamp_Month_cos_3",
        "timestamp_Month_cos_4",
        "timestamp_Month_cos_5",
        "timestamp_Hour_sin_1",
        "timestamp_Hour_sin_2",
        "timestamp_Hour_sin_3",
        "timestamp_Hour_sin_4",
        "timestamp_Hour_sin_5",
        "timestamp_Hour_cos_1",
        "timestamp_Hour_cos_2",
        "timestamp_Hour_cos_3",
        "timestamp_Hour_cos_4",
        "timestamp_Hour_cos_5",
        "timestamp_Minute_sin_1",
        "timestamp_Minute_sin_2",
        "timestamp_Minute_sin_3",
        "timestamp_Minute_sin_4",
        "timestamp_Minute_sin_5",
        "timestamp_Minute_cos_1",
        "timestamp_Minute_cos_2",
        "timestamp_Minute_cos_3",
        "timestamp_Minute_cos_4",
        "timestamp_Minute_cos_5",
    ],
    # The training loop in this notebook calls get_X_y with categorical=False.
    categorical_features=[
        "holidays",
        "precipType",
        "icon",
        "summary",
        "timestamp_Month",
        "timestamp_Quarter",
        "timestamp_WeekDay",
        "timestamp_Dayofweek",
        "timestamp_Dayofyear",
        "timestamp_Hour",
        "timestamp_Minute",
    ],
    boolean_features=[
        "timestamp_Is_quarter_end",
        "timestamp_Is_quarter_start",
        "timestamp_Is_year_end",
        "timestamp_Is_year_start",
        "timestamp_Is_month_start",
    ],
    index_cols=["timestamp"],
    # The training loop in this notebook calls get_X_y with exogenous=False.
    exogenous_features=[
        "holidays",
        "precipType",
        "icon",
        "summary",
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "humidity",
    ],
)

# Missing Value Handling

In [10]:
# Columns to backward-fill before fitting models that request missing-value filling.
# The names must match the feature columns declared in feat_config.continuous_features.
missing_value_config = MissingValueConfig(
    bfill_columns=[
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        # Single underscore before "span", matching the column names used in feat_config
        # (the previous "ewma__span" spelling did not match any configured feature).
        "energy_consumption_ewma_span_2880",
        "energy_consumption_ewma_span_336",
        "energy_consumption_ewma_span_48",
    ],
    ffill_columns=[],
    zero_fill_columns=[],
)

# Running ML Forecast for all consumers

Running Lasso Regression, XGB Random Forest, and LightGBM

In [11]:
def evaluate_model(
    model_config,
    feature_config,
    missing_config,
    target_transformer,
    train_features,
    train_target,
    test_features,
    test_target,
    train_target_original=None,
):
    """Fit one configured model, forecast the test features and score the forecast.

    Args:
        model_config: Model configuration handed to ``MLForecast``; its ``name``
            is passed to ``calculate_metrics`` to label the result.
        feature_config: Feature configuration handed to ``MLForecast``.
        missing_config: Missing-value configuration handed to ``MLForecast``.
        target_transformer: Target transformer handed to ``MLForecast``.
        train_features: Training feature matrix passed to ``fit``.
        train_target: Training target passed to ``fit`` with ``is_transformed=True``.
        test_features: Feature matrix passed to ``predict``.
        test_target: Actual values the predictions are scored against.
        train_target_original: Optional value forwarded to ``calculate_metrics``
            as its last argument.

    Returns:
        Tuple ``(y_pred, metrics, feat_df)``: the output of ``predict``, the
        output of ``calculate_metrics`` and the output of ``feature_importance``.
    """
    # Use the configurations passed by the caller; the previous body silently ignored
    # `feature_config` / `missing_config` and read notebook-level globals instead.
    ml_model = MLForecast(
        model_config=model_config,
        feature_config=feature_config,
        missing_config=missing_config,
        target_transformer=target_transformer,
    )
    ml_model.fit(train_features, train_target, is_transformed=True)
    y_pred = ml_model.predict(test_features)
    feat_df = ml_model.feature_importance()
    metrics = calculate_metrics(
        test_target, y_pred, model_config.name, train_target_original
    )
    return y_pred, metrics, feat_df

In [12]:
from lightgbm import LGBMRegressor
from sklearn.linear_model import LassoCV
from xgboost import XGBRFRegressor

In [13]:
# Distinct LCLid values of the training frame, sorted so the processing order is stable.
lcl_ids = sorted(train_df.LCLid.unique())
# One ModelConfig per algorithm; each one is cloned before use in the training loop.
models_to_run = [
    # Lasso: normalization and missing-value filling are switched on.
    ModelConfig(
        model=LassoCV(), name="Lasso Regression", normalize=True, fill_missing=True
    ),
    # Tree-based models: normalization and missing-value filling are switched off.
    ModelConfig(
        model=XGBRFRegressor(random_state=42, max_depth=4),
        name="XGB Random Forest",
        normalize=False,
        fill_missing=False,
    ),
    ModelConfig(
        model=LGBMRegressor(random_state=42),
        name="LightGBM",
        normalize=False,
        fill_missing=False,
    ),
]

In [14]:
# Display the configured models for inspection.
models_to_run

[ModelConfig(model=LassoCV(), name='Lasso Regression', normalize=True, fill_missing=True, encode_categorical=False, categorical_encoder=None),
 ModelConfig(model=XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=None, device=None,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric=None, feature_types=None, gamma=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=4, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=None, n_jobs=None,
                num_parallel_tree=None, objective='reg:squarederror',
                random_state=42, reg_alpha=None, ...), name='XGB Random Forest', normalize=False, f

In [15]:
# Collect one prediction frame and one metrics record per (LCLid, model) combination.
all_preds = []
all_metrics = []
# Each LCLid is processed independently, so this loop could be parallelized to run faster.
for lcl_id in tqdm(lcl_ids):
    for model_config in models_to_run:
        # Work on a clone so the shared entry in models_to_run is not the object being fitted
        # (presumably clone() returns a fresh, unfitted copy — confirm in ModelConfig).
        model_config = model_config.clone()
        # Build features/targets from this LCLid's rows only; categorical=False and exogenous=False
        # are passed, presumably excluding those feature groups — confirm in FeatureConfig.get_X_y.
        X_train, y_train, y_train_orig = feat_config.get_X_y(train_df.loc[train_df.LCLid==lcl_id,:], categorical=False, exogenous=False)
        # The transformed test target is discarded; scoring uses the original-scale target.
        X_test, _, y_test_orig = feat_config.get_X_y(test_df.loc[test_df.LCLid==lcl_id,:], categorical=False, exogenous=False)
        # Target transformer stored for this LCLid.
        transformer = transformer_pipelines[lcl_id]
        # All warnings raised while fitting/predicting are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            y_pred, metrics, feat_df = evaluate_model(model_config, feat_config, missing_value_config, transformer, X_train, y_train, X_test, y_test_orig, y_train_orig)
        # Turn the prediction Series into a frame tagged with the LCLid and algorithm name.
        y_pred.name = "predictions"
        y_pred = y_pred.to_frame()
        y_pred['LCLid'] = lcl_id
        # The "_auto_stat" suffix distinguishes these runs from the baseline algorithm names.
        y_pred['Algorithm'] = model_config.name + "_auto_stat"
        metrics["LCLid"] = lcl_id
        metrics["Algorithm"] = model_config.name + "_auto_stat"
        # Store the actuals next to the predictions (assigned positionally via .values).
        y_pred['energy_consumption'] = y_test_orig.values
        all_preds.append(y_pred)
        all_metrics.append(metrics)

  0%|          | 0/150 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 36576, number of used features: 59
[LightGBM] [Info] Start training from score 0.549866
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 36576, number of used features: 59
[LightGBM] [Info] Start training from score 0.572629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012888 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 36576, number of used features: 59
[LightGBM] [Info] Start tra

In [16]:
# Stack the per-(LCLid, model) prediction frames into one long frame and preview it.
pred_df = pd.concat(all_preds)
pred_df.head()

Unnamed: 0_level_0,predictions,LCLid,Algorithm,energy_consumption
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-02-01 00:00:00,0.069642,MAC000061,Lasso Regression_auto_stat,0.066
2014-02-01 00:30:00,0.05601,MAC000061,Lasso Regression_auto_stat,0.063
2014-02-01 01:00:00,0.065247,MAC000061,Lasso Regression_auto_stat,0.04
2014-02-01 01:30:00,0.045616,MAC000061,Lasso Regression_auto_stat,0.02
2014-02-01 02:00:00,0.030788,MAC000061,Lasso Regression_auto_stat,0.018


In [17]:
# One row per (LCLid, model) metrics record collected in the training loop; preview it.
metrics_df = pd.DataFrame(all_metrics)
metrics_df.head()

Unnamed: 0,Algorithm,MAE,MSE,MASE,Forecast Bias,LCLid
0,Lasso Regression_auto_stat,0.038421,0.003256,1.101653,-1.298099,MAC000061
1,XGB Random Forest_auto_stat,0.039043,0.003365,1.119484,-0.857442,MAC000061
2,LightGBM_auto_stat,0.0359,0.003157,1.029375,3.41363,MAC000061
3,Lasso Regression_auto_stat,0.069468,0.024895,0.916107,1.095146,MAC000062
4,XGB Random Forest_auto_stat,0.072109,0.025268,0.950944,-0.976155,MAC000062


# Evaluation of ML Forecast

In [18]:
from src.utils import ts_utils

In [19]:
# Display the aggregate metrics loaded from the earlier (baseline) run for reference.
baseline_aggregate_metrics_df

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,Naive,0.085661,0.044766,1.049933,0.017577
1,Seasonal Naive,0.1217,0.071501,1.486788,4.066731
2,Lasso Regression,0.077008,0.025566,0.945963,0.993247
3,XGB Random Forest,0.077147,0.029565,0.928695,-0.757254
4,LightGBM,0.074996,0.026787,0.913991,2.614111


In [20]:
# Start the comparison table from the baseline rows, as a list of one dict per algorithm.
# NOTE: this rebinds `metrics`, which the training loop used for per-model records.
metrics = baseline_aggregate_metrics_df.to_dict(orient="records")

In [21]:
# Aggregate metrics for each auto-stationary model over all LCLids' test predictions.
# The records are collected in a fresh list and `metrics` is rebuilt from the baseline rows,
# so re-running this cell does not append duplicate rows to the comparison table.
auto_stat_metrics = []
for model in models_to_run:
    algorithm = model.name + "_auto_stat"
    pred_mask = pred_df.Algorithm == algorithm
    metric_mask = metrics_df.Algorithm == algorithm
    # Select the actuals and forecasts of this algorithm once and reuse them below.
    actuals = pred_df.loc[pred_mask, "energy_consumption"]
    forecasts = pred_df.loc[pred_mask, "predictions"]
    auto_stat_metrics.append(
        {
            "Algorithm": algorithm,
            "MAE": ts_utils.mae(actuals, forecasts),
            "MSE": ts_utils.mse(actuals, forecasts),
            # MASE is averaged over the per-LCLid values rather than recomputed globally.
            "meanMASE": metrics_df.loc[metric_mask, "MASE"].mean(),
            "Forecast Bias": ts_utils.forecast_bias_aggregate(actuals, forecasts),
        }
    )
metrics = baseline_aggregate_metrics_df.to_dict(orient="records") + auto_stat_metrics

In [22]:
# Comparison table: baseline rows followed by the auto-stationary rows.
agg_metrics_df = pd.DataFrame(metrics)
# Format the numbers, highlight the column minimum for MAE/MSE/meanMASE, and apply
# highlight_abs_min (defined earlier in this notebook) to the Forecast Bias column.
agg_metrics_df.style.format(
    {"MAE": "{:.3f}", "MSE": "{:.3f}", "meanMASE": "{:.3f}", "Forecast Bias": "{:.2f}%"}
).highlight_min(color="lightgreen", subset=["MAE", "MSE", "meanMASE"]).apply(
    highlight_abs_min,
    props="color:black;background-color:lightgreen",
    axis=0,
    subset=["Forecast Bias"],
)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,Naive,0.086,0.045,1.05,0.02%
1,Seasonal Naive,0.122,0.072,1.487,4.07%
2,Lasso Regression,0.077,0.026,0.946,0.99%
3,XGB Random Forest,0.077,0.03,0.929,-0.76%
4,LightGBM,0.075,0.027,0.914,2.61%
5,Lasso Regression_auto_stat,0.083,0.037,1.01,-0.20%
6,XGB Random Forest_auto_stat,0.085,0.036,1.031,-4.04%
7,LightGBM_auto_stat,0.076,0.027,0.931,-1.19%


In [23]:
# Distribution of the per-LCLid MASE values, one overlaid histogram (plus box plot) per algorithm.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="MASE",
        color="Algorithm",
        pattern_shape="Algorithm",
        histnorm="probability density",
        barmode="overlay",
        nbins=500,
        marginal="box",
    ),
    title="Distribution of MASE in the dataset",
    xlabel="MASE",
    ylabel="Probability Density",
)
# Restrict the visible x-axis to the 0 to 2.5 range.
fig.update_layout(xaxis_range=[0, 2.5])
# Optional export: fig.write_image("imgs/chapter_8/mase_dist.png")
fig.show()

In [24]:
# Distribution of the per-LCLid MAE values, one overlaid histogram (plus box plot) per algorithm.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="MAE",
        color="Algorithm",
        pattern_shape="Algorithm",
        histnorm="probability density",
        barmode="overlay",
        nbins=100,
        marginal="box",
    ),
    title="Distribution of MAE in the dataset",
    xlabel="MAE",
    ylabel="Probability Density",
)
# Optional export: fig.write_image("imgs/chapter_8/mae_dist.png")
# Restrict the visible x-axis to the 0 to 0.4 range.
fig.update_layout(xaxis_range=[0, 0.4])
fig.show()

In [25]:
# Distribution of the per-LCLid MSE values, one overlaid histogram (plus box plot) per algorithm.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="MSE",
        color="Algorithm",
        pattern_shape="Algorithm",
        histnorm="probability density",
        barmode="overlay",
        nbins=500,
        marginal="box",
    ),
    title="Distribution of MSE in the dataset",
    xlabel="MSE",
    ylabel="Probability Density",
)
# Restrict the visible x-axis to the 0 to 0.3 range.
fig.update_layout(xaxis_range=[0, 0.3])
# Optional export: fig.write_image("imgs/chapter_8/mse_dist.png")
fig.show()

In [26]:
# Distribution of the per-LCLid Forecast Bias values, one overlaid histogram (plus box plot) per algorithm.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="Forecast Bias",
        color="Algorithm",
        pattern_shape="Algorithm",
        histnorm="probability density",
        barmode="overlay",
        nbins=250,
        marginal="box",
    ),
    title="Distribution of Forecast Bias in the dataset",
    xlabel="Forecast Bias",
    ylabel="Probability Density",
)
# Restrict the visible x-axis to the -50 to 30 range.
fig.update_layout(xaxis_range=[-50, 30])
# Optional export: fig.write_image("imgs/chapter_8/bias_dist.png")
fig.show()

# Saving the Auto-Stationary Forecasts and Metrics

In [27]:
# Results directory; created (including parents) if it does not exist yet.
output = Path("data/london_smart_meters/output")
output.mkdir(parents=True, exist_ok=True)

In [28]:
# Persist the auto-stationary results: per-timestamp predictions, per-LCLid metrics, and the
# aggregate comparison table (which also contains the baseline rows it was built from).
pred_df.to_pickle(output / "ml_single_step_prediction_auto_stationary_test_df.pkl")
metrics_df.to_pickle(output / "ml_single_step_metrics_auto_stationary_test_df.pkl")
agg_metrics_df.to_pickle(
    output / "ml_single_step_aggregate_metrics_auto_stationary_test.pkl"
)