In [1]:
%cd ../..

c:\Users\tacke\OneDrive\Documents\GitHub\Modern-Time-Series-Forecasting-with-Python-2E-1


In [2]:
import os
import random
import time

random.seed(42)
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"

import copy
import warnings
from pathlib import Path

import humanize
from sklearn.preprocessing import StandardScaler
from src.forecasting.ml_forecasting import (
    FeatureConfig,
    MissingValueConfig,
    MLForecast,
    ModelConfig,
    calculate_metrics,
)
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.ts_utils import metrics_adapter, forecast_bias,mae, mase, mse
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
tqdm.pandas()

In [3]:
# Create the chapter's image directory (no-op if it already exists).
os.makedirs("imgs/chapter_10", exist_ok=True)
# Directory the feature-engineered parquet splits are read from below.
preprocessed = Path("data/london_smart_meters/preprocessed")
# Directory the previously saved baseline metrics are read from below.
output = Path("data/london_smart_meters/output")

In [4]:
from itertools import cycle


def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's common layout to a Plotly figure.

    Args:
        fig: Figure to style. It is updated in place.
        legends: Optional sequence of trace names. Names are assigned to the
            traces in order and reused cyclically when there are fewer names
            than traces.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title (centred).
        font_size: Font size used for the legend, axis titles and tick labels.

    Returns:
        The same ``fig`` object, so the call can be chained.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # The deprecated `titlefont` attribute is rejected by current Plotly
        # releases; the font is nested under `title`, which every version with
        # a compound `title` accepts.
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.9,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [5]:
try:
    # Read the missing-value-imputed, feature-engineered train and validation splits
    train_df = pd.read_parquet(preprocessed/"selected_blocks_train_missing_imputed_feature_engg.parquet")
    val_df = pd.read_parquet(preprocessed/"selected_blocks_val_missing_imputed_feature_engg.parquet")

    # Combine train and val into new train
    train_df = pd.concat([train_df, val_df])
    del val_df
    # The test split is kept separate and is used for evaluation below
    test_df = pd.read_parquet(preprocessed/"selected_blocks_test_missing_imputed_feature_engg.parquet")
except FileNotFoundError:
    # Show a readable warning instead of a traceback. NOTE: train_df / test_df
    # may be undefined (or only partly built) after this branch, so the cells
    # that use them will fail until the prerequisite notebook has been run.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """))

### Loading the single step backtesting baselines for validation

In [6]:
try:
    # Aggregate metrics saved by an earlier notebook; used as the reference row
    # of the comparison table built below.
    # NOTE(review): pickle files execute code on load -- only read files you
    # produced yourself.
    baseline_aggregate_metrics_df = pd.read_pickle(output/"ml_single_step_aggregate_metrics_auto_stationary_test.pkl")
except FileNotFoundError:
    # Warn instead of raising; baseline_aggregate_metrics_df stays undefined.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02a-Forecasting with Target Transformation(Test).ipynb in Chapter08
    </div>
    """))

In [7]:
len(train_df.LCLid.unique())

150

# Feature Definition

In [8]:
# Column roles for the global model. The repetitive lag / rolling / EWMA /
# Fourier column names are generated; the resulting lists have exactly the
# same entries, in the same order, as the hand-written originals.
feat_config = FeatureConfig(
    date="timestamp",
    target="energy_consumption",
    continuous_features=[
        # Weather covariates
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "humidity",
        # Lags 1-5, 46-50 and 334-338
        *[
            f"energy_consumption_lag_{lag}"
            for lag in (*range(1, 6), *range(46, 51), *range(334, 339))
        ],
        # Rolling mean / std over windows of 3, 6, 12 and 48 steps
        *[
            f"energy_consumption_rolling_{window}_{stat}"
            for window in (3, 6, 12, 48)
            for stat in ("mean", "std")
        ],
        # Seasonal rolling mean / std for periods 48 and 336
        *[
            f"energy_consumption_{period}_seasonal_rolling_3_{stat}"
            for period in (48, 336)
            for stat in ("mean", "std")
        ],
        # Exponentially weighted moving averages
        *[f"energy_consumption_ewma_span_{span}" for span in (2880, 336, 48)],
        "timestamp_Elapsed",
        # Fourier terms: for each calendar unit, sin_1..sin_5 then cos_1..cos_5
        *[
            f"timestamp_{unit}_{func}_{order}"
            for unit in ("Month", "Hour", "Minute")
            for func in ("sin", "cos")
            for order in range(1, 6)
        ],
    ],
    categorical_features=[
        "holidays",
        "precipType",
        "icon",
        "summary",
        "timestamp_Month",
        "timestamp_Quarter",
        "timestamp_WeekDay",
        "timestamp_Dayofweek",
        "timestamp_Dayofyear",
        "timestamp_Hour",
        "timestamp_Minute",
    ],
    boolean_features=[
        "timestamp_Is_quarter_end",
        "timestamp_Is_quarter_start",
        "timestamp_Is_year_end",
        "timestamp_Is_year_start",
        "timestamp_Is_month_start",
    ],
    index_cols=["LCLid", "timestamp"],
    exogenous_features=[
        "holidays",
        "precipType",
        "icon",
        "summary",
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "humidity",
    ],
)


# Missing Value Handling

In [9]:
# Columns to back-fill when a model is configured with fill_missing=True.
# The EWMA entries previously read "ewma__span_*" (double underscore), which
# does not match the "energy_consumption_ewma_span_*" columns declared in
# feat_config; the names now agree with the feature list.
missing_value_config = MissingValueConfig(
    bfill_columns=[
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        "energy_consumption_ewma_span_2880",
        "energy_consumption_ewma_span_336",
        "energy_consumption_ewma_span_48",
    ],
    ffill_columns=[],
    zero_fill_columns=[],
)

# Training Global ML Model

In [10]:
from src.forecasting.ml_forecasting import calculate_metrics
from src.utils import ts_utils

In [11]:
def train_model(
    model_config,
    feature_config,
    missing_config,
    train_features,
    train_target,
    test_features,
    fit_kwargs=None,
):
    """Fit an ``MLForecast`` model and predict on the test features.

    Args:
        model_config: ``ModelConfig`` describing the estimator to train.
        feature_config: ``FeatureConfig`` describing the columns.
        missing_config: ``MissingValueConfig`` passed through to ``MLForecast``.
        train_features: Feature matrix used for fitting.
        train_target: Target used for fitting.
        test_features: Feature matrix to predict on.
        fit_kwargs: Optional dict forwarded to ``MLForecast.fit`` as its
            ``fit_kwargs`` argument. Defaults to an empty dict.

    Returns:
        Tuple ``(y_pred, feat_df)``: the predictions for ``test_features`` and
        the result of ``MLForecast.feature_importance()``.
    """
    # A fresh dict per call: a mutable `{}` default would be shared between
    # calls and could leak settings from one model to the next.
    if fit_kwargs is None:
        fit_kwargs = {}
    ml_model = MLForecast(
        model_config=model_config,
        feature_config=feature_config,
        missing_config=missing_config,
    )
    ml_model.fit(train_features, train_target, fit_kwargs=fit_kwargs)
    y_pred = ml_model.predict(test_features)
    feat_df = ml_model.feature_importance()
    return y_pred, feat_df

def evaluate_forecast(y_pred, test_target, train_target, model_config):
    """Score a forecast per series and over the whole test set.

    Args:
        y_pred: Predictions, sliceable by series id with ``.xs``.
        test_target: Actuals with the series id as first index level (a
            categorical level) and an ``energy_consumption`` column.
        train_target: Training targets, sliced per series and passed to
            ``calculate_metrics`` as ``y_train``.
        model_config: Config whose ``name`` labels the results.

    Returns:
        Tuple ``(agg_metrics, eval_metrics_df)``: a dict of aggregate metrics
        and a DataFrame with one row of metrics per series.
    """
    # Only the ids actually present in the test index are scored.
    series_ids = (
        test_target.index.get_level_values(0).remove_unused_categories().categories
    )
    per_series_metrics = []
    for series_id in tqdm(series_ids, desc="Calculating metrics..."):
        per_series_metrics.append(
            calculate_metrics(
                test_target.xs(series_id),
                y_pred.xs(series_id),
                name=model_config.name,
                y_train=train_target.xs(series_id),
            )
        )
    eval_metrics_df = pd.DataFrame(per_series_metrics)

    # Aggregate metrics are computed over all series at once; MASE is the
    # mean of the per-series values.
    agg_metrics = {"Algorithm": model_config.name}
    agg_metrics["MAE"] = ts_utils.mae(test_target['energy_consumption'], y_pred)
    agg_metrics["MSE"] = ts_utils.mse(test_target['energy_consumption'], y_pred)
    agg_metrics["meanMASE"] = eval_metrics_df.loc[:, "MASE"].mean()
    agg_metrics["Forecast Bias"] = ts_utils.forecast_bias_aggregate(
        test_target['energy_consumption'], y_pred
    )
    return agg_metrics, eval_metrics_df

In [12]:
# Start with empty containers so both names exist even if the baseline lookup
# below fails (e.g. when the baseline file could not be loaded above).
metric_record = []
# Per-household metric frames, keyed by model name.
individual_metrics = dict()

# Seed the comparison table with the row at position 4 of the saved baseline
# table (shown as "LightGBM" in the outputs below).
metric_record = (
    baseline_aggregate_metrics_df.iloc[[4]]
    .to_dict(orient="records")
)

## Baseline

In [13]:
# Work on a copy so the shared feat_config is left untouched.
_feat_config = copy.deepcopy(feat_config)

# Feature / target matrices for the combined train+validation data
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)
# Evaluation matrices are built from test_df (the held-out test split)
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)

# Actuals; each model's predictions are joined to this frame further down.
pred_df = test_target.copy()

# Categorical columns that are actually present in the feature matrix
cat_features = set(train_features.columns).intersection(_feat_config.categorical_features)

In [14]:
from lightgbm import LGBMRegressor
# Baseline global model: LightGBM with library defaults and a fixed seed.
model_config = ModelConfig(
    model=LGBMRegressor(random_state=42),
    name="GFM Baseline",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
)

In [15]:
# Only training + prediction are timed; metric calculation is outside the timer.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
    )
agg_metrics, eval_metrics_df = evaluate_forecast(
    y_pred, test_target, train_target, model_config
)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-household metrics and the predictions.
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.322294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8430
[LightGBM] [Info] Number of data points in the train set: 4517040, number of used features: 73
[LightGBM] [Info] Start training from score 0.195836
Time Elapsed: 47 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [16]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627


In [17]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/baseline_fimp.png")
fig.show()

## With Metadata

In [18]:
# Rebuild the feature config with the household metadata columns added as
# categorical features. The dict is a deep copy, so feat_config is unchanged.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# presumably a derived attribute that FeatureConfig does not accept as an
# init argument -- TODO confirm
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "Acorn_grouped", "LCLid"]
_feat_config = FeatureConfig(**feat_conf_dict)

train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)
# Evaluation matrices are built from test_df (the held-out test split)
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)

# Categorical columns that are actually present in the feature matrix
cat_features = set(train_features.columns).intersection(_feat_config.categorical_features)

### CountEncoder

In [19]:
from category_encoders import CountEncoder
from lightgbm import LGBMRegressor

# Count-encode the categorical columns selected above.
cat_encoder = CountEncoder(cols=cat_features)

model_config = ModelConfig(
    model=LGBMRegressor(random_state=42),
    name="GFM+Meta (CountEncoder)",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
    # presumably makes MLForecast apply `categorical_encoder` before fitting
    # and predicting -- TODO confirm against ModelConfig
    encode_categorical=True,
    categorical_encoder=cat_encoder
)

In [20]:
# Only training + prediction are timed; metric calculation is outside the timer.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
    )
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-household metrics and the predictions.
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.651045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8375
[LightGBM] [Info] Number of data points in the train set: 4517040, number of used features: 77
[LightGBM] [Info] Start training from score 0.195836
Time Elapsed: 1 minute and 53 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [21]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886


In [22]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/baseline_w_meta_cnt_encoder_fimp.png")
fig.show()

### Target Encoding

In [23]:
from category_encoders import TargetEncoder
from lightgbm import LGBMRegressor

# Target-encode the categorical columns selected above.
cat_encoder = TargetEncoder(cols=cat_features)

model_config = ModelConfig(
    model=LGBMRegressor(random_state=42),
    # NOTE(review): this name contains two spaces before "(", unlike the
    # CountEncoder name; it is used as a key in individual_metrics.
    name="GFM+Meta  (TargetEncoder)",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
    # presumably makes MLForecast apply `categorical_encoder` before fitting
    # and predicting -- TODO confirm against ModelConfig
    encode_categorical=True,
    categorical_encoder=cat_encoder
)

In [24]:
# Only training + prediction are timed; metric calculation is outside the timer.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
    )
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-household metrics and the predictions.
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.639670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8605
[LightGBM] [Info] Number of data points in the train set: 4517040, number of used features: 77
[LightGBM] [Info] Start training from score 0.195836
Time Elapsed: 1 minute and 38 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [25]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886
3,GFM+Meta (TargetEncoder),0.077324,0.027615,0.959399,1.036303,98.317032


In [26]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/baseline_w_meta_tgt_encoder_fimp.png")
fig.show()

### Native LightGBM Encoding

In [27]:
from lightgbm import LGBMRegressor
# No external encoder here: the categorical columns are handed to LightGBM
# through `categorical_feature` in the fit call of the next cell.
model_config = ModelConfig(
    model=LGBMRegressor(random_state=42),
    # NOTE(review): this name contains two spaces before "(".
    name="GFM+Meta  (NativeLGBM)",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
)

In [28]:
# Only training + prediction are timed; metric calculation is outside the timer.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        # Forwarded to the fit call so LightGBM treats these columns as categorical
        fit_kwargs=dict(categorical_feature=cat_features),
    )
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-household metrics and the predictions.
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.814625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8601
[LightGBM] [Info] Number of data points in the train set: 4517040, number of used features: 77
[LightGBM] [Info] Start training from score 0.195836
Time Elapsed: 56 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [None]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886
3,GFM+Meta (TargetEncoder),0.077324,0.027615,0.959399,1.036303,98.317032
4,GFM+Meta (NativeLGBM),0.0771,0.027499,0.956906,0.716768,56.116124


In [30]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/baseline_w_meta_native_lgbm_fimp.png")
fig.show()

## Hyperparameter Tuning

In [31]:
# Same metadata-augmented feature config as in the "With Metadata" section,
# rebuilt so this section can be run on its own.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# presumably a derived attribute that FeatureConfig does not accept as an
# init argument -- TODO confirm
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "Acorn_grouped", "LCLid"]
_feat_config = FeatureConfig(**feat_conf_dict)

train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)
# Evaluation matrices are built from test_df (the held-out test split)
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)

# Categorical columns that are actually present in the feature matrix
cat_features = set(train_features.columns).intersection(_feat_config.categorical_features)

In [32]:
# LightGBM hyperparameters used for every "Tuned ..." model below.
# NOTE(review): presumably the result of a tuning run that is not part of
# this notebook -- the search itself is not shown here.
best_params = {
    "num_leaves": 99,
    # L1 (absolute error) objective
    "objective": "regression_l1",
    "colsample_bytree": 0.9786759775515064,
    "lambda_l1": 8.160098582954642,
    "lambda_l2": 0.17840888757497253,
    "random_state": 42,
}

In [33]:
from lightgbm import LGBMRegressor
# Global model with metadata features, using the tuned hyperparameters.
model_config = ModelConfig(
    model=LGBMRegressor(**best_params),
    name="Tuned GFM+Meta",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
)

In [34]:
# Only training + prediction are timed; metric calculation is outside the timer.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        # Forwarded to the fit call so LightGBM treats these columns as categorical
        fit_kwargs=dict(categorical_feature=cat_features)
    )
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-household metrics and the predictions.
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.344373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8601
[LightGBM] [Info] Number of data points in the train set: 4517040, number of used features: 77
[LightGBM] [Info] Start training from score 0.112000
Time Elapsed: 1 minute and 28 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [35]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886
3,GFM+Meta (TargetEncoder),0.077324,0.027615,0.959399,1.036303,98.317032
4,GFM+Meta (NativeLGBM),0.0771,0.027499,0.956906,0.716768,56.116124
5,Tuned GFM+Meta,0.070846,0.03023,0.854128,-12.208812,88.727535


In [36]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/tuned_meta_fimp.png")
fig.show()

## Partitioning

### Random

We can partition the households randomly into groups and train one model per group.

In [37]:
# Metadata-augmented feature config used by every random partition.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# presumably a derived attribute that FeatureConfig does not accept as an
# init argument -- TODO confirm
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "LCLid", "Acorn_grouped"]
_feat_config = FeatureConfig(**feat_conf_dict)

In [38]:
from lightgbm import LGBMRegressor
# Template config; it is cloned once per partition in the training loop.
model_config = ModelConfig(
    # verbose=-1 lowers LightGBM's log verbosity for the per-partition fits
    model=LGBMRegressor(**best_params, verbose=-1),
    name="Tuned GFM+Meta+Random Part",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
)

In [39]:
def partition(list_in, n):
    """Randomly split the items of ``list_in`` into ``n`` groups.

    The items are shuffled with the global ``random`` generator and then dealt
    out round-robin, so group sizes differ by at most one.

    Args:
        list_in: Iterable of items to split. It is not modified.
        n: Number of groups to create.

    Returns:
        A list of ``n`` lists that together contain every item exactly once.
    """
    # Shuffle a copy: shuffling `list_in` itself would reorder the caller's list.
    shuffled = list(list_in)
    random.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]

# Split the household ids into 3 random groups. The split depends on the state
# of the global `random` generator (seeded at the top of the notebook).
partitions = partition(train_df.LCLid.cat.categories.tolist(), 3)

In [40]:
# Per-partition predictions, feature importances and training times
y_pred_l = []
feat_df_l = []
time_elapsed_l = []
for lclids in tqdm(partitions, desc="Training groups..."):
    # Restrict train and test data to the households of this partition
    _train_df = train_df.loc[train_df.LCLid.isin(lclids)]
    _test_df = test_df.loc[test_df.LCLid.isin(lclids)]
    train_features, train_target, train_original_target = _feat_config.get_X_y(
        _train_df, categorical=True, exogenous=False
    )
    # Evaluation matrices come from the test split of the same households
    test_features, test_target, test_original_target = _feat_config.get_X_y(
        _test_df, categorical=True, exogenous=False
    )
    cat_features = set(train_features.columns).intersection(
        _feat_config.categorical_features
    )
    # One model per partition, each from its own clone of the template config
    _model_config = model_config.clone()
    with LogTime() as timer:
        y_pred, feat_df = train_model(
            _model_config,
            _feat_config,
            missing_value_config,
            train_features,
            train_target,
            test_features,
            fit_kwargs=dict(categorical_feature=cat_features),
        )
    y_pred_l.append(y_pred)
    feat_df_l.append(feat_df)
    time_elapsed_l.append(timer.elapsed)

Training groups...:   0%|          | 0/3 [00:00<?, ?it/s]

Time Elapsed: 29 seconds
Time Elapsed: 34 seconds
Time Elapsed: 27 seconds


In [41]:
# Stitch the per-partition predictions back into one forecast
y_pred = pd.concat(y_pred_l)

# Rebuild the full (un-partitioned) targets so metrics cover all households
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)

agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
# Total training time is the sum over the partitions
agg_metrics["Time Elapsed"] = np.sum(time_elapsed_l)
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
# NOTE(review): unlike the other sections, these predictions are not joined
# to pred_df (the join below is disabled) -- confirm this is intended.
#pred_df = pred_df.join(y_pred)

Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [42]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886
3,GFM+Meta (TargetEncoder),0.077324,0.027615,0.959399,1.036303,98.317032
4,GFM+Meta (NativeLGBM),0.0771,0.027499,0.956906,0.716768,56.116124
5,Tuned GFM+Meta,0.070846,0.03023,0.854128,-12.208812,88.727535
6,Tuned GFM+Meta+Random Part,0.071288,0.033717,0.851427,-13.067287,91.167622


In [43]:
# Averaging feature importance across partitions (rough approximation).
# The per-partition tables are stacked and averaged per feature, then sorted
# so that `feat_df.head(15)` really shows the most important features.
# Unlike the previous merge-based version this
#   * leaves `feat_df_l` untouched, so the cell can be re-run,
#   * computes a mean (as the plot title says) instead of a sum,
#   * does not rely on the un-formatted "_{i}" merge suffix.
# A feature missing from some partition is averaged over the partitions in
# which it appears.
feat_df = (
    pd.concat(feat_df_l, ignore_index=True)
    .groupby("feature", as_index=False, sort=False, observed=True)["importance"]
    .mean()
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

feat_df = feat_df.loc[:, ["feature", "importance"]]

In [44]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Average Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/random_partition_fimp.png")
fig.show()

### Judgmental

We can partition based on ACORN Groups

In [45]:
# Metadata-augmented feature config for the ACORN partitions. "Acorn_grouped"
# is not added as a feature here: it is the partitioning key in the loop below.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# presumably a derived attribute that FeatureConfig does not accept as an
# init argument -- TODO confirm
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "LCLid"]
_feat_config = FeatureConfig(**feat_conf_dict)

In [46]:
from lightgbm import LGBMRegressor
# Template config; it is cloned once per ACORN group in the training loop.
model_config = ModelConfig(
    # verbose=-1 lowers LightGBM's log verbosity for the per-partition fits
    model=LGBMRegressor(**best_params, verbose=-1),
    name="Tuned GFM+Meta+ACORN Part",
    # LGBM is not sensitive to normalized data
    normalize=False,
    # LGBM can handle missing values
    fill_missing=False,
)

In [47]:
# Per-partition predictions, feature importances and training times
y_pred_l = []
feat_df_l = []
time_elapsed_l = []
for acn in tqdm(train_df["Acorn_grouped"].unique(), desc="Training groups..."):
    # Restrict train and test data to the households of this ACORN group
    _train_df = train_df.loc[train_df.Acorn_grouped == acn]
    _test_df = test_df.loc[test_df.Acorn_grouped == acn]
    train_features, train_target, train_original_target = _feat_config.get_X_y(
        _train_df, categorical=True, exogenous=False
    )
    # Evaluation matrices come from the test split of the same households
    test_features, test_target, test_original_target = _feat_config.get_X_y(
        _test_df, categorical=True, exogenous=False
    )
    cat_features = set(train_features.columns).intersection(
        _feat_config.categorical_features
    )
    # One model per group, each from its own clone of the template config
    _model_config = model_config.clone()
    with LogTime() as timer:
        y_pred, feat_df = train_model(
            _model_config,
            _feat_config,
            missing_value_config,
            train_features,
            train_target,
            test_features,
            fit_kwargs=dict(categorical_feature=cat_features),
        )
    y_pred_l.append(y_pred)
    feat_df_l.append(feat_df)
    time_elapsed_l.append(timer.elapsed)

Training groups...:   0%|          | 0/3 [00:00<?, ?it/s]

Time Elapsed: 33 seconds
Time Elapsed: 30 seconds
Time Elapsed: 31 seconds


In [48]:
# Stitch the per-group predictions back into one forecast
y_pred = pd.concat(y_pred_l)

# Rebuild the full (un-partitioned) targets so metrics cover all households
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)

agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
# Total training time is the sum over the groups
agg_metrics["Time Elapsed"] = np.sum(time_elapsed_l)
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [49]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886
3,GFM+Meta (TargetEncoder),0.077324,0.027615,0.959399,1.036303,98.317032
4,GFM+Meta (NativeLGBM),0.0771,0.027499,0.956906,0.716768,56.116124
5,Tuned GFM+Meta,0.070846,0.03023,0.854128,-12.208812,88.727535
6,Tuned GFM+Meta+Random Part,0.071288,0.033717,0.851427,-13.067287,91.167622
7,Tuned GFM+Meta+ACORN Part,0.069925,0.029883,0.842475,-12.253253,94.760892


In [50]:
# Averaging feature importance across partitions (rough approximation).
# The per-partition tables are stacked and averaged per feature, then sorted
# so that `feat_df.head(15)` really shows the most important features.
# Unlike the previous merge-based version this
#   * leaves `feat_df_l` untouched, so the cell can be re-run,
#   * computes a mean (as the plot title says) instead of a sum,
#   * does not rely on the un-formatted "_{i}" merge suffix.
# A feature missing from some partition is averaged over the partitions in
# which it appears.
feat_df = (
    pd.concat(feat_df_l, ignore_index=True)
    .groupby("feature", as_index=False, sort=False, observed=True)["importance"]
    .mean()
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

feat_df = feat_df.loc[:, ["feature", "importance"]]

In [51]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Average Feature Importance - {model_config.name}", font_size=12)
# fig.write_image("imgs/chapter_10/acorn_partition_fimp.png")
fig.show()

### Algorithmic

In [52]:
# Metadata-augmented feature config used by the cluster-based partitions.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# presumably a derived attribute that FeatureConfig does not accept as an
# init argument -- TODO confirm
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "LCLid", "Acorn_grouped"]
_feat_config = FeatureConfig(**feat_conf_dict)

#### Creating Statistical Features for the different households

In [53]:
import tsfel
# Feature configuration: tsfel's "statistical" domain merged with "temporal"
cfg = tsfel.get_features_by_domain("statistical")
cfg = {**cfg, **tsfel.get_features_by_domain("temporal")}

uniq_ids = train_df.LCLid.cat.categories

# One tsfel result per household, computed on its training consumption only.
# NOTE: `stat_df` is a list here and is rebound to a DataFrame after the loop.
stat_df = []
for id_ in tqdm(uniq_ids, desc="Calculating features for all households"):
    ts = train_df.loc[train_df.LCLid==id_, "energy_consumption"]
    res = tsfel.time_series_features_extractor(cfg, ts, verbose=False)
    res['LCLid'] = id_
    stat_df.append(res)

# Stack the per-household results and index them by household id
stat_df = pd.concat(stat_df).set_index("LCLid")
del res
stat_df.head()

Calculating features for all households:   0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0_level_0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Average power,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,...,0_Peak to peak distance,0_Positive turning points,0_Root mean square,0_Signal distance,0_Skewness,0_Slope,0_Standard deviation,0_Sum absolute diff,0_Variance,0_Zero crossing rate
LCLid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAC000061,886.910345,45.158453,886.910345,2.424909,181.806905,7315.0,29260.0,0.048,0.209,2.7e-05,...,1.152,8318.0,0.155719,36646.964689,1.220428,1.276085e-07,0.094891,1275.586345,0.009004,106.0
MAC000062,2782.600473,88.538017,2782.600473,7.60793,193.866143,7315.0,29260.0,0.159,0.297,2.7e-05,...,2.725,12108.0,0.275821,36885.53967,4.480394,1.198743e-06,0.132204,2773.450059,0.017478,0.0
MAC000066,1188.012541,51.981092,1188.012541,3.248155,183.874201,7315.0,29260.0,0.057,0.174,2.7e-05,...,1.935,11370.0,0.180224,36723.148859,4.215491,1.935651e-07,0.110824,1419.159212,0.012282,0.0
MAC000086,1326.855101,45.731688,1326.855101,3.627765,187.065434,7315.0,29260.0,0.045,0.164,2.7e-05,...,1.82,11005.0,0.190464,36980.553684,4.434881,7.331446e-07,0.143676,3264.343537,0.020643,0.0
MAC000126,1370.327761,40.57347,1370.327761,3.746624,175.971707,7315.0,29260.0,0.025,0.157,2.7e-05,...,2.005673,11478.0,0.193559,36890.424891,3.795566,-2.038497e-07,0.158636,2420.108593,0.025165,1072.0


#### Clustering the different households

In [54]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from src.utils.data_utils import replace_array_in_dataframe
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding

In [55]:
# Put every feature on a common scale so no single one dominates the distances
scaled_values = StandardScaler().fit_transform(stat_df)
X_std = replace_array_in_dataframe(stat_df, scaled_values)
# Non-linear dimensionality reduction: 2-D t-SNE embedding of the scaled features
tsne = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate="auto",
    init="pca",
    random_state=42,
    metric="cosine",
)
X_tsne = tsne.fit_transform(X_std.values)
# Group the embedded points into 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_tsne)
# Cluster label per row, keyed by the same index as the feature table
cluster_df = pd.Series(kmeans.labels_, index=X_std.index)

In [56]:
# 2-D t-SNE coordinates with the household id (the index of stat_df) as a column
plot_df = pd.DataFrame(X_tsne, columns=["dim_1", "dim_2"], index=stat_df.index).reset_index()
plot_df["clusters"] = kmeans.labels_
# String labels so plotly colours the clusters as discrete groups
plot_df["clusters"] = plot_df["clusters"].astype(str)

fig = px.scatter(plot_df, x="dim_1", y="dim_2", color="clusters", symbol="clusters", hover_name="LCLid")
# The y axis shows dim_2, so it is labelled "Dimension 2"
format_plot(fig, xlabel="Dimension 1", ylabel="Dimension 2", title="Clustered t-SNE", font_size=12, legends=["Cluster 1", "Cluster 2", "Cluster 3"])
fig.show()

In [57]:
# Scatter of the t-SNE embedding, one colour/symbol per KMeans cluster
fig = px.scatter(plot_df, x="dim_1", y="dim_2", color="clusters", symbol="clusters", hover_name="LCLid")
# The y axis shows dim_2, so it is labelled "Dimension 2"
format_plot(fig, xlabel="Dimension 1", ylabel="Dimension 2", title="Clustered t-SNE", font_size=12, legends=["Cluster 1", "Cluster 2", "Cluster 3"])
# fig.write_image("imgs/chapter_10/clusters_tsne.png")
fig.show()

#### Using the clusters to partition

In [58]:
from lightgbm import LGBMRegressor
# LightGBM regressor built from the tuned hyperparameters, with logging silenced
tuned_lgbm = LGBMRegressor(**best_params, verbose=-1)
model_config = ModelConfig(
    model=tuned_lgbm,
    name="Tuned GFM+Meta+Clustered Part",
    normalize=False,  # LGBM is not sensitive to normalized data
    fill_missing=False,  # LGBM can handle missing values
)

In [59]:
# Per-partition results: forecasts, feature importances and training time
y_pred_l, feat_df_l, time_elapsed_l = [], [], []
# Train one model per cluster, using only the households assigned to it
for cluster_id in tqdm(cluster_df.unique(), desc="Training groups..."):
    member_ids = cluster_df.loc[cluster_df == cluster_id].index
    _train_df = train_df.loc[train_df.LCLid.isin(member_ids)]
    _test_df = test_df.loc[test_df.LCLid.isin(member_ids)]
    train_features, train_target, train_original_target = _feat_config.get_X_y(
        _train_df, categorical=True, exogenous=False
    )
    # Features/targets for the test rows of the same households
    test_features, test_target, test_original_target = _feat_config.get_X_y(
        _test_df, categorical=True, exogenous=False
    )
    # Keep only the categorical features that are present in the feature matrix
    cat_features = set(train_features.columns).intersection(
        _feat_config.categorical_features
    )
    # Work on a clone so each partition gets its own copy of the model config
    _model_config = model_config.clone()
    with LogTime() as timer:
        y_pred, feat_df = train_model(
            _model_config,
            _feat_config,
            missing_value_config,
            train_features,
            train_target,
            test_features,
            fit_kwargs={"categorical_feature": cat_features},
        )
    y_pred_l.append(y_pred)
    feat_df_l.append(feat_df)
    time_elapsed_l.append(timer.elapsed)

Training groups...:   0%|          | 0/3 [00:00<?, ?it/s]

Time Elapsed: 44 seconds
Time Elapsed: 44 seconds
Time Elapsed: 29 seconds


In [60]:
# Stitch the per-cluster forecasts back together into one object
y_pred = pd.concat(y_pred_l)

# Rebuild features/targets from the full test and train frames; the training
# loop left these names pointing at the last cluster's subset only
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)

# Score the combined forecast against the full test target
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
# Reported time is the total over all partition models
agg_metrics["Time Elapsed"] = np.sum(time_elapsed_l)
# NOTE(review): the append/join below mutate shared notebook state, so re-running
# this cell presumably adds a duplicate metrics row and a clashing forecast column — confirm
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
pred_df = pred_df.join(y_pred)

Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [61]:
# Tabulate everything collected in metric_record so far, one row per entry
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.074996,0.026787,0.913991,2.614111,
1,GFM Baseline,0.077269,0.027735,0.959029,0.715389,47.65627
2,GFM+Meta (CountEncoder),0.077226,0.027683,0.960678,0.701905,113.3886
3,GFM+Meta (TargetEncoder),0.077324,0.027615,0.959399,1.036303,98.317032
4,GFM+Meta (NativeLGBM),0.0771,0.027499,0.956906,0.716768,56.116124
5,Tuned GFM+Meta,0.070846,0.03023,0.854128,-12.208812,88.727535
6,Tuned GFM+Meta+Random Part,0.071288,0.033717,0.851427,-13.067287,91.167622
7,Tuned GFM+Meta+ACORN Part,0.069925,0.029883,0.842475,-12.253253,94.760892
8,Tuned GFM+Meta+Clustered Part,0.069661,0.029461,0.839982,-12.355522,118.986002


In [62]:
# Averaging feature importance across partitions (Dirty Approximation)
# Stack the per-partition importance tables and average per feature. The list is
# only read (not popped), so the cell gives the same result when re-run.
n_partitions = len(feat_df_l)
feat_df = (
    pd.concat(feat_df_l, ignore_index=True)
    .groupby("feature", as_index=False)["importance"]
    .sum()
)
# Divide by the number of partitions so the value is a true average; a feature
# absent from a partition's table therefore counts as zero importance there
feat_df["importance"] = feat_df["importance"] / n_partitions

feat_df = feat_df.loc[:, ["feature", "importance"]].sort_values("importance", ascending=False)

In [63]:
# Bar chart of the first 15 rows of the importance table (sorted in descending order)
top_features = feat_df.head(15)
fig = px.bar(top_features, x="feature", y="importance")
format_plot(
    fig,
    xlabel="Features",
    ylabel="Importance",
    title=f"Average Feature Importance - {model_config.name}",
    font_size=12,
)
# fig.write_image("imgs/chapter_10/clustered_partition_fimp.png")
fig.show()

## Summary

In [64]:
def highlight_abs_min(s, props=''):
    """Return per-cell CSS marking the entry of ``s`` that is closest to zero.

    Written for use with ``Styler.apply``, which passes one column (or row)
    at a time and expects an array of CSS strings of the same length.

    Args:
        s: numeric pandas Series.
        props: CSS declaration string applied to the selected cell(s).

    Returns:
        numpy array of strings with ``props`` where the absolute value equals
        the smallest absolute value in ``s`` (NaNs ignored) and ``''`` elsewhere.
    """
    abs_vals = np.abs(s.values)
    # Compare magnitudes on both sides: testing the raw values against the
    # smallest magnitude never matches when the value closest to zero is negative
    return np.where(abs_vals == np.nanmin(abs_vals), props, '')

In [65]:
agg_metrics = pd.DataFrame(metric_record)
# Display formats: four decimals for the error metrics, percent sign for the bias
metric_formats = {
    "MAE": "{:.4f}",
    "MSE": "{:.4f}",
    "meanMASE": "{:.4f}",
    "Forecast Bias": "{:.2f}%",
}
styled_metrics = agg_metrics.style.format(metric_formats)
# Lowest value is highlighted for the error metrics
styled_metrics = styled_metrics.highlight_min(
    color="lightgreen", subset=["MAE", "MSE", "meanMASE"]
)
# The bias column uses the custom rule defined in highlight_abs_min instead
styled_metrics.apply(
    highlight_abs_min,
    props="color:black;background-color:lightgreen",
    axis=0,
    subset=["Forecast Bias"],
)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.075,0.0268,0.914,2.61%,
1,GFM Baseline,0.0773,0.0277,0.959,0.72%,47.65627
2,GFM+Meta (CountEncoder),0.0772,0.0277,0.9607,0.70%,113.3886
3,GFM+Meta (TargetEncoder),0.0773,0.0276,0.9594,1.04%,98.317032
4,GFM+Meta (NativeLGBM),0.0771,0.0275,0.9569,0.72%,56.116124
5,Tuned GFM+Meta,0.0708,0.0302,0.8541,-12.21%,88.727535
6,Tuned GFM+Meta+Random Part,0.0713,0.0337,0.8514,-13.07%,91.167622
7,Tuned GFM+Meta+ACORN Part,0.0699,0.0299,0.8425,-12.25%,94.760892
8,Tuned GFM+Meta+Clustered Part,0.0697,0.0295,0.84,-12.36%,118.986002


In [66]:
# Peek at the first rows of the frame that collects the forecasts joined in above
pred_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,energy_consumption,GFM Baseline,GFM+Meta (CountEncoder),GFM+Meta (TargetEncoder),GFM+Meta (NativeLGBM),Tuned GFM+Meta,Tuned GFM+Meta+ACORN Part,Tuned GFM+Meta+Clustered Part
LCLid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MAC000061,2014-02-01 00:00:00,0.066,0.052927,0.056334,0.05675,0.054677,0.05114,0.058832,0.049731
MAC000061,2014-02-01 00:30:00,0.063,0.052804,0.055773,0.054117,0.050626,0.046351,0.045576,0.057119
MAC000061,2014-02-01 01:00:00,0.04,0.04705,0.050268,0.053103,0.050363,0.049647,0.055795,0.052451
MAC000061,2014-02-01 01:30:00,0.02,0.039569,0.040256,0.041121,0.039855,0.035472,0.0382,0.042973
MAC000061,2014-02-01 02:00:00,0.018,0.036816,0.036809,0.034412,0.035293,0.023806,0.014886,0.018486


In [67]:
# List the model names under which evaluation tables were stored
individual_metrics.keys()

dict_keys(['GFM Baseline', 'GFM+Meta (CountEncoder)', 'GFM+Meta  (TargetEncoder)', 'GFM+Meta  (NativeLGBM)', 'Tuned GFM+Meta', 'Tuned GFM+Meta+Random Part', 'Tuned GFM+Meta+ACORN Part', 'Tuned GFM+Meta+Clustered Part'])

# Saving the GFM Forecasts and Metrics

In [68]:
# Build the output location once and make sure the folder exists
output = Path("data/london_smart_meters/output")
output.mkdir(parents=True, exist_ok=True)

In [69]:
# Persist the forecasts, the per-model evaluation tables and the aggregate summary
predictions_file = output / "gfm_predictions_test_df.pkl"
metrics_file = output / "gfm_metrics_test_df.pkl"
aggregate_file = output / "gfm_aggregate_metrics_test.pkl"

pred_df.to_pickle(predictions_file)
# individual_metrics is a plain dict, hence joblib rather than a DataFrame method
joblib.dump(individual_metrics, metrics_file)
agg_metrics.to_pickle(aggregate_file)