In [1]:
# Move two directories up so that the `src.*` imports and the relative
# `data/` and `imgs/` paths used below resolve from the working directory.
%cd ../..

c:\Users\tacke\OneDrive\Documents\GitHub\Modern-Time-Series-Forecasting-with-Python-2E-1


In [2]:
import os
import random
import time

# Seed Python's built-in RNG for repeatability.
random.seed(42)
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Use the white Plotly template for every figure created in this notebook.
pio.templates.default = "plotly_white"

import copy
import warnings
from pathlib import Path

import humanize
from sklearn.preprocessing import StandardScaler
from src.forecasting.ml_forecasting import (
    FeatureConfig,
    MissingValueConfig,
    MLForecast,
    ModelConfig,
    calculate_metrics,
)
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.ts_utils import metrics_adapter, forecast_bias,mae, mase, mse
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML
# Uncomment the two magics below to pick up edits to `src` without restarting the kernel.
# %load_ext autoreload
# %autoreload 2
# Seed NumPy's legacy global RNG for repeatability.
np.random.seed(42)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

In [3]:
# Folder that receives the figures exported by this notebook.
Path("imgs/chapter_10").mkdir(parents=True, exist_ok=True)

# Input (preprocessed) and output folders of the London Smart Meters dataset.
_dataset_root = Path("data") / "london_smart_meters"
preprocessed = _dataset_root / "preprocessed"
output = _dataset_root / "output"

In [4]:
from itertools import cycle
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's standard layout to a Plotly figure.

    Args:
        fig: Figure to style; its layout (and optionally its trace names) is
            updated on the object that is passed in.
        legends: Optional iterable of trace names. Names are assigned to the
            traces in order and cycled when there are more traces than names.
        xlabel: X-axis title.
        ylabel: Y-axis title.
        title: Figure title, centred horizontally.
        font_size: Font size used for legend entries, axis titles and tick labels.

    Returns:
        The ``fig`` object that was passed in, so calls can be nested/chained.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # The title font is nested under `title` instead of using the deprecated
        # `titlefont` attribute, which newer Plotly releases no longer accept.
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.9,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [5]:
# Missing-value-imputed, feature-engineered train and validation splits.
# (The held-out test file is "block_0-7_test_missing_imputed_feature_engg.parquet".)
_train_file = preprocessed / "selected_blocks_train_missing_imputed_feature_engg.parquet"
_val_file = preprocessed / "selected_blocks_val_missing_imputed_feature_engg.parquet"
try:
    train_df = pd.read_parquet(_train_file)
    # The validation split is loaded as `test_df` because it is what we predict on here.
    test_df = pd.read_parquet(_val_file)
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """))

### Loading the single step backtesting baselines for validation

In [6]:
# Aggregate metrics of the single-step baselines on the validation split.
# The pickle is expected to come from this project's own Chapter 8 notebook.
_baseline_metrics_file = output / "ml_single_step_aggregate_metrics_auto_stationary_val.pkl"
try:
    baseline_aggregate_metrics_df = pd.read_pickle(_baseline_metrics_file)
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02-Forecasting with Target Transformation.ipynb in Chapter08
    </div>
    """))

In [7]:
# Number of distinct LCLid values (i.e. time series) in the training data.
len(train_df["LCLid"].unique())

150

# Feature Definition

In [8]:
# Building blocks of the feature lists. The names are generated from their
# naming patterns; the concatenation order below reproduces the column order
# of the hand-written lists exactly.
_exogenous_numeric = [
    "visibility",
    "windBearing",
    "temperature",
    "dewPoint",
    "pressure",
    "apparentTemperature",
    "windSpeed",
    "humidity",
]
_exogenous_categorical = ["holidays", "precipType", "icon", "summary"]

# Lags around the previous steps, the previous day (48) and the previous week (336).
_lag_steps = [*range(1, 6), *range(46, 51), *range(334, 339)]
_lag_features = [f"energy_consumption_lag_{step}" for step in _lag_steps]

_rolling_features = [
    f"energy_consumption_rolling_{window}_{stat}"
    for window in (3, 6, 12, 48)
    for stat in ("mean", "std")
]
_seasonal_rolling_features = [
    f"energy_consumption_{period}_seasonal_rolling_3_{stat}"
    for period in (48, 336)
    for stat in ("mean", "std")
]
_ewma_features = [f"energy_consumption_ewma_span_{span}" for span in (2880, 336, 48)]

# Five sine terms followed by five cosine terms for each calendar unit.
_fourier_features = [
    f"timestamp_{unit}_{wave}_{order}"
    for unit in ("Month", "Hour", "Minute")
    for wave in ("sin", "cos")
    for order in range(1, 6)
]

feat_config = FeatureConfig(
    date="timestamp",
    target="energy_consumption",
    continuous_features=[
        *_exogenous_numeric,
        *_lag_features,
        *_rolling_features,
        *_seasonal_rolling_features,
        *_ewma_features,
        "timestamp_Elapsed",
        *_fourier_features,
    ],
    categorical_features=[
        *_exogenous_categorical,
        "timestamp_Month",
        "timestamp_Quarter",
        "timestamp_WeekDay",
        "timestamp_Dayofweek",
        "timestamp_Dayofyear",
        "timestamp_Hour",
        "timestamp_Minute",
    ],
    boolean_features=[
        f"timestamp_Is_{flag}"
        for flag in ("quarter_end", "quarter_start", "year_end", "year_start", "month_start")
    ],
    index_cols=["LCLid", "timestamp"],
    exogenous_features=[*_exogenous_categorical, *_exogenous_numeric],
)


# Missing Value Handling

In [9]:
# Columns derived from past target values (lags, rolling and seasonal-rolling
# statistics, EWMAs) are the ones configured for back-filling.
# The EWMA names are spelled with a single underscore ("ewma_span") so that they
# match the column names declared in `feat_config`; the previous "ewma__span"
# spelling did not correspond to any declared feature.
missing_value_config = MissingValueConfig(
    bfill_columns=[
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        "energy_consumption_ewma_span_2880",
        "energy_consumption_ewma_span_336",
        "energy_consumption_ewma_span_48",
    ],
    ffill_columns=[],
    zero_fill_columns=[],
)

# Training Global ML Model

In [10]:
from src.forecasting.ml_forecasting import calculate_metrics
from src.utils import ts_utils

In [11]:
def train_model(
    model_config,
    feature_config,
    missing_config,
    train_features,
    train_target,
    test_features,
    fit_kwargs=None,
):
    """Fit an ``MLForecast`` model and predict on the test features.

    Args:
        model_config: ``ModelConfig`` describing the estimator and preprocessing.
        feature_config: ``FeatureConfig`` describing the feature columns.
        missing_config: ``MissingValueConfig`` passed through to ``MLForecast``.
        train_features: Features used for fitting.
        train_target: Target used for fitting.
        test_features: Features to predict on.
        fit_kwargs: Optional dict forwarded to ``MLForecast.fit`` as
            ``fit_kwargs``. ``None`` (the default) is treated as an empty dict.

    Returns:
        Tuple ``(y_pred, feat_df)``: the predictions for ``test_features`` and
        the result of ``MLForecast.feature_importance()``.
    """
    # Use a fresh dict per call instead of a shared mutable default argument.
    if fit_kwargs is None:
        fit_kwargs = {}
    ml_model = MLForecast(
        model_config=model_config,
        feature_config=feature_config,
        missing_config=missing_config,
    )
    ml_model.fit(train_features, train_target, fit_kwargs=fit_kwargs)
    y_pred = ml_model.predict(test_features)
    feat_df = ml_model.feature_importance()
    return y_pred, feat_df

def evaluate_forecast(y_pred, test_target, train_target, model_config):
    """Score a forecast per series and in aggregate.

    For every id found in the first index level of ``test_target`` (which must
    be categorical, since unused categories are dropped first),
    ``calculate_metrics`` is called with that series' actuals, predictions and
    training history.

    Args:
        y_pred: Predictions, indexed like ``test_target``.
        test_target: Actual values; must contain an ``energy_consumption`` column.
        train_target: Training target, used as per-series history.
        model_config: Its ``name`` labels the metrics.

    Returns:
        Tuple ``(agg_metrics, eval_metrics_df)``: a dict with the overall MAE,
        MSE, forecast bias and the mean of the per-series MASE, and a DataFrame
        with one row of metrics per series.
    """
    algo_name = model_config.name
    series_ids = test_target.index.get_level_values(0).remove_unused_categories().categories
    per_series_metrics = [
        calculate_metrics(
            test_target.xs(series_id),
            y_pred.xs(series_id),
            name=algo_name,
            y_train=train_target.xs(series_id),
        )
        for series_id in tqdm(series_ids, desc="Calculating metrics...")
    ]
    eval_metrics_df = pd.DataFrame(per_series_metrics)

    actuals = test_target['energy_consumption']
    agg_metrics = dict()
    agg_metrics["Algorithm"] = algo_name
    agg_metrics["MAE"] = ts_utils.mae(actuals, y_pred)
    agg_metrics["MSE"] = ts_utils.mse(actuals, y_pred)
    agg_metrics["meanMASE"] = eval_metrics_df.loc[:, "MASE"].mean()
    agg_metrics["Forecast Bias"] = ts_utils.forecast_bias_aggregate(actuals, y_pred)
    return agg_metrics, eval_metrics_df

In [12]:
# NOTE(review): this empty list is replaced by the assignment a few lines below.
metric_record = []
# Per-series metric DataFrames, keyed by model name.
individual_metrics = dict()

# Start the comparison table with one row of the single-step baseline metrics.
# NOTE(review): the row is picked by position (4); the tables below show it as
# the "LightGBM" baseline - confirm the position if the baseline file changes.
metric_record = (
    baseline_aggregate_metrics_df.iloc[[4]]
    .to_dict(orient="records")
)

## Baseline

In [13]:
# Work on a copy so the shared `feat_config` stays untouched.
_feat_config = copy.deepcopy(feat_config)

# The baseline requests the split with categorical=False and exogenous=False
# (presumably leaving those feature groups out - see FeatureConfig.get_X_y).
_baseline_xy_options = {"categorical": False, "exogenous": False}

train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, **_baseline_xy_options
)
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, **_baseline_xy_options
)

# Starts as a copy of the actuals; each model's predictions are joined onto it later.
pred_df = test_target.copy()

In [14]:
from lightgbm import LGBMRegressor

# Untuned LightGBM global forecasting model.
model_config = ModelConfig(
    name="GFM Baseline",
    model=LGBMRegressor(random_state=42),
    # LightGBM is not sensitive to feature scaling, so normalisation is skipped.
    normalize=False,
    # LightGBM can handle missing values itself, so imputation is skipped.
    fill_missing=False,
)

In [15]:
# Only fitting and prediction are timed; metric calculation happens afterwards.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config=model_config,
        feature_config=_feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
    )
agg_metrics, eval_metrics_df = evaluate_forecast(
    y_pred=y_pred,
    test_target=test_target,
    train_target=train_target,
    model_config=model_config,
)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-series metrics and the predictions.
individual_metrics[model_config.name] = eval_metrics_df
metric_record.append(agg_metrics)
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.002769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8119
[LightGBM] [Info] Number of data points in the train set: 4293840, number of used features: 61
[LightGBM] [Info] Start training from score 0.194160
Time Elapsed: 39 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [16]:
# Show the metrics recorded so far as a comparison table.
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276


In [17]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/baseline_fimp.png")
fig.show()

## With Metadata

In [18]:
# Rebuild the feature configuration with the household metadata columns added
# as categorical features. The deep copy keeps `feat_config` itself unchanged.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# `feature_list` is dropped before re-creating the config
# (presumably it is derived by FeatureConfig itself - TODO confirm).
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "Acorn_grouped", "LCLid"]
_feat_config = FeatureConfig(**feat_conf_dict)

train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)
# Loading the Validation as test
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)

# Categorical columns that are actually present in the feature frame, kept as a
# list in column order. A set has no stable iteration order between runs, while
# the encoders' `cols` and LightGBM's `categorical_feature` are documented as lists.
_categorical_names = set(_feat_config.categorical_features)
cat_features = [col for col in train_features.columns if col in _categorical_names]

### CountEncoder

In [19]:
from category_encoders import CountEncoder
from lightgbm import LGBMRegressor

# Encoder applied to the categorical columns before they reach the model.
cat_encoder = CountEncoder(cols=cat_features)

model_config = ModelConfig(
    name="GFM+Meta (CountEncoder)",
    model=LGBMRegressor(random_state=42),
    # LightGBM is not sensitive to feature scaling, so normalisation is skipped.
    normalize=False,
    # LightGBM can handle missing values itself, so imputation is skipped.
    fill_missing=False,
    # Let the forecasting wrapper encode the categoricals with `cat_encoder`.
    encode_categorical=True,
    categorical_encoder=cat_encoder,
)

In [20]:
# Display the feature configuration that now includes the metadata columns.
_feat_config

FeatureConfig(date='timestamp', target='energy_consumption', original_target='energy_consumption', continuous_features=['visibility', 'windBearing', 'temperature', 'dewPoint', 'pressure', 'apparentTemperature', 'windSpeed', 'humidity', 'energy_consumption_lag_1', 'energy_consumption_lag_2', 'energy_consumption_lag_3', 'energy_consumption_lag_4', 'energy_consumption_lag_5', 'energy_consumption_lag_46', 'energy_consumption_lag_47', 'energy_consumption_lag_48', 'energy_consumption_lag_49', 'energy_consumption_lag_50', 'energy_consumption_lag_334', 'energy_consumption_lag_335', 'energy_consumption_lag_336', 'energy_consumption_lag_337', 'energy_consumption_lag_338', 'energy_consumption_rolling_3_mean', 'energy_consumption_rolling_3_std', 'energy_consumption_rolling_6_mean', 'energy_consumption_rolling_6_std', 'energy_consumption_rolling_12_mean', 'energy_consumption_rolling_12_std', 'energy_consumption_rolling_48_mean', 'energy_consumption_rolling_48_std', 'energy_consumption_48_seasonal_r

In [21]:
# Only fitting and prediction are timed; metric calculation happens afterwards.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config=model_config,
        feature_config=_feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
    )
agg_metrics, eval_metrics_df = evaluate_forecast(
    y_pred=y_pred,
    test_target=test_target,
    train_target=train_target,
    model_config=model_config,
)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-series metrics and the predictions.
individual_metrics[model_config.name] = eval_metrics_df
metric_record.append(agg_metrics)
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.299152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8374
[LightGBM] [Info] Number of data points in the train set: 4293840, number of used features: 77
[LightGBM] [Info] Start training from score 0.194160
Time Elapsed: 1 minute and 52 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [22]:
# Show the metrics recorded so far, now including the CountEncoder run.
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089


In [23]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/baseline_w_meta_cnt_encoder_fimp.png")
fig.show()

### Target Encoding

In [24]:
from category_encoders import TargetEncoder

In [25]:
from category_encoders import TargetEncoder
from lightgbm import LGBMRegressor

# Encoder applied to the categorical columns before they reach the model.
cat_encoder = TargetEncoder(cols=cat_features)

model_config = ModelConfig(
    name="GFM+Meta  (TargetEncoder)",
    model=LGBMRegressor(random_state=42),
    # LightGBM is not sensitive to feature scaling, so normalisation is skipped.
    normalize=False,
    # LightGBM can handle missing values itself, so imputation is skipped.
    fill_missing=False,
    # Let the forecasting wrapper encode the categoricals with `cat_encoder`.
    encode_categorical=True,
    categorical_encoder=cat_encoder,
)

In [26]:
# Only fitting and prediction are timed; metric calculation happens afterwards.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config=model_config,
        feature_config=_feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
    )
agg_metrics, eval_metrics_df = evaluate_forecast(
    y_pred=y_pred,
    test_target=test_target,
    train_target=train_target,
    model_config=model_config,
)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-series metrics and the predictions.
individual_metrics[model_config.name] = eval_metrics_df
metric_record.append(agg_metrics)
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.835759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8609
[LightGBM] [Info] Number of data points in the train set: 4293840, number of used features: 77
[LightGBM] [Info] Start training from score 0.194160
Time Elapsed: 1 minute and 42 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [27]:
# Show the metrics recorded so far, now including the TargetEncoder run.
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089
3,GFM+Meta (TargetEncoder),0.079553,0.027241,1.011387,0.299803,102.253439


In [28]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/baseline_w_meta_tgt_encoder_fimp.png")
fig.show()

### Native LightGBM Encoding

In [29]:
from lightgbm import LGBMRegressor

model_config = ModelConfig(
    name="GFM+Meta  (NativeLGBM)",
    model=LGBMRegressor(random_state=42),
    # LightGBM is not sensitive to feature scaling, so normalisation is skipped.
    normalize=False,
    # LightGBM can handle missing values itself, so imputation is skipped.
    fill_missing=False,
    # No external encoder: LightGBM's built-in categorical handling is used instead.
    encode_categorical=False,
)

In [30]:
# Only fitting and prediction are timed; metric calculation happens afterwards.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config=model_config,
        feature_config=_feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        # Tell LightGBM which columns to treat as categorical.
        fit_kwargs={"categorical_feature": cat_features},
    )
agg_metrics, eval_metrics_df = evaluate_forecast(
    y_pred=y_pred,
    test_target=test_target,
    train_target=train_target,
    model_config=model_config,
)
agg_metrics["Time Elapsed"] = timer.elapsed
# Record the aggregate row, the per-series metrics and the predictions.
individual_metrics[model_config.name] = eval_metrics_df
metric_record.append(agg_metrics)
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.620726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8600
[LightGBM] [Info] Number of data points in the train set: 4293840, number of used features: 77
[LightGBM] [Info] Start training from score 0.194160
Time Elapsed: 1 minute and 6 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [31]:
# Show the metrics recorded so far, now including the native LightGBM run.
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089
3,GFM+Meta (TargetEncoder),0.079553,0.027241,1.011387,0.299803,102.253439
4,GFM+Meta (NativeLGBM),0.079338,0.027199,1.011089,0.047789,66.527437


In [32]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/baseline_w_meta_native_lgbm_fimp.png")
fig.show()

## Hyperparameter Tuning

In [33]:
# Rebuild the metadata-augmented features so the tuning section can be run
# without the encoder experiments above. The deep copy keeps `feat_config` unchanged.
feat_conf_dict = copy.deepcopy(feat_config.__dict__)
# `feature_list` is dropped before re-creating the config
# (presumably it is derived by FeatureConfig itself - TODO confirm).
feat_conf_dict.pop("feature_list")
feat_conf_dict['categorical_features']+=["stdorToU", "Acorn", "Acorn_grouped", "LCLid"]
_feat_config = FeatureConfig(**feat_conf_dict)

train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)
# Loading the Validation as test
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)

# Categorical columns that are actually present in the feature frame, kept as a
# list in column order. A set has no stable iteration order between runs, while
# LightGBM's `categorical_feature` is documented as a list.
_categorical_names = set(_feat_config.categorical_features)
cat_features = [col for col in train_features.columns if col in _categorical_names]

<div class="alert alert-block alert-warning"><b>Important: </b> Run the following cells for Hyperparameter Tuning only if needed. <br/>The best parameters are hardcoded in the section after the tuning, so these cells need not be run every time (tuning takes a long time).<br/> Otherwise, skip to <a href="#using_tuned_parameters"><b>Using the tuned parameters</b></a>.</div>

### Grid Search

In [34]:
from sklearn.model_selection import ParameterGrid

# Every combination of the listed values is evaluated.
grid_params = dict(
    num_leaves=[16, 31, 63],
    objective=["regression", "regression_l1", "huber"],
    random_state=[42],
    colsample_bytree=[0.5, 0.8, 1.0],
)
parameter_space = list(ParameterGrid(grid_params))

In [35]:
# GridSearchCV with a PredefinedSplit could run this search faster through
# multi-processing, or the loop could be parallelised by hand; here every
# combination is evaluated sequentially.
scores = []
for p in tqdm(parameter_space, desc="Performing Grid Search"):
    _model_config = ModelConfig(
        name="Global Meta LightGBM Tuning",
        model=LGBMRegressor(verbose=-1, **p),
        # LightGBM is not sensitive to feature scaling, so normalisation is skipped.
        normalize=False,
        # LightGBM can handle missing values itself, so imputation is skipped.
        fill_missing=False,
    )
    y_pred, feat_df = train_model(
        model_config=_model_config,
        feature_config=_feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        fit_kwargs={"categorical_feature": cat_features},
    )
    # Each candidate is scored by its MAE on the validation split.
    scores.append(ts_utils.mae(test_target['energy_consumption'], y_pred))
    

Performing Grid Search:   0%|          | 0/27 [00:00<?, ?it/s]

In [36]:
# One row per evaluated combination, best (lowest MAE) first.
grid_search_trials = pd.DataFrame(
    {"params": parameter_space, "score": scores}
).sort_values("score")
best_params_gs = grid_search_trials["params"].iloc[0]
best_score_gs = grid_search_trials["score"].iloc[0]
grid_search_trials.head()

Unnamed: 0,params,score
16,"{'colsample_bytree': 0.8, 'num_leaves': 63, 'o...",0.074434
25,"{'colsample_bytree': 1.0, 'num_leaves': 63, 'o...",0.074708
7,"{'colsample_bytree': 0.5, 'num_leaves': 63, 'o...",0.074765
13,"{'colsample_bytree': 0.8, 'num_leaves': 31, 'o...",0.075569
22,"{'colsample_bytree': 1.0, 'num_leaves': 31, 'o...",0.075706


### Random Search

In [37]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded.
import scipy.stats
from sklearn.model_selection import ParameterSampler

random_search_params = {
    # Uniformly distributed integers from 10 to 99 (the upper bound of randint is exclusive)
    "num_leaves": scipy.stats.randint(10,100),
    # A list of categorical string values
    "objective": ["regression", "regression_l1", "huber"],
    "random_state": [42],
    # Floating point numbers starting at 0.3 in steps of 0.05 (np.arange excludes the stop value 1.0)
    "colsample_bytree": np.arange(0.3,1.0,0.05),
    # Floating point numbers starting at 0 in steps of 0.1 (np.arange excludes the stop value 10)
    "lambda_l1":np.arange(0,10,0.1),
    # Floating point numbers starting at 0 in steps of 0.1 (np.arange excludes the stop value 10)
    "lambda_l2":np.arange(0,10,0.1)
}
# Draw `n_iter` parameter combinations from the search space (seeded for repeatability)
parameter_space = list(ParameterSampler(random_search_params, n_iter=27, random_state=42))

In [38]:
# RandomizedSearchCV with a PredefinedSplit could run this search faster through
# multi-processing, or the loop could be parallelised by hand; here every
# sampled combination is evaluated sequentially.
scores = []
for p in tqdm(parameter_space, desc="Performing Random Search"):
    _model_config = ModelConfig(
        name="Global Meta LightGBM Tuning",
        model=LGBMRegressor(verbose=-1, **p),
        # LightGBM is not sensitive to feature scaling, so normalisation is skipped.
        normalize=False,
        # LightGBM can handle missing values itself, so imputation is skipped.
        fill_missing=False,
    )
    y_pred, feat_df = train_model(
        model_config=_model_config,
        feature_config=_feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        fit_kwargs={"categorical_feature": cat_features},
    )
    # Each candidate is scored by its MAE on the validation split.
    scores.append(ts_utils.mae(test_target['energy_consumption'], y_pred))
    

Performing Random Search:   0%|          | 0/27 [00:00<?, ?it/s]

In [39]:
# One row per sampled combination, best (lowest MAE) first.
random_search_trials = pd.DataFrame(
    {"params": parameter_space, "score": scores}
).sort_values("score")
best_params_rs = random_search_trials["params"].iloc[0]
best_score_rs = random_search_trials["score"].iloc[0]
random_search_trials.head()

Unnamed: 0,params,score
3,"{'colsample_bytree': 0.5499999999999999, 'lamb...",0.074283
20,"{'colsample_bytree': 0.8999999999999999, 'lamb...",0.075432
11,"{'colsample_bytree': 0.35, 'lambda_l1': 0.3000...",0.075581
12,"{'colsample_bytree': 0.35, 'lambda_l1': 0.8, '...",0.07564
17,"{'colsample_bytree': 0.35, 'lambda_l1': 8.9, '...",0.07609


### Bayesian Optimization

In [40]:
import optuna

In [41]:
# Define an objective function which takes in a trial as a parameter
# and evaluates the model with the generated params
def objective(trial):
    """Train LightGBM with hyperparameters sampled from ``trial``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The MAE of the predictions on ``test_target["energy_consumption"]``;
        this is the value the study optimises.
    """
    params = {
        # Sample an integer between 10 and 100
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
        # Sample a categorical value from the list provided
        "objective": trial.suggest_categorical(
            "objective", ["regression", "regression_l1", "huber"]
        ),
        # Fixed seed. It must be a plain int here: the list form is only needed
        # by the grid/sampler definitions, not by the estimator itself.
        "random_state": 42,
        # Sample from a uniform distribution between 0.3 and 1.0
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        # Sample from a uniform distribution between 0 and 10
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 10),
        # Sample from a uniform distribution between 0 and 10
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 10),
    }
    _model_config = ModelConfig(
        # Use the sampled params to initialize the model
        model=LGBMRegressor(**params, verbose=-1),
        name="Global Meta LightGBM Tuning",
        # LGBM is not sensitive to normalized data
        normalize=False,
        # LGBM can handle missing values
        fill_missing=False,
    )
    # The feature importances returned alongside the predictions are not needed here.
    y_pred, _ = train_model(
        _model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        fit_kwargs=dict(categorical_feature=cat_features),
    )
    # Return the MAE metric as the value
    return ts_utils.mae(test_target["energy_consumption"], y_pred)

In [42]:
# Seeded TPE sampler for repeatability. Only 5 start-up trials are used
# because our total number of trials is low.
sampler = optuna.samplers.TPESampler(seed=42, n_startup_trials=5)
# Study that minimises the value returned by `objective`.
study = optuna.create_study(sampler=sampler, direction="minimize")
# Run the optimization.
study.optimize(objective, show_progress_bar=True, n_trials=27)

[I 2024-10-26 08:01:58,515] A new study created in memory with name: no-name-78e3483a-0be9-43fe-ba14-2ef745b277f0


  0%|          | 0/27 [00:00<?, ?it/s]

[I 2024-10-26 08:02:41,912] Trial 0 finished with value: 0.07918144764005784 and parameters: {'num_leaves': 44, 'objective': 'regression', 'colsample_bytree': 0.40921304830970556, 'lambda_l1': 1.5599452033620265, 'lambda_l2': 0.5808361216819946}. Best is trial 0 with value: 0.07918144764005784.
[I 2024-10-26 08:04:10,903] Trial 1 finished with value: 0.07403915567918215 and parameters: {'num_leaves': 88, 'objective': 'regression_l1', 'colsample_bytree': 0.978936896513396, 'lambda_l1': 8.324426408004218, 'lambda_l2': 2.1233911067827616}. Best is trial 1 with value: 0.07403915567918215.
[I 2024-10-26 08:04:51,586] Trial 2 finished with value: 0.07938443582566357 and parameters: {'num_leaves': 26, 'objective': 'huber', 'colsample_bytree': 0.602361513049481, 'lambda_l1': 2.9122914019804194, 'lambda_l2': 6.118528947223795}. Best is trial 1 with value: 0.07403915567918215.
[I 2024-10-26 08:05:38,137] Trial 3 finished with value: 0.07954560123687912 and parameters: {'num_leaves': 22, 'objecti

In [43]:
# Collect every trial of the study in a DataFrame and keep the best
# parameters and objective value for the comparison further below.
bo_search_trials = study.trials_dataframe()
best_params_bo = study.best_params
best_score_bo = study.best_value
# Show the trials with the lowest objective value first.
bo_search_trials.sort_values("value").head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_lambda_l1,params_lambda_l2,params_num_leaves,params_objective,state
12,12,0.073877,2024-10-26 08:14:47.622270,2024-10-26 08:16:12.494486,0 days 00:01:24.872216,0.858214,7.744169,1.867196,100,regression_l1,COMPLETE
11,11,0.073903,2024-10-26 08:13:19.146745,2024-10-26 08:14:47.620269,0 days 00:01:28.473524,0.947029,7.719985,0.550396,100,regression_l1,COMPLETE
22,22,0.073993,2024-10-26 08:26:22.254759,2024-10-26 08:27:49.128422,0 days 00:01:26.873663,0.982502,8.840504,0.819291,89,regression_l1,COMPLETE
26,26,0.073996,2024-10-26 08:31:47.298719,2024-10-26 08:33:17.459890,0 days 00:01:30.161171,0.996273,5.991646,1.408906,96,regression_l1,COMPLETE
21,21,0.074028,2024-10-26 08:24:50.184739,2024-10-26 08:26:22.250753,0 days 00:01:32.066014,0.987354,8.3872,2.18266,89,regression_l1,COMPLETE


In [44]:
# Plot Optuna's hyperparameter-importance chart for the finished study.
fig = optuna.visualization.plot_param_importances(study)
fig.show()

### Hyperparameter Tuning Techniques (Comparison)

In [45]:
def plot_optimization_history(trials_df):
    """Plot the objective value of every trial and the running best value.

    Args:
        trials_df: DataFrame with a ``score`` column whose index gives the
            order in which the trials were run.

    Returns:
        A Plotly figure with the per-trial scores as markers ("Objective")
        and the running minimum as a line ("Best Value").
    """
    # Restore execution order (sort_index returns a new frame), then track the
    # best score seen up to each trial.
    ordered = trials_df.sort_index()
    ordered['best'] = ordered.score.expanding().min()
    trial_no = ordered.reset_index().index

    objective_trace = go.Scatter(
        x=trial_no, y=ordered.score, mode='markers', name="Objective"
    )
    best_trace = go.Scatter(
        x=trial_no, y=ordered.best, mode='lines', name="Best Value"
    )

    fig = go.Figure(layout=dict(title="Optimization History Plot"))
    fig.add_trace(objective_trace)
    fig.add_trace(best_trace)
    return fig

In [46]:
# Optimization history of the grid search.
fig = format_plot(
    plot_optimization_history(grid_search_trials),
    xlabel="# Trials",
    ylabel="Objective Value",
    title="Optimization History Plot (Grid Search)",
)
fig.write_image("imgs/chapter_10/opt_history_gs.png")
fig.show()

In [47]:
# Optimization history of the random search.
fig = format_plot(
    plot_optimization_history(random_search_trials),
    xlabel="# Trials",
    ylabel="Objective Value",
    title="Optimization History Plot (Random Search)",
)
fig.write_image("imgs/chapter_10/opt_history_rs.png")
fig.show()

In [48]:
# Optimization history of the Bayesian optimization, drawn by Optuna itself.
fig = format_plot(
    optuna.visualization.plot_optimization_history(study),
    xlabel="# Trials",
    ylabel="Objective Value",
    title="Optimization History Plot (Bayesian Optimization)",
)
fig.write_image("imgs/chapter_10/opt_history_bo.png")
fig.show()

This makes the distinction clearer. Grid Search shows a pattern which indicates that it simply executes the defined trials blindly, in the order they were given. Even after seeing that a particular region is of low value, it still executes the trials in that region, because it does not learn from earlier results. Random Search behaves pretty much as we expected: the objective values are randomly distributed, and even towards the end it is still exploring low-value regions of the search space. Bayesian Optimization, on the other hand, can be seen to improve the objective value over time and to focus most of its trials on the regions which give the best objective values.

In [49]:
# Best objective value reached by each tuning technique, lowest (best) first.
_best_scores = {
    "Grid Search": best_score_gs,
    "Random Search": best_score_rs,
    "Bayesian Optimization": best_score_bo,
}
plot_df = pd.DataFrame(
    {
        "Optimization": list(_best_scores.keys()),
        "Best Score": list(_best_scores.values()),
    }
).sort_values("Best Score")
plot_df

Unnamed: 0,Optimization,Best Score
2,Bayesian Optimization,0.073877
1,Random Search,0.074283
0,Grid Search,0.074434


In [50]:
# Stack the trial scores of the three techniques into one long frame with the
# columns `score` and `optimization`.
plot_df = grid_search_trials.drop(columns="params").assign(optimization="Grid Search")

df_ = random_search_trials.drop(columns="params").assign(optimization="Random Search")
plot_df = pd.concat([plot_df, df_])

# Optuna names the objective column `value`; align it with the other two frames.
df_ = (
    bo_search_trials
    .rename(columns={"value": "score"})
    .assign(optimization="Bayesian Optimization")
    [["score", "optimization"]]
)
plot_df = pd.concat([plot_df, df_])

# One violin per technique showing the distribution of evaluated objective values.
fig = format_plot(
    px.violin(plot_df, y="score", color="optimization",  points=False),
    xlabel="Optimization Techniques",
    ylabel="Objective Value",
    title="Objective Function Evaluation of Different Optimization Techniques",
)
fig.write_image("imgs/chapter_10/opt_violin.png")
fig.show()

We can see that Bayesian Optimization has a fat tail on the lower side, indicating that it spent most of its computational budget evaluating and exploiting the optimal regions of the search space.

<a id="using_tuned_parameters"></a>
### Using the tuned parameters

In [51]:
# best_params = study.best_params
# best_params['random_state'] = 42
# best_params

In [52]:
# Hard-coded tuned hyperparameters, so the tuning cells above need not be re-run.
# NOTE(review): these values differ from the best trial shown in the study
# output above (num_leaves=100, colsample_bytree~0.858) - presumably they come
# from an earlier tuning run; confirm before relying on them.
best_params = {
    "num_leaves": 99,
    "objective": "regression_l1",
    "colsample_bytree": 0.9786759775515064,
    "lambda_l1": 8.160098582954642,
    "lambda_l2": 0.17840888757497253,
    "random_state": 42,
}

In [53]:
from lightgbm import LGBMRegressor
# Model configuration for the tuned global model with metadata features.
model_config = ModelConfig(
    name="Tuned GFM+Meta",
    model=LGBMRegressor(**best_params),
    # No normalization: LGBM is not sensitive to the scale of its inputs
    normalize=False,
    # No imputation: LGBM can handle missing values itself
    fill_missing=False,
)

In [54]:
# Train the tuned model and obtain its predictions plus a feature-importance
# frame; LogTime measures how long the `train_model` call takes.
# NOTE(review): `_feat_config`, `missing_value_config`, `cat_features` and the
# train/test feature frames must already exist from earlier cells.
with LogTime() as timer:
    y_pred, feat_df = train_model(
        model_config,
        _feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        fit_kwargs=dict(categorical_feature=cat_features)
    )
# Score the forecast, then log the aggregate row together with the elapsed time.
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = timer.elapsed
metric_record.append(agg_metrics)
# Keep the second evaluation output (presumably per-household metrics) by model name.
individual_metrics[model_config.name]=eval_metrics_df
# Add this model's predictions to the shared prediction frame.
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.708362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8600
[LightGBM] [Info] Number of data points in the train set: 4293840, number of used features: 77
[LightGBM] [Info] Start training from score 0.111000
Time Elapsed: 1 minute and 34 seconds


Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [55]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089
3,GFM+Meta (TargetEncoder),0.079553,0.027241,1.011387,0.299803,102.253439
4,GFM+Meta (NativeLGBM),0.079338,0.027199,1.011089,0.047789,66.527437
5,Tuned GFM+Meta,0.074073,0.030878,0.91689,-12.17957,94.305064


In [56]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = px.bar(feat_df.iloc[:15], x="feature", y="importance")
format_plot(
    fig,
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/tuned_meta_fimp.png")
fig.show()

## Partitioning

In [57]:
# Tuned LightGBM hyperparameters, declared again at the start of the
# partitioning section with the same values as in the tuning section.
best_params = dict(
    num_leaves=99,
    objective="regression_l1",
    colsample_bytree=0.9786759775515064,
    lambda_l1=8.160098582954642,
    lambda_l2=0.17840888757497253,
    random_state=42,
)

### Random

We can partition Randomly

In [58]:
# Clone the base feature configuration and register four extra columns as
# categorical features.
feat_conf_dict = copy.deepcopy(vars(feat_config))
# NOTE(review): `feature_list` is dropped before re-instantiating; presumably
# FeatureConfig derives it itself -- confirm.
del feat_conf_dict["feature_list"]
feat_conf_dict["categorical_features"] = feat_conf_dict["categorical_features"] + [
    "stdorToU",
    "Acorn",
    "LCLid",
    "Acorn_grouped",
]
_feat_config = FeatureConfig(**feat_conf_dict)

In [59]:
from lightgbm import LGBMRegressor
# Model configuration used as the template for every random partition.
model_config = ModelConfig(
    name="Tuned GFM+Meta+Random Part",
    model=LGBMRegressor(**best_params, verbose=-1),
    # No normalization: LGBM is not sensitive to the scale of its inputs
    normalize=False,
    # No imputation: LGBM can handle missing values itself
    fill_missing=False,
)

In [60]:
def partition(list_in, n):
    """Randomly split the items of ``list_in`` into ``n`` groups.

    The items are shuffled with the global ``random`` generator and then dealt
    round-robin, so group sizes differ by at most one.

    Args:
        list_in: Items to split. It is copied before shuffling, so the
            caller's sequence is left untouched; any iterable is accepted.
        n: Number of groups to create.

    Returns:
        A list of ``n`` lists that together contain every input item once.
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the caller's list as a side effect.
    shuffled = list(list_in)
    random.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]

partitions = partition(train_df.LCLid.cat.categories.tolist(), 3)

In [61]:
# Train one model per random partition of households and collect, for each
# partition, the predictions, the feature importances and the elapsed time.
y_pred_l = []
feat_df_l = []
time_elapsed_l = []
for lclids in tqdm(partitions, desc="Training groups..."):
    # Keep only the rows of the households assigned to this partition.
    _train_df = train_df.loc[train_df.LCLid.isin(lclids)]
    _test_df = test_df.loc[test_df.LCLid.isin(lclids)]
    # NOTE: this overwrites the notebook-level train_*/test_* variables with
    # partition-level subsets; a later cell rebuilds them from the full data.
    train_features, train_target, train_original_target = _feat_config.get_X_y(
        _train_df, categorical=True, exogenous=False
    )
    # Loading the Validation as test
    test_features, test_target, test_original_target = _feat_config.get_X_y(
        _test_df, categorical=True, exogenous=False
    )
    # Categorical columns that are actually present in the feature frame.
    cat_features = set(train_features.columns).intersection(
        _feat_config.categorical_features
    )
    # Work on a clone of the template config for every partition
    # (presumably so each partition fits its own estimator -- confirm).
    _model_config = model_config.clone()
    with LogTime() as timer:
        y_pred, feat_df = train_model(
            _model_config,
            _feat_config,
            missing_value_config,
            train_features,
            train_target,
            test_features,
            fit_kwargs=dict(categorical_feature=cat_features),
        )
    y_pred_l.append(y_pred)
    feat_df_l.append(feat_df)
    time_elapsed_l.append(timer.elapsed)

Training groups...:   0%|          | 0/3 [00:00<?, ?it/s]

Time Elapsed: 34 seconds
Time Elapsed: 37 seconds
Time Elapsed: 31 seconds


In [62]:
# Stack the per-partition predictions into a single object.
y_pred = pd.concat(y_pred_l)

# Rebuild features/targets from the full train and test frames: the partition
# loop left these variables holding only the last partition's subset.
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)

# Evaluate the combined forecast; the elapsed time is the sum over partitions.
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = np.sum(time_elapsed_l)
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
# Add this model's predictions to the shared prediction frame.
pred_df = pred_df.join(y_pred)

Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [63]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089
3,GFM+Meta (TargetEncoder),0.079553,0.027241,1.011387,0.299803,102.253439
4,GFM+Meta (NativeLGBM),0.079338,0.027199,1.011089,0.047789,66.527437
5,Tuned GFM+Meta,0.074073,0.030878,0.91689,-12.17957,94.305064
6,Tuned GFM+Meta+Random Part,0.073132,0.030865,0.905318,-12.380182,103.310471


In [64]:
feat_df_l

[                                           feature  importance
 45                        energy_consumption_lag_1         847
 35                                           LCLid         645
 4                 energy_consumption_rolling_3_std         581
 12  energy_consumption_336_seasonal_rolling_3_mean         498
 25   energy_consumption_336_seasonal_rolling_3_std         406
 ..                                             ...         ...
 46                           timestamp_Month_cos_3           0
 14                           timestamp_Month_cos_2           0
 39                           timestamp_Is_year_end           0
 17                      timestamp_Is_quarter_start           0
 23                          timestamp_Minute_cos_2           0
 
 [77 rows x 2 columns],
                              feature  importance
 45          energy_consumption_lag_1        1200
 35                             LCLid        1084
 73          energy_consumption_lag_2         690
 4   e

In [65]:
temp = feat_df_l.copy()

In [66]:
feat_df

Unnamed: 0,feature,importance
45,energy_consumption_lag_1,751
35,LCLid,562
12,energy_consumption_336_seasonal_rolling_3_mean,532
4,energy_consumption_rolling_3_std,442
25,energy_consumption_336_seasonal_rolling_3_std,437
...,...,...
31,timestamp_Month,0
51,timestamp_Minute_sin_4,0
14,timestamp_Month_cos_2,0
46,timestamp_Month_cos_3,0


In [67]:
# Summing feature importance across partitions (dirty approximation)
# Index instead of pop() so `feat_df_l` stays intact and the cell can be re-run.
feat_df = feat_df_l[0]
for i, d in enumerate(feat_df_l[1:], start=1):
    # A distinct suffix per partition keeps the merged importance columns
    # uniquely named.
    feat_df = feat_df.merge(d, on="feature", suffixes=("", f"_{i}"))

feat_df = feat_df.set_index("feature")
# Every remaining column is one partition's importance; add them up per feature.
feat_df["importance"] = feat_df.sum(axis=1)
feat_df = feat_df.reset_index()
# Sort so that `head(15)` in the plotting cell is the top 15 by aggregate importance.
feat_df = feat_df.loc[:, ["feature", "importance"]].sort_values("importance", ascending=False)

In [68]:
# Bar chart of the first 15 rows of the aggregated feature-importance table.
fig = px.bar(feat_df.iloc[:15], x="feature", y="importance")
format_plot(
    fig,
    xlabel="Features",
    ylabel="Importance",
    title=f"Aggregate Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/random_partition_fimp.png")
fig.show()

### Judgmental

We can partition based on ACORN Groups

In [69]:
# Clone the base feature configuration and register three extra columns as
# categorical features ("Acorn_grouped" is not added in this section).
feat_conf_dict = copy.deepcopy(vars(feat_config))
# NOTE(review): `feature_list` is dropped before re-instantiating; presumably
# FeatureConfig derives it itself -- confirm.
del feat_conf_dict["feature_list"]
feat_conf_dict["categorical_features"] = feat_conf_dict["categorical_features"] + [
    "stdorToU",
    "Acorn",
    "LCLid",
]
_feat_config = FeatureConfig(**feat_conf_dict)

In [70]:
from lightgbm import LGBMRegressor
# Model configuration used as the template for every ACORN-group partition.
model_config = ModelConfig(
    name="Tuned GFM+Meta+ACORN Part",
    model=LGBMRegressor(**best_params, verbose=-1),
    # No normalization: LGBM is not sensitive to the scale of its inputs
    normalize=False,
    # No imputation: LGBM can handle missing values itself
    fill_missing=False,
)

In [71]:
# Train one model per distinct `Acorn_grouped` value and collect, for each
# group, the predictions, the feature importances and the elapsed time.
y_pred_l = []
feat_df_l = []
time_elapsed_l = []
for acn in tqdm(train_df["Acorn_grouped"].unique(), desc="Training groups..."):
    # Keep only the rows belonging to this ACORN group.
    _train_df = train_df.loc[train_df.Acorn_grouped == acn]
    _test_df = test_df.loc[test_df.Acorn_grouped == acn]
    # NOTE: this overwrites the notebook-level train_*/test_* variables with
    # group-level subsets; a later cell rebuilds them from the full data.
    train_features, train_target, train_original_target = _feat_config.get_X_y(
        _train_df, categorical=True, exogenous=False
    )
    # Loading the Validation as test
    test_features, test_target, test_original_target = _feat_config.get_X_y(
        _test_df, categorical=True, exogenous=False
    )
    # Categorical columns that are actually present in the feature frame.
    cat_features = set(train_features.columns).intersection(
        _feat_config.categorical_features
    )
    # Work on a clone of the template config for every group
    # (presumably so each group fits its own estimator -- confirm).
    _model_config = model_config.clone()
    with LogTime() as timer:
        y_pred, feat_df = train_model(
            _model_config,
            _feat_config,
            missing_value_config,
            train_features,
            train_target,
            test_features,
            fit_kwargs=dict(categorical_feature=cat_features),
        )
    y_pred_l.append(y_pred)
    feat_df_l.append(feat_df)
    time_elapsed_l.append(timer.elapsed)

Training groups...:   0%|          | 0/3 [00:00<?, ?it/s]

Time Elapsed: 34 seconds
Time Elapsed: 29 seconds
Time Elapsed: 30 seconds


In [72]:
pred_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,energy_consumption,GFM Baseline,GFM+Meta (CountEncoder),GFM+Meta (TargetEncoder),GFM+Meta (NativeLGBM),Tuned GFM+Meta,Tuned GFM+Meta+Random Part
LCLid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MAC000061,2014-01-01 00:00:00,0.165,0.13017,0.127625,0.130004,0.128841,0.113652,0.128507
MAC000061,2014-01-01 00:30:00,0.167,0.117602,0.120527,0.119749,0.117148,0.099891,0.117822


In [73]:
y_pred.reset_index().set_index(['LCLid','timestamp'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Tuned GFM+Meta+ACORN Part
LCLid,timestamp,Unnamed: 2_level_1
MAC000066,2014-01-01 00:00:00,0.162444
MAC000066,2014-01-01 00:30:00,0.174698
MAC000066,2014-01-01 01:00:00,0.155056
MAC000066,2014-01-01 01:30:00,0.155971
MAC000066,2014-01-01 02:00:00,0.162342
...,...,...
MAC005521,2014-01-31 21:30:00,0.107321
MAC005521,2014-01-31 22:00:00,0.111724
MAC005521,2014-01-31 22:30:00,0.093813
MAC005521,2014-01-31 23:00:00,0.114839


In [74]:
# Stack the per-group predictions into a single object.
y_pred = pd.concat(y_pred_l)

# Rebuild features/targets from the full train and test frames: the group
# loop left these variables holding only the last group's subset.
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)

# Evaluate the combined forecast; the elapsed time is the sum over groups.
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = np.sum(time_elapsed_l)
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
# NOTE(review): the join below is disabled, so this model's predictions are
# NOT added to `pred_df` (only its metrics are recorded) -- confirm intent.
#pred_df = pred_df.join(y_pred.reset_index().set_index(['LCLid','timestamp']))

Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [75]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089
3,GFM+Meta (TargetEncoder),0.079553,0.027241,1.011387,0.299803,102.253439
4,GFM+Meta (NativeLGBM),0.079338,0.027199,1.011089,0.047789,66.527437
5,Tuned GFM+Meta,0.074073,0.030878,0.91689,-12.17957,94.305064
6,Tuned GFM+Meta+Random Part,0.073132,0.030865,0.905318,-12.380182,103.310471
7,Tuned GFM+Meta+ACORN Part,0.073037,0.030535,0.904966,-12.22154,95.054462


In [76]:
feat_df

Unnamed: 0,feature,importance
45,energy_consumption_lag_1,857
35,LCLid,600
4,energy_consumption_rolling_3_std,474
72,energy_consumption_lag_2,458
12,energy_consumption_336_seasonal_rolling_3_mean,432
...,...,...
18,timestamp_Is_month_start,0
51,timestamp_Minute_sin_4,0
19,timestamp_Is_year_start,0
46,timestamp_Month_cos_3,0


In [77]:
# Summing feature importance across partitions (dirty approximation)
# Index instead of pop() so `feat_df_l` stays intact and the cell can be re-run.
feat_df = feat_df_l[0]
for i, d in enumerate(feat_df_l[1:], start=1):
    # A distinct suffix per partition keeps the merged importance columns
    # uniquely named.
    feat_df = feat_df.merge(d, on="feature", suffixes=("", f"_{i}"))

feat_df = feat_df.set_index("feature")
# Every remaining column is one partition's importance; add them up per feature.
feat_df["importance"] = feat_df.sum(axis=1)

feat_df = feat_df.reset_index()

# Sort so that `head(15)` in the plotting cell is the top 15 by aggregate importance.
feat_df = feat_df.loc[:, ["feature", "importance"]].sort_values("importance", ascending=False)

In [78]:
# Bar chart of the first 15 rows of the aggregated feature-importance table.
fig = px.bar(feat_df.iloc[:15], x="feature", y="importance")
format_plot(
    fig,
    xlabel="Features",
    ylabel="Importance",
    title=f"Aggregate Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/acorn_partition_fimp.png")
fig.show()

### Algorithmic

In [79]:
# Clone the base feature configuration and register four extra columns as
# categorical features.
feat_conf_dict = copy.deepcopy(vars(feat_config))
# NOTE(review): `feature_list` is dropped before re-instantiating; presumably
# FeatureConfig derives it itself -- confirm.
del feat_conf_dict["feature_list"]
feat_conf_dict["categorical_features"] = feat_conf_dict["categorical_features"] + [
    "stdorToU",
    "Acorn",
    "LCLid",
    "Acorn_grouped",
]
_feat_config = FeatureConfig(**feat_conf_dict)

#### Creating Statistical Features for the different households

In [80]:
import tsfel
# Feature configuration: the tsfel "statistical" domain merged with the
# "temporal" domain.
cfg = {
    **tsfel.get_features_by_domain("statistical"),
    **tsfel.get_features_by_domain("temporal"),
}

uniq_ids = train_df.LCLid.cat.categories

# One feature frame per household, extracted from its training consumption series.
household_feats = []
for id_ in tqdm(uniq_ids, desc="Calculating features for all households"):
    is_household = train_df.LCLid == id_
    ts = train_df.loc[is_household, "energy_consumption"]
    res = tsfel.time_series_features_extractor(cfg, ts, verbose=False)
    res["LCLid"] = id_
    household_feats.append(res)

# Combine the per-household frames, indexed by household id.
stat_df = pd.concat(household_feats).set_index("LCLid")
del res
stat_df.head()

Calculating features for all households:   0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0_level_0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Average power,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,...,0_Peak to peak distance,0_Positive turning points,0_Root mean square,0_Signal distance,0_Skewness,0_Slope,0_Standard deviation,0_Sum absolute diff,0_Variance,0_Zero crossing rate
LCLid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAC000061,851.246317,43.296783,851.246317,2.426102,174.431914,7017.0,28070.0,0.048,0.209,2.8e-05,...,1.152,7989.0,0.155757,35156.343443,1.227616,1.339344e-07,0.095042,1226.050345,0.009033,106.0
MAC000062,2627.987007,84.536642,2627.987007,7.489917,184.162699,7017.0,28070.0,0.158,0.296,2.8e-05,...,2.655,11613.0,0.273673,35374.532035,4.17389,1.153972e-06,0.12979,2641.711059,0.016845,0.0
MAC000066,1123.258387,49.577472,1123.258387,3.201352,173.921136,7017.0,28070.0,0.057,0.174,2.8e-05,...,1.935,10943.0,0.178921,35227.17361,4.09796,8.020245e-08,0.109757,1355.027212,0.012047,0.0
MAC000086,1243.110599,43.137558,1243.110599,3.542938,175.534717,7017.0,28070.0,0.045,0.157,2.8e-05,...,1.82,10518.0,0.188224,35457.31625,4.63843,4.580171e-07,0.142524,3015.591537,0.020313,0.0
MAC000126,1300.694345,38.672155,1300.694345,3.707055,166.246341,7017.0,28070.0,0.025,0.157,2.8e-05,...,2.005673,11023.0,0.192534,35386.910614,3.814583,-3.559724e-07,0.157884,2314.231594,0.024927,1068.0


#### Clustering the different households

In [81]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from src.utils.data_utils import replace_array_in_dataframe
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding

In [82]:
# Step 1 - standardize every feature so none dominates the distance computation.
scaled_values = StandardScaler().fit_transform(stat_df)
X_std = replace_array_in_dataframe(stat_df, scaled_values)

# Step 2 - non-linear reduction to two dimensions with t-SNE.
tsne = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate="auto",
    init="pca",
    random_state=42,
    metric="cosine",
)
X_tsne = tsne.fit_transform(X_std.values)

# Step 3 - k-means with 3 clusters on the reduced coordinates.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_tsne)
# Cluster label per household (a Series indexed like `X_std`).
cluster_df = pd.Series(kmeans.labels_, index=X_std.index)

In [83]:
# t-SNE coordinates per household plus the cluster label as a string column.
plot_df = (
    pd.DataFrame(X_tsne, columns=["dim_1", "dim_2"], index=stat_df.index)
    .reset_index()
    .assign(clusters=kmeans.labels_)
    .astype({"clusters": str})
)

In [84]:
# Scatter of the households in the 2-D t-SNE space, coloured and shaped by cluster.
fig = px.scatter(plot_df, x="dim_1", y="dim_2", color="clusters", symbol="clusters", hover_name="LCLid")
fig.update_traces(marker=dict(size=8), selector=dict(mode='markers'))
# The y axis plots `dim_2`, so it is labelled "Dimension 2".
format_plot(
    fig,
    xlabel="Dimension 1",
    ylabel="Dimension 2",
    title="Clustered t-SNE",
    font_size=16,
    legends=["Cluster 1", "Cluster 2", "Cluster 3"],
)
fig.write_image("imgs/chapter_10/clusters_tsne.png")
fig.show()

#### Using the clusters to partition

In [85]:
from lightgbm import LGBMRegressor
# Model configuration used as the template for every cluster-based partition.
model_config = ModelConfig(
    name="Tuned GFM+Meta+Clustered Part",
    model=LGBMRegressor(**best_params, verbose=-1),
    # No normalization: LGBM is not sensitive to the scale of its inputs
    normalize=False,
    # No imputation: LGBM can handle missing values itself
    fill_missing=False,
)

In [86]:
# Train one model per k-means cluster of households and collect, for each
# cluster, the predictions, the feature importances and the elapsed time.
y_pred_l = []
feat_df_l = []
time_elapsed_l = []
# NOTE: `acn` holds a cluster label here (the name is carried over from the
# ACORN loop).
for acn in tqdm(cluster_df.unique(), desc="Training groups..."):
    # Household ids assigned to this cluster.
    lclids = cluster_df[cluster_df==acn].index
    _train_df = train_df.loc[train_df.LCLid.isin(lclids)]
    _test_df = test_df.loc[test_df.LCLid.isin(lclids)]
    # NOTE: this overwrites the notebook-level train_*/test_* variables with
    # cluster-level subsets; a later cell rebuilds them from the full data.
    train_features, train_target, train_original_target = _feat_config.get_X_y(
        _train_df, categorical=True, exogenous=False
    )
    # Loading the Validation as test
    test_features, test_target, test_original_target = _feat_config.get_X_y(
        _test_df, categorical=True, exogenous=False
    )
    # Categorical columns that are actually present in the feature frame.
    cat_features = set(train_features.columns).intersection(
        _feat_config.categorical_features
    )
    # Work on a clone of the template config for every cluster
    # (presumably so each cluster fits its own estimator -- confirm).
    _model_config = model_config.clone()
    with LogTime() as timer:
        y_pred, feat_df = train_model(
            _model_config,
            _feat_config,
            missing_value_config,
            train_features,
            train_target,
            test_features,
            fit_kwargs=dict(categorical_feature=cat_features),
        )
    y_pred_l.append(y_pred)
    feat_df_l.append(feat_df)
    time_elapsed_l.append(timer.elapsed)

Training groups...:   0%|          | 0/3 [00:00<?, ?it/s]

Time Elapsed: 33 seconds
Time Elapsed: 41 seconds
Time Elapsed: 24 seconds


In [87]:
# Stack the per-cluster predictions into a single object.
y_pred = pd.concat(y_pred_l)

# Rebuild features/targets from the full train and test frames: the cluster
# loop left these variables holding only the last cluster's subset.
test_features, test_target, test_original_target = _feat_config.get_X_y(
    test_df, categorical=True, exogenous=False
)
train_features, train_target, train_original_target = _feat_config.get_X_y(
    train_df, categorical=True, exogenous=False
)

# Evaluate the combined forecast; the elapsed time is the sum over clusters.
agg_metrics, eval_metrics_df = evaluate_forecast(y_pred, test_target, train_target, model_config)
agg_metrics["Time Elapsed"] = np.sum(time_elapsed_l)
metric_record.append(agg_metrics)
individual_metrics[model_config.name]=eval_metrics_df
# NOTE(review): the join below is disabled, so this model's predictions are
# NOT added to `pred_df` (only its metrics are recorded) -- confirm intent.
#pred_df = pred_df.join(y_pred)

Calculating metrics...:   0%|          | 0/150 [00:00<?, ?it/s]

In [88]:
pd.DataFrame(metric_record)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.077107,0.027468,0.977439,0.032495,
1,GFM Baseline,0.079552,0.027332,1.012078,0.190848,39.544276
2,GFM+Meta (CountEncoder),0.079412,0.027232,1.011083,0.091026,112.729089
3,GFM+Meta (TargetEncoder),0.079553,0.027241,1.011387,0.299803,102.253439
4,GFM+Meta (NativeLGBM),0.079338,0.027199,1.011089,0.047789,66.527437
5,Tuned GFM+Meta,0.074073,0.030878,0.91689,-12.17957,94.305064
6,Tuned GFM+Meta+Random Part,0.073132,0.030865,0.905318,-12.380182,103.310471
7,Tuned GFM+Meta+ACORN Part,0.073037,0.030535,0.904966,-12.22154,95.054462
8,Tuned GFM+Meta+Clustered Part,0.073181,0.030572,0.915675,-12.595349,99.653251


In [89]:
# Summing feature importance across partitions (dirty approximation)
# Index instead of pop() so `feat_df_l` stays intact and the cell can be re-run.
feat_df = feat_df_l[0]
for i, d in enumerate(feat_df_l[1:], start=1):
    # A distinct suffix per partition keeps the merged importance columns
    # uniquely named.
    feat_df = feat_df.merge(d, on="feature", suffixes=("", f"_{i}"))
feat_df = feat_df.set_index("feature")
# Every remaining column is one partition's importance; add them up per feature.
feat_df["importance"] = feat_df.sum(axis=1)
feat_df = feat_df.reset_index()
# Highest aggregate importance first, so `head(15)` is the top 15.
feat_df = feat_df.loc[:, ["feature", "importance"]].sort_values("importance", ascending=False)

In [90]:
# Bar chart of the first 15 rows of the aggregated feature-importance table.
fig = px.bar(feat_df.iloc[:15], x="feature", y="importance")
format_plot(
    fig,
    xlabel="Features",
    ylabel="Importance",
    title=f"Aggregate Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_10/clustered_partition_fimp.png")
fig.show()

## Summary

In [91]:
def highlight_abs_min(s, props=''):
    """Return per-cell CSS that marks the smallest-magnitude entry of ``s``.

    Intended for ``Styler.apply``: cells whose absolute value equals the
    minimum absolute value get ``props``; every other cell gets ``''``.

    Args:
        s: pandas Series of numeric values.
        props: CSS declaration string applied to the matching cell(s).

    Returns:
        numpy array with one string per entry of ``s``.
    """
    abs_vals = np.abs(s.values)
    # Compare magnitudes with the minimum magnitude. Comparing the signed
    # values would never match when the closest-to-zero entry is negative.
    # NaNs are ignored by nanmin and never compare equal.
    return np.where(abs_vals == np.nanmin(abs_vals), props, '')

In [92]:
# Summary table of all recorded models. The lowest error metrics and the
# forecast bias selected by `highlight_abs_min` are shaded light green.
agg_metrics = pd.DataFrame(metric_record)
(
    agg_metrics.style
    .format(
        {
            "MAE": "{:.4f}",
            "MSE": "{:.4f}",
            "meanMASE": "{:.4f}",
            "Forecast Bias": "{:.2f}%",
        }
    )
    .highlight_min(color="lightgreen", subset=["MAE", "MSE", "meanMASE"])
    .apply(
        highlight_abs_min,
        props="color:black;background-color:lightgreen",
        axis=0,
        subset=["Forecast Bias"],
    )
)

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias,Time Elapsed
0,LightGBM,0.0771,0.0275,0.9774,0.03%,
1,GFM Baseline,0.0796,0.0273,1.0121,0.19%,39.544276
2,GFM+Meta (CountEncoder),0.0794,0.0272,1.0111,0.09%,112.729089
3,GFM+Meta (TargetEncoder),0.0796,0.0272,1.0114,0.30%,102.253439
4,GFM+Meta (NativeLGBM),0.0793,0.0272,1.0111,0.05%,66.527437
5,Tuned GFM+Meta,0.0741,0.0309,0.9169,-12.18%,94.305064
6,Tuned GFM+Meta+Random Part,0.0731,0.0309,0.9053,-12.38%,103.310471
7,Tuned GFM+Meta+ACORN Part,0.073,0.0305,0.905,-12.22%,95.054462
8,Tuned GFM+Meta+Clustered Part,0.0732,0.0306,0.9157,-12.60%,99.653251


In [93]:
pred_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,energy_consumption,GFM Baseline,GFM+Meta (CountEncoder),GFM+Meta (TargetEncoder),GFM+Meta (NativeLGBM),Tuned GFM+Meta,Tuned GFM+Meta+Random Part
LCLid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MAC000061,2014-01-01 00:00:00,0.165,0.13017,0.127625,0.130004,0.128841,0.113652,0.128507
MAC000061,2014-01-01 00:30:00,0.167,0.117602,0.120527,0.119749,0.117148,0.099891,0.117822
MAC000061,2014-01-01 01:00:00,0.15,0.121907,0.126834,0.127883,0.128681,0.137708,0.152569
MAC000061,2014-01-01 01:30:00,0.091,0.115268,0.115479,0.117781,0.119456,0.129633,0.152024
MAC000061,2014-01-01 02:00:00,0.047,0.077152,0.078502,0.079642,0.079606,0.063214,0.070015


In [94]:
individual_metrics.keys()

dict_keys(['GFM Baseline', 'GFM+Meta (CountEncoder)', 'GFM+Meta  (TargetEncoder)', 'GFM+Meta  (NativeLGBM)', 'Tuned GFM+Meta', 'Tuned GFM+Meta+Random Part', 'Tuned GFM+Meta+ACORN Part', 'Tuned GFM+Meta+Clustered Part'])

# Saving the GFM Forecasts and Metrics

In [95]:
# Folder that receives the saved forecasts and metrics; created if missing.
output = Path("data/london_smart_meters/output")
os.makedirs(output, exist_ok=True)

In [96]:
# Persist the results under `output`: the prediction frame, the per-model
# metrics dictionary (via joblib) and the aggregate metrics table.
pred_df.to_pickle(output/"gfm_predictions_val_df.pkl")
joblib.dump(individual_metrics, output/"gfm_metrics_val_df.pkl")
agg_metrics.to_pickle(output/"gfm_aggregate_metrics_val.pkl")