In [1]:
%cd ../..

D:\Playground\AdvancedTimeSeriesForecastingBook\Github\Modern-Time-Series-Forecasting-with-Python-


In [2]:
import numpy as np
import pandas as pd
import time
import plotly.express as px
import plotly.graph_objects as go
import os
import plotly.io as pio
pio.templates.default = "plotly_white"

from pathlib import Path
from tqdm.autonotebook import tqdm
import warnings
import humanize
import joblib

from darts.metrics import mase, mse, mae
from sklearn.preprocessing import StandardScaler

from src.utils.ts_utils import forecast_bias, darts_metrics_adapter
from src.utils.general import LogTime
from src.utils import plotting_utils
%load_ext autoreload
%autoreload 2
np.random.seed(42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [3]:
# Folder that receives every figure exported by this notebook
Path("imgs/chapter_7").mkdir(parents=True, exist_ok=True)
# Input (preprocessed data) and output (forecasts, metrics) locations
output = Path("data/london_smart_meters/output")
preprocessed = Path("data/london_smart_meters/preprocessed")

In [4]:
def format_plot(fig, legends = None, xlabel="Time", ylabel="Value", title=""):
    """Apply the notebook's standard size, title and axis styling to a figure.

    Args:
        fig: Figure exposing ``for_each_trace`` and ``update_layout``
            (a Plotly figure in this notebook).
        legends: Optional iterable of trace names. They are assigned to the
            traces in order and recycled when there are more traces than names.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title, anchored at the horizontal centre.

    Returns:
        The same ``fig`` object that was passed in.
    """
    if legends:
        # Imported locally so the helper works even if the cell that imports
        # `cycle` further down the notebook has not been executed yet.
        from itertools import cycle

        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    # `titlefont` is a deprecated alias; fonts are set through the nested
    # `title.font` attribute instead.
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            yanchor='top',
            font=dict(size=20),
        ),
        legend_title=None,
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=12)),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=12)),
        ),
    )
    return fig

In [5]:
# Read the missing-value-imputed, feature-engineered training split
train_df = pd.read_parquet(preprocessed/"block_0-7_train_missing_imputed_feature_engg.parquet")
# Auto-stationary target for the training split (joined onto train_df in a later cell)
auto_stat_target = pd.read_parquet(preprocessed/"block_0-7_train_auto_stat_target.parquet")
# NOTE(review): joblib.load unpickles arbitrary objects; only load artefacts produced by a trusted run.
transformer_pipelines = joblib.load(preprocessed/"auto_transformer_pipelines_train.pkl")
# The validation split is read in and used as the test set throughout this notebook
test_df = pd.read_parquet(preprocessed/"block_0-7_val_missing_imputed_feature_engg.parquet")

In [6]:
len(train_df.LCLid.unique())

399

In [7]:
# Join the auto-stationary target onto the training rows by (LCLid, timestamp);
# presumably auto_stat_target is indexed the same way — TODO confirm.
train_df = train_df.set_index(['LCLid','timestamp']).join(auto_stat_target).reset_index()
# NOTE(review): after this in-place rename test_df no longer has a column called
# "energy_consumption", yet `target_col` is set to "energy_consumption" below and
# get_X_y selects that column from test_df — confirm the intended target naming.
test_df.rename(columns={"energy_consumption":"energy_consumption_auto_stat"}, inplace=True)

### Loading the single step backtesting baselines for validation

In [None]:
# Baseline metrics on the validation split: one frame keyed by LCLid, one aggregated.
# NOTE(review): read_pickle executes pickle payloads; only load files written by this project.
baseline_metrics_df = pd.read_pickle(output/"single_step_backtesting_baseline_metrics_val_df.pkl")
baseline_aggregate_metrics_df = pd.read_pickle(output/"single_step_backtesting_baseline_aggregate_metrics_val.pkl")
# Test-split counterparts, left disabled in this notebook:
# baseline_metrics_test_df = pd.read_pickle(output/"single_step_backtesting_baseline_metrics_test_df.pkl")
# baseline_aggregate_metrics_test_df = pd.read_pickle(output/"single_step_backtesting_baseline_aggregate_metrics_test.pkl")

# Feature Definition

In [None]:
# Column roles used by get_X_y: the date column doubles as the index of X and y
date_col = "timestamp"
target_col = "energy_consumption"
index_cols = [date_col]
# Columns treated as categorical; only added to the features when get_X_y(categorical=True)
categorical_features= [
    'holidays',
    'precipType',
    'icon',
    'summary',
    'timestamp_Month',
    'timestamp_Quarter',
    'timestamp_WeekDay',
    'timestamp_Dayofweek',
    'timestamp_Dayofyear',
    'timestamp_Hour',
    'timestamp_Minute',

]
# Boolean calendar flags; part of feature_list below but not selected by get_X_y
boolean_features = [
    'timestamp_Is_quarter_end',
    'timestamp_Is_quarter_start',
    'timestamp_Is_year_end',
    'timestamp_Is_year_start',
    'timestamp_Is_month_start',
]
# Continuous columns: always selected by get_X_y and the ones scaled by eval_model
continuous_features = [
 # Weather readings (these also appear in exogenous_variables)
 'visibility',
 'windBearing',
 'temperature',
 'dewPoint',
 'pressure',
 'apparentTemperature',
 'windSpeed',
 'humidity',
 # Target-derived columns (names suggest lags, rolling and seasonal-rolling stats, EWMAs)
 'energy_consumption_lag_1',
 'energy_consumption_lag_2',
 'energy_consumption_lag_3',
 'energy_consumption_lag_4',
 'energy_consumption_lag_5',
 'energy_consumption_lag_49',
 'energy_consumption_lag_50',
 'energy_consumption_lag_51',
 'energy_consumption_lag_52',
 'energy_consumption_lag_53',
 'energy_consumption_rolling_3_mean',
 'energy_consumption_rolling_3_std',
 'energy_consumption_rolling_6_mean',
 'energy_consumption_rolling_6_std',
 'energy_consumption_rolling_12_mean',
 'energy_consumption_rolling_12_std',
 'energy_consumption_rolling_48_mean',
 'energy_consumption_rolling_48_std',
 'energy_consumption_48_seasonal_rolling_3_mean',
 'energy_consumption_48_seasonal_rolling_3_std',
 'energy_consumption_336_seasonal_rolling_3_mean',
 'energy_consumption_336_seasonal_rolling_3_std',
 'energy_consumption_ewma__span_2880',
 'energy_consumption_ewma__span_336',
 'energy_consumption_ewma__span_48',
 # Timestamp-derived columns (elapsed time plus sin/cos terms per calendar unit)
 'timestamp_Elapsed',
 'timestamp_Month_sin_1',
 'timestamp_Month_sin_2',
 'timestamp_Month_sin_3',
 'timestamp_Month_sin_4',
 'timestamp_Month_sin_5',
 'timestamp_Month_cos_1',
 'timestamp_Month_cos_2',
 'timestamp_Month_cos_3',
 'timestamp_Month_cos_4',
 'timestamp_Month_cos_5',
 'timestamp_Hour_sin_1',
 'timestamp_Hour_sin_2',
 'timestamp_Hour_sin_3',
 'timestamp_Hour_sin_4',
 'timestamp_Hour_sin_5',
 'timestamp_Hour_cos_1',
 'timestamp_Hour_cos_2',
 'timestamp_Hour_cos_3',
 'timestamp_Hour_cos_4',
 'timestamp_Hour_cos_5',
 'timestamp_Minute_sin_1',
 'timestamp_Minute_sin_2',
 'timestamp_Minute_sin_3',
 'timestamp_Minute_sin_4',
 'timestamp_Minute_sin_5',
 'timestamp_Minute_cos_1',
 'timestamp_Minute_cos_2',
 'timestamp_Minute_cos_3',
 'timestamp_Minute_cos_4',
 'timestamp_Minute_cos_5']

# Columns removed from the features when get_X_y is called with exogenous=False.
# The assertion below requires every entry to be present in feature_list.
exogenous_variables = [
    'holidays',
    'precipType',
    'icon',
    'summary',
    'visibility',
    'windBearing',
    'temperature',
    'dewPoint',
    'pressure',
    'apparentTemperature',
    'windSpeed',
    'humidity',
]

# Every declared feature, plus the bookkeeping columns that are expected in the frame
feature_list = [*categorical_features, *continuous_features, *boolean_features]
used_columns = feature_list + [target_col, date_col]

# Sanity checks on the declarations above
assert target_col not in feature_list
assert date_col not in feature_list
assert set(exogenous_variables).issubset(feature_list)
assert all(col in used_columns for col in index_cols)
# Report the columns of train_df that no declaration refers to
dropped_columns = set(train_df.columns).difference(used_columns)
print(f"Columns not being used: {dropped_columns}")

# Feature and Target Preparation

In [None]:
def get_X_y(df, categorical=False, exogenous=False):
    """Split ``df`` into a feature frame and a target frame.

    Args:
        df: Frame holding the feature columns, ``target_col`` and ``index_cols``.
        categorical: If True, ``categorical_features`` are added to the
            continuous features.
        exogenous: If False, every column listed in ``exogenous_variables``
            is removed from the selected features.

    Returns:
        ``(X, y)``: two DataFrames, both indexed by ``index_cols``. Feature
        columns keep the order in which they are declared.
    """
    # Copy first: `+=` on the module-level list would permanently append the
    # categorical columns to `continuous_features` on every call.
    feature_list = list(continuous_features)
    if categorical:
        feature_list += categorical_features
    if not exogenous:
        excluded = set(exogenous_variables)
        feature_list = [col for col in feature_list if col not in excluded]
    # Drop duplicates while keeping declaration order, so the column order is
    # reproducible across runs (a set difference is hash-order dependent).
    feature_list = list(dict.fromkeys(feature_list))
    X = df.loc[:, feature_list + index_cols].set_index(index_cols)
    y = df.loc[:, [target_col] + index_cols].set_index(index_cols)
    return X, y

# Sample Household

In [None]:
# Restrict both splits to the sample household before building features and target
sample_household = "MAC000193"
household_train = train_df.loc[train_df.LCLid == sample_household, :]
household_val = test_df.loc[test_df.LCLid == sample_household, :]
train_features, train_target = get_X_y(household_train, categorical=False, exogenous=False)
# The validation split plays the role of the test set
test_features, test_target = get_X_y(household_val, categorical=False, exogenous=False)

# Missing Value Handling

## Null check

In [None]:
# Missing values per training feature; only columns with at least one are shown
nc = train_features.isna().sum()
nc[nc.gt(0)]

In [None]:
# Same check for the features of the validation (test) split
nc = test_features.isna().sum()
nc[nc.gt(0)]

In [None]:
# Columns that impute_missing_values fills backwards (next valid value) before
# falling back to the column mean
bfill_cols=['energy_consumption_lag_1',
 'energy_consumption_lag_2',
 'energy_consumption_lag_3',
 'energy_consumption_lag_4',
 'energy_consumption_lag_5',
 'energy_consumption_lag_49',
 'energy_consumption_lag_50',
 'energy_consumption_lag_51',
 'energy_consumption_lag_52',
 'energy_consumption_lag_53',
 'energy_consumption_rolling_3_mean',
 'energy_consumption_rolling_3_std',
 'energy_consumption_rolling_6_mean',
 'energy_consumption_rolling_6_std',
 'energy_consumption_rolling_12_mean',
 'energy_consumption_rolling_12_std',
 'energy_consumption_rolling_48_mean',
 'energy_consumption_rolling_48_std',
 'energy_consumption_48_seasonal_rolling_3_mean',
 'energy_consumption_48_seasonal_rolling_3_std',
 'energy_consumption_336_seasonal_rolling_3_mean',
 'energy_consumption_336_seasonal_rolling_3_std',
 'energy_consumption_ewma__span_2880',
 'energy_consumption_ewma__span_336',
 'energy_consumption_ewma__span_48']
# Columns that impute_missing_values fills with 0 (none at the moment)
zero_fill_cols = []

def impute_missing_values(df):
    """Return a copy of ``df`` with missing values filled.

    ``bfill_cols`` are back-filled, ``zero_fill_cols`` are filled with 0, and
    whatever is still missing in numeric columns is replaced by the column
    mean. The input frame is not modified.

    Args:
        df: Feature frame containing at least the columns in ``bfill_cols``
            and ``zero_fill_cols``.

    Returns:
        A new DataFrame with the same shape and columns as ``df``.
    """
    df = df.copy()
    # `.bfill()` replaces the deprecated `fillna(method="bfill")` spelling
    if bfill_cols:
        df[bfill_cols] = df[bfill_cols].bfill()
    if zero_fill_cols:
        df[zero_fill_cols] = df[zero_fill_cols].fillna(0)
    # Mean is the default strategy; restrict it to numeric columns so that a
    # frame carrying categorical/string columns does not raise.
    return df.fillna(df.mean(numeric_only=True))

# Running ML models on a Sample household

In [None]:
# Collected metric dicts, one per algorithm
metric_record = list()
# Actuals of both splits stacked; each model's forecast is joined on as a column
pred_df = pd.concat([train_target, test_target], axis=0)

In [None]:
# Seed the comparison table with the baseline rows of the sample household
sample_baselines = baseline_metrics_df.loc[baseline_metrics_df.LCLid=="MAC000193"].drop(columns="LCLid")
metric_record.extend(sample_baselines.to_dict(orient="records"))

In [None]:
metric_record

In [None]:
def intersect_list(list1, list2):
    """Return the elements present in both iterables as a list (no duplicates, set order)."""
    common = set(list1) & set(list2)
    return list(common)

def subtract_list(list1, list2):
    """Return the elements of ``list1`` that are not in ``list2`` as a list (no duplicates, set order)."""
    remaining = set(list1).difference(set(list2))
    return list(remaining)

def union_list(list1, list2):
    """Return the elements found in either iterable as a list (no duplicates, set order)."""
    combined = set(list1) | set(list2)
    return list(combined)

In [None]:
def eval_model(model, train_features, train_target, test_features, test_target, name=None, fit_kwargs=None, normalize=True, fill_missing=True):
    """Fit ``model`` on the training split and score its predictions on the test split.

    Args:
        model: Estimator exposing ``fit(X, y, **kwargs)`` and ``predict(X)``.
        train_features: Training feature frame (copied before any change).
        train_target: Training target frame; also passed to MASE as ``insample``.
        test_features: Test feature frame (copied before any change).
        test_target: Test target frame; its index becomes the index of the forecast.
        name: Label for the forecast series and the ``"Algorithm"`` entry.
            Defaults to the class name of ``model``.
        fit_kwargs: Optional extra keyword arguments forwarded to ``model.fit``.
        normalize: If True, scale the continuous columns with a ``StandardScaler``
            that is fitted on the training split only.
        fill_missing: If True, run ``impute_missing_values`` on both splits.

    Returns:
        ``(y_pred, metrics, feat_df)``: the forecast as a Series, a dict with the
        keys ``Algorithm``, ``MAE``, ``MSE``, ``MASE`` and ``Forecast Bias``, and
        a frame of features sorted by descending absolute importance (empty if
        the model exposes neither ``coef_`` nor ``feature_importances_``).
    """
    if name is None:
        name = type(model).__name__
    # A None default avoids sharing one mutable dict between calls
    fit_kwargs = {} if fit_kwargs is None else fit_kwargs
    if fill_missing:
        train_features = impute_missing_values(train_features)
        test_features = impute_missing_values(test_features)
    else:
        train_features = train_features.copy()
        test_features = test_features.copy()
    if normalize:
        scaler = StandardScaler()
        cont_cols = intersect_list(continuous_features, train_features.columns)
        # Fit on train only, then apply the same transform to test
        train_features[cont_cols] = scaler.fit_transform(train_features[cont_cols])
        test_features[cont_cols] = scaler.transform(test_features[cont_cols])
    model.fit(train_features, train_target.values.ravel(), **fit_kwargs)
    # Coefficients take precedence over feature_importances_ when both exist
    if hasattr(model, "coef_"):
        importance = model.coef_.ravel()
    elif hasattr(model, "feature_importances_"):
        importance = model.feature_importances_.ravel()
    else:
        importance = None
    if importance is None:
        feat_df = pd.DataFrame()
    else:
        feat_df = pd.DataFrame({"feature": train_features.columns, "importance": importance})
        # Rank by magnitude so large negative coefficients are not pushed to the end
        feat_df["_abs_imp"] = np.abs(feat_df.importance)
        feat_df = feat_df.sort_values("_abs_imp", ascending=False).drop(columns="_abs_imp")
    y_pred = model.predict(test_features)
    y_pred = pd.Series(y_pred.ravel(), index=test_target.index, name=name)
    return y_pred, {
        "Algorithm": name,
        "MAE": darts_metrics_adapter(mae, actual_series = test_target, pred_series = y_pred),
        "MSE": darts_metrics_adapter(mse, actual_series = test_target, pred_series = y_pred),
        "MASE": darts_metrics_adapter(mase, actual_series = test_target, pred_series = y_pred, insample=train_target),
        "Forecast Bias": darts_metrics_adapter(forecast_bias, actual_series = test_target, pred_series = y_pred)
    }, feat_df

In [None]:
from itertools import cycle
def plot_forecast(pred_df, forecast_columns, forecast_display_names=None):
    """Plot the actual consumption together with one dotted line per forecast.

    Only rows where the first forecast column is not null are drawn.

    Args:
        pred_df: Frame with an ``energy_consumption`` column and the forecast columns.
        forecast_columns: Names of the forecast columns to plot.
        forecast_display_names: Legend labels, one per forecast column.
            Defaults to the column names.

    Returns:
        The assembled ``go.Figure``.
    """
    if forecast_display_names is not None:
        assert len(forecast_columns)==len(forecast_display_names)
    else:
        forecast_display_names = forecast_columns
    has_forecast = pred_df[forecast_columns[0]].notnull()
    visible = pred_df[has_forecast]
    # Turn the qualitative palette into rgba templates whose alpha is filled in per trace
    palette = []
    for hex_code in px.colors.qualitative.Plotly:
        channels = ",".join(str(channel) for channel in plotting_utils.hex_to_rgb(hex_code))
        palette.append("rgba(" + channels + ",<alpha>)")
    # First colour is reserved for the actuals, the rest are recycled over the forecasts
    actual_color = palette[0]
    forecast_colors = cycle(palette[1:])
    fig = go.Figure()
    actual_trace = go.Scatter(
        x=visible.index,
        y=visible.energy_consumption,
        mode='lines',
        line=dict(color=actual_color.replace("<alpha>", "0.9")),
        name='Actual Consumption',
    )
    fig.add_trace(actual_trace)
    for column, label in zip(forecast_columns, forecast_display_names):
        forecast_trace = go.Scatter(
            x=visible.index,
            y=visible[column],
            mode='lines',
            line=dict(dash='dot', color=next(forecast_colors).replace("<alpha>", "1")),
            name=label,
        )
        fig.add_trace(forecast_trace)
    return fig

## Linear Models

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV

### Linear Regression

In [None]:
name = "Linear Regression"
model = LinearRegression()
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=True, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Forecast vs. actuals with the metrics in the title, zoomed to 1-8 January 2014
metrics_title = f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}"
fig = format_plot(
    plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name]),
    title=metrics_title,
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/lin_reg.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/lin_reg_fimp.png")
fig.show()

### Ridge Regression (L2)

In [None]:
name = "Ridge Regression"
model = RidgeCV()
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=True, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Label the forecast trace with the algorithm name (the variable `name`), not the
# literal string 'name', so the legend matches the other model plots.
fig = plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name])
fig = format_plot(fig, title=f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
# Zoom to 1-8 January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/ridge_reg.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/ridge_reg_fimp.png")
fig.show()

### Lasso Regression (L1)

In [None]:
name = "Lasso Regression"
model = LassoCV()
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=True, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Label the forecast trace with the algorithm name (the variable `name`), not the
# literal string 'name', so the legend matches the other model plots.
fig = plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name])
fig = format_plot(fig, title=f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
# Zoom to 1-8 January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/lasso_reg.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/lasso_reg_fimp.png")
fig.show()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
name = "Decision Tree"
model = DecisionTreeRegressor(max_depth = 4, random_state=42)
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=True, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Forecast vs. actuals with the metrics in the title, zoomed to 1-8 January 2014
metrics_title = f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}"
fig = format_plot(
    plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name]),
    title=metrics_title,
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/dtree.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/dtree_fimp.png")
fig.show()

## Bagging and Boosting Trees

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
name = "Random Forest"
model = RandomForestRegressor(random_state=42, max_depth=4)
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=False, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Forecast vs. actuals with the metrics in the title, zoomed to 1-8 January 2014
metrics_title = f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}"
fig = format_plot(
    plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name]),
    title=metrics_title,
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/rf.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/rf_fimp.png")
fig.show()

### XGBoost Random Forest

In [None]:
from xgboost import XGBRFRegressor

In [None]:
name = "XGB Random Forest"
model = XGBRFRegressor(random_state=42, max_depth=4)
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=False, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Forecast vs. actuals with the metrics in the title, zoomed to 1-8 January 2014
metrics_title = f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}"
fig = format_plot(
    plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name]),
    title=metrics_title,
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/xgbrf.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/xgbrf_fimp.png")
fig.show()

### LightGBM

In [None]:
from lightgbm import LGBMRegressor

In [None]:
name = "LightGBM"
model = LGBMRegressor(random_state=42)
# The run time is read back from the LogTime context as `timer.elapsed`
with LogTime() as timer:
    y_pred, metrics, feat_df = eval_model(model, train_features, train_target, test_features, test_target, name=name, normalize=False, fill_missing=True)
metrics['Time Elapsed'] = timer.elapsed
# Replace any earlier result for this algorithm so that re-running the cell neither
# duplicates the metric row nor fails on an already existing forecast column.
metric_record[:] = [rec for rec in metric_record if rec.get("Algorithm") != name]
metric_record.append(metrics)
pred_df = pred_df.drop(columns=name, errors="ignore").join(y_pred)

In [None]:
# Forecast vs. actuals with the metrics in the title, zoomed to 1-8 January 2014
metrics_title = f"{name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}"
fig = format_plot(
    plot_forecast(pred_df, forecast_columns=[name], forecast_display_names=[name]),
    title=metrics_title,
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_7/lgbm.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of feat_df (already sorted by absolute importance)
top_features = feat_df.head(15)
fig = format_plot(
    px.bar(top_features, x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {name}",
)
fig.write_image("imgs/chapter_7/lgbm_fimp.png")
fig.show()

## Summary

In [None]:
def highlight_abs_min(s, props=''):
    """Styler helper that marks the value closest to zero in a column.

    Args:
        s: Series of numeric values (one styled column).
        props: CSS string to return for the highlighted entries.

    Returns:
        Array with ``props`` where the absolute value equals the smallest
        absolute value in ``s`` (NaNs ignored) and ``''`` elsewhere.
    """
    # Compare absolute values on both sides; comparing the signed values would
    # miss a best entry that happens to be negative.
    abs_values = np.abs(s.values)
    return np.where(abs_values == np.nanmin(abs_values), props, '')

# Display formats per metric column; the bias is shown as a percentage
metric_formats = {
    "MAE": "{:.3f}",
    "MSE": "{:.3f}",
    "MASE": "{:.3f}",
    "Forecast Bias": "{:.2f}%",
}
formatted = pd.DataFrame(metric_record).style.format(metric_formats)
# Best error metrics are the column minima; the best bias is the one closest to zero
(
    formatted
    .highlight_min(color='lightgreen', subset=["MAE","MSE","MASE"])
    .apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['Forecast Bias'])
)

# Running ML Forecast for all consumers

Running Lasso Regression, XGB Random Forest, and LightGBM

In [None]:
from sklearn.base import clone
from collections import namedtuple

In [None]:
# Households to forecast, in sorted order
lcl_ids = sorted(train_df.LCLid.unique())

# One entry per algorithm; `normalize` and `fill_missing` are forwarded to eval_model
model_tuple = namedtuple("Model", ["name", "model", "normalize", "fill_missing", "encode_categorical"])
models_to_run = [
    model_tuple(
        name="Lasso",
        model=LassoCV(random_state=42),
        normalize=True,
        fill_missing=True,
        encode_categorical=True,
    ),
    model_tuple(
        name="XGBRandomForest",
        model=XGBRFRegressor(random_state=42),
        normalize=False,
        fill_missing=True,
        encode_categorical=True,
    ),
    model_tuple(
        name="LightGBM",
        model=LGBMRegressor(random_state=42),
        normalize=False,
        fill_missing=True,
        encode_categorical=False,
    ),
]

In [None]:
all_preds = []
all_metrics = []
# We can parallelize this loop to run this faster
for lcl_id in tqdm(lcl_ids):
    # The split depends only on the household, so build it once per household
    # instead of once per model; eval_model works on copies of the feature frames.
    X_train, y_train = get_X_y(train_df.loc[train_df.LCLid==lcl_id,:], categorical=False, exogenous=False)
    X_test, y_test = get_X_y(test_df.loc[test_df.LCLid==lcl_id,:], categorical=False, exogenous=False)
    for model in models_to_run:
        # Clone so every household gets a fresh, unfitted estimator
        _model = clone(model.model)
        y_pred, metrics, feat_df = eval_model(_model, X_train, y_train, X_test, y_test, name=model.name, normalize=model.normalize, fill_missing=model.fill_missing)
        # Long format: one row per timestamp, tagged with household and algorithm
        y_pred.name = "predictions"
        y_pred = y_pred.to_frame()
        y_pred['LCLid'] = lcl_id
        y_pred['Algorithm'] = model.name
        metrics["LCLid"] = lcl_id
        metrics["Algorithm"] = model.name
        y_pred['energy_consumption'] = y_test.values
        all_preds.append(y_pred)
        all_metrics.append(metrics)

In [None]:
# Stack the per-household, per-algorithm prediction frames into one long frame
pred_df = pd.concat(all_preds, axis=0)
pred_df.head(5)

In [None]:
# One row of metrics per (household, algorithm)
metrics_df = pd.DataFrame(data=all_metrics)
metrics_df.head(5)

# Evaluation of ML Forecast

In [None]:
from src.utils import ts_utils

In [None]:
baseline_aggregate_metrics_df

In [None]:
# Start the aggregate table from the baseline rows, exposing the index as "Algorithm"
baseline_rows = baseline_aggregate_metrics_df.reset_index().rename(columns={"index":"Algorithm"})
metrics = baseline_rows.to_dict(orient="records")

In [None]:

# MAE, MSE and bias are computed over all pooled predictions of an algorithm;
# MASE is the mean of the per-household values.
for model in models_to_run:
    algo_preds = pred_df.loc[pred_df.Algorithm==model.name]
    actuals = algo_preds["energy_consumption"]
    forecasts = algo_preds["predictions"]
    household_mase = metrics_df.loc[metrics_df.Algorithm==model.name, "MASE"]
    aggregate_row = {
        "Algorithm": model.name,
        "MAE": ts_utils.mae(actuals, forecasts),
        "MSE": ts_utils.mse(actuals, forecasts),
        "meanMASE": household_mase.mean(),
        "Forecast Bias": ts_utils.forecast_bias_aggregate(actuals, forecasts),
    }
    metrics.append(aggregate_row)

In [None]:
agg_metrics_df = pd.DataFrame(metrics)
# Display formats per metric column; the bias is shown as a percentage
agg_formats = {
    "MAE": "{:.3f}",
    "MSE": "{:.3f}",
    "meanMASE": "{:.3f}",
    "Forecast Bias": "{:.2f}%",
}
# Best error metrics are the column minima; the bias uses highlight_abs_min
(
    agg_metrics_df.style
    .format(agg_formats)
    .highlight_min(color='lightgreen', subset=["MAE","MSE","meanMASE"])
    .apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['Forecast Bias'])
)

In [None]:
# Distribution of the per-household MASE, one overlaid histogram and box plot per algorithm
hist_kwargs = dict(
    color="Algorithm",
    pattern_shape="Algorithm",
    marginal="box",
    barmode="overlay",
    histnorm="probability density",
)
fig = px.histogram(metrics_df, x="MASE", nbins=500, **hist_kwargs)
fig = format_plot(fig, xlabel="MASE", ylabel="Probability Density", title="Distribution of MASE in the dataset")
# Restrict the visible x range before exporting
fig.update_layout(xaxis_range=[0,2.5])
fig.write_image("imgs/chapter_7/mase_dist.png")
fig.show()

In [None]:
# Distribution of the per-household MAE, one overlaid histogram and box plot per algorithm
fig = px.histogram(metrics_df, 
                   x="MAE", 
                   color="Algorithm",
                   pattern_shape="Algorithm", 
                   marginal="box", 
                   nbins=100, 
                   barmode="overlay",
                   histnorm="probability density")
fig = format_plot(fig, xlabel="MAE", ylabel="Probability Density", title="Distribution of MAE in the dataset")
# Apply the x range before exporting so the saved image matches what is shown,
# in line with the other distribution plots.
fig.update_layout(xaxis_range=[0,0.4])
fig.write_image("imgs/chapter_7/mae_dist.png")
fig.show()

In [None]:
# Distribution of the per-household MSE, one overlaid histogram and box plot per algorithm
hist_kwargs = dict(
    color="Algorithm",
    pattern_shape="Algorithm",
    marginal="box",
    barmode="overlay",
    histnorm="probability density",
)
fig = px.histogram(metrics_df, x="MSE", nbins=500, **hist_kwargs)
fig = format_plot(fig, xlabel="MSE", ylabel="Probability Density", title="Distribution of MSE in the dataset")
# Restrict the visible x range before exporting
fig.update_layout(xaxis_range=[0,0.3])
fig.write_image("imgs/chapter_7/mse_dist.png")
fig.show()

In [None]:
# Distribution of the per-household forecast bias, one overlaid histogram and box plot per algorithm
hist_kwargs = dict(
    color="Algorithm",
    pattern_shape="Algorithm",
    marginal="box",
    barmode="overlay",
    histnorm="probability density",
)
fig = px.histogram(metrics_df, x="Forecast Bias", nbins=250, **hist_kwargs)
fig = format_plot(fig, xlabel="Forecast Bias", ylabel="Probability Density", title="Distribution of Forecast Bias in the dataset")
# Restrict the visible x range before exporting
fig.update_layout(xaxis_range=[-50,30])
fig.write_image("imgs/chapter_7/bias_dist.png")
fig.show()

# Saving the ML Forecasts and Metrics

In [None]:
# Make sure the output folder exists before anything is written to it
output = Path("data/london_smart_meters/output")
output.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the predictions, the per-household metrics and the aggregate metrics
for frame, filename in (
    (pred_df, "ml_single_step_prediction_auto_stationary_val_df.pkl"),
    (metrics_df, "ml_single_step_metrics_auto_stationary_val_df.pkl"),
    (agg_metrics_df, "ml_single_step_aggregate_metrics_auto_stationary_val.pkl"),
):
    frame.to_pickle(output/filename)