In [1]:
%cd ../..

c:\Users\tacke\OneDrive\Documents\GitHub\Modern-Time-Series-Forecasting-with-Python-2E-1


In [2]:
import os
import time

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"

import warnings
from pathlib import Path

import humanize

from sklearn.preprocessing import StandardScaler
from src.forecasting.ml_forecasting import (
    FeatureConfig,
    MissingValueConfig,
    MLForecast,
    ModelConfig,
    calculate_metrics,
)
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.ts_utils import  forecast_bias, metrics_adapter, mae, mse, mase
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML

# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [3]:
os.makedirs("imgs/chapter_8", exist_ok=True)
preprocessed = Path("data/london_smart_meters/preprocessed")
output = Path("data/london_smart_meters/output")

In [4]:
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title_text=title,
        title={"x": 0.5, "xanchor": "center", "yanchor": "top"},
        titlefont={"size": 20},
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title_text=ylabel,
            titlefont=dict(size=font_size),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title_text=xlabel,
            titlefont=dict(size=font_size),
            tickfont=dict(size=font_size),
        )
    )
    return fig

In [5]:
#Readin the missing value imputed and train test split data
try:
    train_df = pd.read_parquet(preprocessed/"selected_blocks_train_missing_imputed_feature_engg.parquet")
    # Read in the Validation dataset as test_df so that we predict on it
    test_df = pd.read_parquet(preprocessed/"selected_blocks_val_missing_imputed_feature_engg.parquet")
    # test_df = pd.read_parquet(preprocessed/"block_0-7_test_missing_imputed_feature_engg.parquet")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """))

### Loading the single step backtesting baselines for validation

In [6]:
#Readin the missing value imputed and train test split data
try:
    baseline_metrics_df = pd.read_pickle(output/"single_step_backtesting_baseline_metrics_val_df.pkl")
    baseline_aggregate_metrics_df = pd.read_pickle(output/"single_step_backtesting_baseline_aggregate_metrics_val.pkl")
    # baseline_metrics_test_df = pd.read_pickle(output/"single_step_backtesting_baseline_metrics_test_df.pkl")
    # baseline_aggregate_metrics_test_df = pd.read_pickle(output/"single_step_backtesting_baseline_aggregate_metrics_test.pkl")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 00-Single Step Backtesting Baselines.ipynb in Chapter08
    </div>
    """))


In [7]:
len(train_df.LCLid.unique())

150

In [8]:
train_df.columns

Index(['timestamp', 'LCLid', 'energy_consumption', 'frequency',
       'series_length', 'stdorToU', 'Acorn', 'Acorn_grouped', 'file',
       'holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
       'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
       'humidity', 'summary', 'energy_consumption_lag_1',
       'energy_consumption_lag_2', 'energy_consumption_lag_3',
       'energy_consumption_lag_4', 'energy_consumption_lag_5',
       'energy_consumption_lag_46', 'energy_consumption_lag_47',
       'energy_consumption_lag_48', 'energy_consumption_lag_49',
       'energy_consumption_lag_50', 'energy_consumption_lag_334',
       'energy_consumption_lag_335', 'energy_consumption_lag_336',
       'energy_consumption_lag_337', 'energy_consumption_lag_338',
       'energy_consumption_rolling_3_mean', 'energy_consumption_rolling_3_std',
       'energy_consumption_rolling_6_mean', 'energy_consumption_rolling_6_std',
       'energy_consumption_rolling_12_me

# Feature Definition

In [9]:
feat_config = FeatureConfig(
    date="timestamp",
    target="energy_consumption",
    continuous_features=[
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "humidity",
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        "energy_consumption_ewma_span_2880",
        "energy_consumption_ewma_span_336",
        "energy_consumption_ewma_span_48",
        "timestamp_Elapsed",
        "timestamp_Month_sin_1",
        "timestamp_Month_sin_2",
        "timestamp_Month_sin_3",
        "timestamp_Month_sin_4",
        "timestamp_Month_sin_5",
        "timestamp_Month_cos_1",
        "timestamp_Month_cos_2",
        "timestamp_Month_cos_3",
        "timestamp_Month_cos_4",
        "timestamp_Month_cos_5",
        "timestamp_Hour_sin_1",
        "timestamp_Hour_sin_2",
        "timestamp_Hour_sin_3",
        "timestamp_Hour_sin_4",
        "timestamp_Hour_sin_5",
        "timestamp_Hour_cos_1",
        "timestamp_Hour_cos_2",
        "timestamp_Hour_cos_3",
        "timestamp_Hour_cos_4",
        "timestamp_Hour_cos_5",
        "timestamp_Minute_sin_1",
        "timestamp_Minute_sin_2",
        "timestamp_Minute_sin_3",
        "timestamp_Minute_sin_4",
        "timestamp_Minute_sin_5",
        "timestamp_Minute_cos_1",
        "timestamp_Minute_cos_2",
        "timestamp_Minute_cos_3",
        "timestamp_Minute_cos_4",
        "timestamp_Minute_cos_5",
    ],
    categorical_features=[
        "holidays",
        "precipType",
        "icon",
        "summary",
        "timestamp_Month",
        "timestamp_Quarter",
        "timestamp_WeekDay",
        "timestamp_Dayofweek",
        "timestamp_Dayofyear",
        "timestamp_Hour",
        "timestamp_Minute",
    ],
    boolean_features=[
        "timestamp_Is_quarter_end",
        "timestamp_Is_quarter_start",
        "timestamp_Is_year_end",
        "timestamp_Is_year_start",
        "timestamp_Is_month_start",
    ],
    index_cols=["timestamp"],
    exogenous_features=[
        "holidays",
        "precipType",
        "icon",
        "summary",
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "humidity",
    ],
)

# Sample Household

In [10]:
sample_train_df = train_df.loc[train_df.LCLid == "MAC000193", :]
sample_test_df = test_df.loc[test_df.LCLid == "MAC000193", :]
train_features, train_target, train_original_target = feat_config.get_X_y(
    sample_train_df, categorical=False, exogenous=False
)
# Loading the Validation as test
test_features, test_target, test_original_target = feat_config.get_X_y(
    sample_test_df, categorical=False, exogenous=False
)
del sample_train_df, sample_test_df

# Missing Value Handling

## Null check

In [11]:
nc = train_features.isnull().sum()
nc[nc>0]

energy_consumption_rolling_12_std                   12
energy_consumption_lag_338                         338
energy_consumption_lag_49                           49
energy_consumption_336_seasonal_rolling_3_std     1008
energy_consumption_lag_46                           46
energy_consumption_lag_335                         335
energy_consumption_rolling_3_mean                    3
energy_consumption_rolling_6_mean                    6
energy_consumption_lag_337                         337
energy_consumption_rolling_3_std                     3
energy_consumption_rolling_48_mean                  48
energy_consumption_lag_48                           48
energy_consumption_lag_5                             5
energy_consumption_lag_4                             4
energy_consumption_48_seasonal_rolling_3_mean      144
energy_consumption_lag_2                             2
energy_consumption_rolling_48_std                   48
energy_consumption_lag_47                           47
energy_con

In [12]:
nc = test_features.isnull().sum()
nc[nc>0]

Series([], dtype: int64)

In [13]:
missing_value_config = MissingValueConfig(
    bfill_columns=[
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        "energy_consumption_ewma__span_2880",
        "energy_consumption_ewma__span_336",
        "energy_consumption_ewma__span_48",
    ],
    ffill_columns=[],
    zero_fill_columns=[],
)

# Running ML models on a Sample household

In [14]:
pred_df = pd.concat([train_target, test_target])
metric_record = []

In [15]:
metric_record += (
    baseline_metrics_df.loc[baseline_metrics_df.LCLid == "MAC000193"]
    .drop(columns="LCLid")
    .to_dict(orient="records")
)

In [16]:
metric_record

[{'Algorithm': 'Naive',
  'forecast_bias': -0.02605566382408142,
  'mae': 0.17533333599567413,
  'mase': 1.3463338613510132,
  'mse': 0.10495550185441971,
  'rmse': 0.3239683508872986},
 {'Algorithm': 'SeasonalNaive',
  'forecast_bias': -4.796060562133789,
  'mae': 0.2376619577407837,
  'mase': 1.824937343597412,
  'mse': 0.17094843089580536,
  'rmse': 0.41345909237861633}]

In [17]:
# from typing import Optional, Tuple, Union, Sequence, Callable, cast
# from pandas.api.types import is_datetime64_any_dtype as is_datetime
# def is_datetime_dtypes(x):
#     return is_datetime(x)

# def cast_to_series(df):
#     is_pd_dataframe = isinstance(df, pd.DataFrame)    
#     if is_pd_dataframe: 
#         if df.shape[1]==1:
#             df = df.squeeze()
#         else:
#             raise ValueError("Dataframes with more than one columns cannot be converted to pd.Series")
#     return df

# def metrics_adapter(metric_func, actual_series,
#         pred_series,
#         insample = None,
#         m: Optional[int] = 1,
#         intersect: bool = True,
#         reduction: Callable[[np.ndarray], float] = np.mean,
#         inter_reduction: Callable[[np.ndarray], Union[float, np.ndarray]] = lambda x: x,
#         n_jobs: int = 1,
#         verbose: bool = False):
    
#     actual_series, pred_series = cast_to_series(actual_series), cast_to_series(pred_series)
#     if insample is not None:
#         insample = cast_to_series(insample)
#     assert type(actual_series) is type(pred_series), f"actual_series({type(actual_series)}) and pred_series({type(pred_series)}) should be of same type."
#     if insample is not None:
#         assert type(actual_series) is type(insample), "actual_series and insample should be of same type."
#     is_nd_array = isinstance(actual_series, np.ndarray)
#     is_pd_series = isinstance(actual_series, pd.Series)
    
#     if is_pd_series:
#         is_datetime_index = is_datetime_dtypes(actual_series.index) and is_datetime_dtypes(pred_series.index)
#         if insample is not None:
#             is_datetime_index = is_datetime_index and is_datetime_dtypes(insample.index)
#     else:
#         is_datetime_index = False
#     if metric_func.__name__ == "mase":
#         if not is_datetime_index:
#             raise ValueError("MASE needs pandas Series with datetime index as inputs")
    
#     # if is_nd_array or (is_pd_series and not is_datetime_index):
#     #     actual_series, pred_series = TimeSeries.from_values(actual_series.values if is_pd_series else actual_series), TimeSeries.from_values(pred_series.values if is_pd_series else pred_series)
#     #     if insample is not None:
#     #         insample = TimeSeries.from_values(insample.values if is_pd_series else insample)

#     # elif is_pd_series and is_datetime_index:
#     #     actual_series, pred_series = TimeSeries.from_series(actual_series), TimeSeries.from_series(pred_series)
#     #     if insample is not None:
#     #         insample = TimeSeries.from_series(insample)
#     # else:
#     #     raise ValueError()
#     if metric_func.__name__ == "mase":
#         #return metric_func(actual_series=actual_series, pred_series=pred_series, insample=insample, m=m, intersect=intersect, reduction=reduction, inter_reduction=inter_reduction, n_jobs=n_jobs, verbose=verbose)
#         return metric_func(actual_series, pred_series, insample)

#     else:
#         #return metric_func(actual_series=actual_series, pred_series=pred_series, intersect=intersect, reduction=reduction, inter_reduction=inter_reduction, n_jobs=n_jobs, verbose=verbose)
#         return metric_func(actual_series, pred_series)


# def calculate_metrics(
#     y: pd.Series, y_pred: pd.Series, name: str, y_train: pd.Series = None
# ):
#     """Method to calculate the metrics given the actual and predicted series

#     Args:
#         y (pd.Series): Actual target with datetime index
#         y_pred (pd.Series): Predictions with datetime index
#         name (str): Name or identification for the model
#         y_train (pd.Series, optional): Actual train target to calculate MASE with datetime index. Defaults to None.

#     Returns:
#         Dict: Dictionary with MAE, MSE, MASE, and Forecast Bias
#     """
#     return {
#         "Algorithm": name,
#         "MAE": darts_metrics_adapter(mae, actual_series=y, pred_series=y_pred),
#         "MSE": darts_metrics_adapter(mse, actual_series=y, pred_series=y_pred),
#         "MASE": darts_metrics_adapter(
#             mase, actual_series=y, pred_series=y_pred, insample=y_train
#         )
#         if y_train is not None
#         else None,
#         "Forecast Bias": darts_metrics_adapter(
#             forecast_bias, actual_series=y, pred_series=y_pred
#         )
        
#     }

# def mae(actuals, predictions):
#     return np.nanmean(np.abs(actuals-predictions))

# def mse(actuals, predictions):
#     return np.nanmean(np.power(actuals-predictions, 2))

# def mase(actuals, predictions, insample):
#     """
#     Calculate the Mean Absolute Scaled Error (MASE).
    
#     Parameters:
#     actuals : np.ndarray
#         Actual observed values corresponding to the predictions.
#     predictions : np.ndarray
#         Predicted values.
#     insample : np.ndarray
#         In-sample data to calculate the scaling factor based on a naive forecast.

#     Returns:
#     float
#         The MASE metric.
#     """
#     # Calculate MAE of predictions
#     mae_predictions = np.nanmean(np.abs(actuals - predictions))
    
#     # Shift the insample data to create a simple naive forecast
#     naive_forecast = np.roll(insample, 1)
#     # Assuming the first element is not a valid forecast
#     naive_forecast[0] = np.nan 
    
#     # Calculate MAE of the naive forecast
#     mae_naive = np.nanmean(np.abs(insample - naive_forecast))
    
#     # Calculate MASE
#     mase_value = mae_predictions / mae_naive
#     return mase_value


# def _remove_nan_union(array_a: np.ndarray,
#                       array_b: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
#     """
#     Returns the two inputs arrays where all elements are deleted that have an index that corresponds to
#     a NaN value in either of the two input arrays.
#     """

#     isnan_mask = np.logical_or(np.isnan(array_a), np.isnan(array_b))
#     return np.delete(array_a, isnan_mask), np.delete(array_b, isnan_mask)

# def forecast_bias(actual_series: Union[ np.ndarray],
#         pred_series: Union[ np.ndarray],
#         intersect: bool = True,
#         *,
#         reduction: Callable[[np.ndarray], float] = np.mean,
#         inter_reduction: Callable[[np.ndarray], Union[float, np.ndarray]] = lambda x: x,
#         n_jobs: int = 1,
#         verbose: bool = False) -> Union[float, np.ndarray]:
#     """ Forecast Bias (FB).

#     Given a time series of actual values :math:`y_t` and a time series of predicted values :math:`\\hat{y}_t`
#     both of length :math:`T`, it is a percentage value computed as

#     .. math:: 100 \\cdot \\frac{\\sum_{t=1}^{T}{y_t}
#               - \\sum_{t=1}^{T}{\\hat{y}_t}}{\\sum_{t=1}^{T}{y_t}}.

#     If any of the series is stochastic (containing several samples), the median sample value is considered.

#     Parameters
#     ----------
#     actual_series
#         The `TimeSeries` or `Sequence[TimeSeries]` of actual values.
#     pred_series
#         The `TimeSeries` or `Sequence[TimeSeries]` of predicted values.
#     intersect
#         For time series that are overlapping in time without having the same time index, setting `intersect=True`
#         will consider the values only over their common time interval (intersection in time).
#     reduction
#         Function taking as input a `np.ndarray` and returning a scalar value. This function is used to aggregate
#         the metrics of different components in case of multivariate `TimeSeries` instances.
#     inter_reduction
#         Function taking as input a `np.ndarray` and returning either a scalar value or a `np.ndarray`.
#         This function can be used to aggregate the metrics of different series in case the metric is evaluated on a
#         `Sequence[TimeSeries]`. Defaults to the identity function, which returns the pairwise metrics for each pair
#         of `TimeSeries` received in input. Example: `inter_reduction=np.mean`, will return the average of the pairwise
#         metrics.
#     n_jobs
#         The number of jobs to run in parallel. Parallel jobs are created only when a `Sequence[TimeSeries]` is
#         passed as input, parallelising operations regarding different `TimeSeries`. Defaults to `1`
#         (sequential). Setting the parameter to `-1` means using all the available processors.
#     verbose
#         Optionally, whether to print operations progress

#     Raises
#     ------
#     ValueError
#         If :math:`\\sum_{t=1}^{T}{y_t} = 0`.

#     Returns
#     -------
#     float
#         The Forecast Bias (OPE)
#     """
#     assert type(actual_series) is type(pred_series), "actual_series and pred_series should be of same type."
#     if isinstance(actual_series, np.ndarray):
#         y_true, y_pred = actual_series, pred_series
#     else:
#         y_true = actual_series
#         y_pred = pred_series
#     #     y_true, y_pred = _get_values_or_raise(actual_series, pred_series, intersect)
#     #y_true, y_pred = _remove_nan_union(y_true, y_pred)
#     y_true_sum, y_pred_sum = np.sum(y_true), np.sum(y_pred)
#     # raise_if_not(y_true_sum > 0, 'The series of actual value cannot sum to zero when computing OPE.', logger)
#     return ((y_true_sum - y_pred_sum) / y_true_sum) * 100.

In [18]:
def evaluate_model(
    model_config,
    feature_config,
    missing_config,
    train_features,
    train_target,
    test_features,
    test_target,
):
    ml_model = MLForecast(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
    )
    ml_model.fit(train_features, train_target)
    y_pred = ml_model.predict(test_features)
    feat_df = ml_model.feature_importance()
    metrics = calculate_metrics(test_target, y_pred, model_config.name, train_target)
    return y_pred,  metrics, feat_df 


from itertools import cycle


def plot_forecast(pred_df, forecast_columns, forecast_display_names=None):
    if forecast_display_names is None:
        forecast_display_names = forecast_columns
    else:
        assert len(forecast_columns) == len(forecast_display_names)
    mask = ~pred_df[forecast_columns[0]].isnull()
    colors = [
        "rgba(" + ",".join([str(c) for c in plotting_utils.hex_to_rgb(c)]) + ",<alpha>)"
        for c in px.colors.qualitative.Plotly
    ]
    act_color = colors[0]
    colors = cycle(colors[1:])
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=pred_df[mask].index,
            y=pred_df[mask].energy_consumption,
            mode="lines",
            line=dict(color=act_color.replace("<alpha>", "0.9")),
            name="Actual Consumption",
        )
    )
    for col, display_col in zip(forecast_columns, forecast_display_names):
        fig.add_trace(
            go.Scatter(
                x=pred_df[mask].index,
                y=pred_df.loc[mask, col],
                mode="lines",
                line=dict(dash="dot", color=next(colors).replace("<alpha>", "1")),
                name=display_col,
            )
        )
    return fig

def highlight_abs_min(s, props=''):
    return np.where(s == np.nanmin(np.abs(s.values)), props, '')

## Linear Models

In [19]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV

### Linear Regression

In [20]:
model_config = ModelConfig(
    model=LinearRegression(),
    name="Linear Regression",
    # LinearRegression is sensitive to normalized data
    normalize=True,
    # LinearRegression cannot handle missing values
    fill_missing=True,
)
with LogTime() as timer:
    y_pred, metrics, feat_df, = evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)

  df[bfill_columns] = df[bfill_columns].fillna(method="bfill")
  df[ffill_columns] = df[ffill_columns].fillna(method="ffill")
  df[bfill_columns] = df[bfill_columns].fillna(method="bfill")
  df[ffill_columns] = df[ffill_columns].fillna(method="ffill")


Time Elapsed: 0 microseconds


In [21]:
metrics

{'Algorithm': 'Linear Regression',
 'MAE': 0.15949975535573557,
 'MSE': 0.07476760428404579,
 'MASE': 1.2430092668685497,
 'Forecast Bias': 6.05632956526751,
 'Time Elapsed': 0.2116086483001709}

In [22]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter8/lin_reg.png")
fig.show()

In [23]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/lin_reg_fimp.png")
fig.show()

### Ridge Regression (L2)

In [24]:
model_config = ModelConfig(
    model=RidgeCV(), 
    name="Ridge Regression", 
    # RidgeCV is sensitive to normalized data
    normalize=True, 
    # RidgeCV does not handle missing values
    fill_missing=True
)
with LogTime() as timer:
    y_pred,   metrics, feat_df,= evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Time Elapsed: 0 microseconds



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [25]:
metrics

{'Algorithm': 'Ridge Regression',
 'MAE': 0.1594904920376182,
 'MSE': 0.0747586851010404,
 'MASE': 1.2429370762232652,
 'Forecast Bias': 6.042397947745777,
 'Time Elapsed': 0.2828552722930908}

In [26]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_8/ridge_reg.png")
fig.show()

In [27]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/ridge_reg_fimp.png")
fig.show()

### Lasso Regression (L1)

In [28]:
model_config = ModelConfig(
    model=LassoCV(), 
    name="Lasso Regression", 
    # LassoCV is sensitive to normalized data
    normalize=True, 
    # LassoCV does not handle missing values
    fill_missing=True
)
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Time Elapsed: 1 second



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [29]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_8/lasso_reg.png")
fig.show()

In [30]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/lasso_reg_fimp.png")
fig.show()

## Decision Tree

In [31]:
from sklearn.tree import DecisionTreeRegressor

In [32]:
model_config = ModelConfig(
    model=DecisionTreeRegressor(max_depth=4, random_state=42),
    name="Decision Tree",
    # Decision Tree is not affected by normalization
    normalize=False,
    # Decision Tree in scikit-learn does not handle missing values
    fill_missing=True,
)
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Time Elapsed: 0 microseconds



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [33]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_8/dtree.png")
fig.show()

In [34]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/dtree_fimp.png")
fig.show()

## Bagging and Boosting Trees

### Random Forest

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
model_config = ModelConfig(
    model=RandomForestRegressor(random_state=42, max_depth=4),
    name="Random Forest",
    # RandomForest is not affected by normalization
    normalize=False,
    # RandomForest in scikit-learn does not handle missing values
    fill_missing=True,
)
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Time Elapsed: 31 seconds



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [37]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_8/rf.png")
fig.show()

In [38]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/rf_fimp.png")
fig.show()

### XGBoost Random Forest

In [39]:
from xgboost import XGBRFRegressor

In [40]:
model_config = ModelConfig(
    model=XGBRFRegressor(random_state=42, max_depth=4),
    name="XGB Random Forest",
    # XGBRF is not affected by normalization
    normalize=False,
    # XGBRF handles missing values
    fill_missing=False,
)
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)

Time Elapsed: 0 microseconds


In [41]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_8/xgbrf.png")
fig.show()

In [42]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/xgbrf_fimp.png")
fig.show()

### LightGBM

In [43]:
from lightgbm import LGBMRegressor

In [44]:
model_config = ModelConfig(
    model=LGBMRegressor(random_state=42),
    name="LightGBM",
    # LightGBM is not affected by normalization
    normalize=False,
    # LightGBM handles missing values
    fill_missing=False,
)
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config,
        feat_config,
        missing_value_config,
        train_features,
        train_target,
        test_features,
        test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
pred_df = pred_df.join(y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 59
[LightGBM] [Info] Start training from score 0.233927
Time Elapsed: 0 microseconds


In [45]:
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_8/lgbm.png")
fig.show()

In [46]:
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
fig.write_image("imgs/chapter_8/lgbm_fimp.png")
fig.show()

## Summary

In [47]:
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", 
                          "MSE": "{:.4f}", 
                          "MASE": "{:.4f}", 
                          "Forecast Bias": "{:.2f}%"})
formatted.highlight_min(color='lightgreen', subset=["MAE","MSE","MASE"]).apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['Forecast Bias'])

Unnamed: 0,Algorithm,forecast_bias,mae,mase,mse,rmse,MAE,MSE,MASE,Forecast Bias,Time Elapsed
0,Naive,-0.026056,0.175333,1.346334,0.104956,0.323968,,,,nan%,
1,SeasonalNaive,-4.796061,0.237662,1.824937,0.170948,0.413459,,,,nan%,
2,Linear Regression,,,,,,0.1595,0.0748,1.243,6.06%,0.211609
3,Ridge Regression,,,,,,0.1595,0.0748,1.2429,6.04%,0.282855
4,Lasso Regression,,,,,,0.1598,0.0743,1.2452,3.77%,1.94179
5,Decision Tree,,,,,,0.1682,0.085,1.3111,9.99%,0.528131
6,Random Forest,,,,,,0.1657,0.082,1.2913,7.79%,31.011072
7,XGB Random Forest,,,,,,0.1641,0.0814,1.2786,8.97%,0.754737
8,LightGBM,,,,,,0.1489,0.0691,1.1601,3.86%,0.807203


# Running ML Forecast for all consumers

Running Lasso Regression, XGB Random Forest, and LightGBM

In [48]:
lcl_ids = sorted(train_df.LCLid.unique())
models_to_run = [
    ModelConfig(
        model=LassoCV(), name="Lasso Regression", normalize=True, fill_missing=True
    ),
    ModelConfig(
        model=XGBRFRegressor(random_state=42, max_depth=4),
        name="XGB Random Forest",
        normalize=False,
        fill_missing=False,
    ),
    ModelConfig(
        model=LGBMRegressor(random_state=42),
        name="LightGBM",
        normalize=False,
        fill_missing=False,
    ),
]

In [49]:
all_preds = []
all_metrics = []

# We can parallelize this loop to run this faster
with LogTime() as timer:
    for lcl_id in tqdm(lcl_ids):
        for model_config in models_to_run:
            model_config = model_config.clone()
            X_train, y_train, _ = feat_config.get_X_y(
                train_df.loc[train_df.LCLid == lcl_id, :],
                categorical=False,
                exogenous=False,
            )
            X_test, y_test, _ = feat_config.get_X_y(
                test_df.loc[test_df.LCLid == lcl_id, :], categorical=False, exogenous=False
            )
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                y_pred, metrics, feat_df = evaluate_model(
                    model_config,
                    feat_config,
                    missing_value_config,
                    X_train,
                    y_train,
                    X_test,
                    y_test,
                )
            y_pred.name = "predictions"
            y_pred = y_pred.to_frame()
            y_pred["LCLid"] = lcl_id
            y_pred["Algorithm"] = model_config.name
            metrics["LCLid"] = lcl_id
            metrics["Algorithm"] = model_config.name
            y_pred["energy_consumption"] = y_test.values
            all_preds.append(y_pred)
            all_metrics.append(metrics)

  0%|          | 0/150 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 59
[LightGBM] [Info] Start training from score 0.123399
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 59
[LightGBM] [Info] Start training from score 0.240939
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8115
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 59
[LightGBM] [Info] Start tra

In [50]:
pred_df = pd.concat(all_preds)
pred_df.head()

Unnamed: 0_level_0,predictions,LCLid,Algorithm,energy_consumption
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01 00:00:00,0.131003,MAC000061,Lasso Regression,0.165
2014-01-01 00:30:00,0.114578,MAC000061,Lasso Regression,0.167
2014-01-01 01:00:00,0.121943,MAC000061,Lasso Regression,0.15
2014-01-01 01:30:00,0.112456,MAC000061,Lasso Regression,0.091
2014-01-01 02:00:00,0.07351,MAC000061,Lasso Regression,0.047


In [51]:
metrics_df = pd.DataFrame(all_metrics)
metrics_df.head()

Unnamed: 0,Algorithm,MAE,MSE,MASE,Forecast Bias,LCLid
0,Lasso Regression,0.033292,0.002745,0.952741,-2.729626,MAC000061
1,XGB Random Forest,0.031758,0.002715,0.908853,-0.786298,MAC000061
2,LightGBM,0.030601,0.002571,0.875727,-1.807954,MAC000061
3,Lasso Regression,0.069267,0.02677,0.919993,1.70956,MAC000062
4,XGB Random Forest,0.067982,0.027023,0.902927,4.041635,MAC000062


# Evaluation of ML Forecast

In [52]:
from src.utils import ts_utils

In [53]:
baseline_aggregate_metrics_df

Unnamed: 0,MAE,MSE,meanMASE,Forecast Bias
Naive,0.088162,0.044981,1.089973,-0.002962
Seasonal Naive,0.129197,0.077654,1.581947,-1.001533


In [54]:
metrics = baseline_aggregate_metrics_df.reset_index().rename(columns={"index":"Algorithm"}).to_dict(orient="records")

In [55]:

for model_config in models_to_run:
    pred_mask = pred_df.Algorithm==model_config.name
    metric_mask = metrics_df.Algorithm==model_config.name
    metrics.append({
    "Algorithm": model_config.name,
    "MAE": ts_utils.mae(pred_df.loc[pred_mask,"energy_consumption"], pred_df.loc[pred_mask,"predictions"]),
    "MSE": ts_utils.mse(pred_df.loc[pred_mask,"energy_consumption"], pred_df.loc[pred_mask,"predictions"]),
    "meanMASE": metrics_df.loc[metric_mask, "MASE"].mean(),
    "Forecast Bias": ts_utils.forecast_bias_aggregate(pred_df.loc[pred_mask,"energy_consumption"], pred_df.loc[pred_mask,"predictions"])
})

In [56]:
agg_metrics_df = pd.DataFrame(metrics)
agg_metrics_df.style.format({"MAE": "{:.4f}", 
                          "MSE": "{:.4f}", 
                          "meanMASE": "{:.4f}", 
                          "Forecast Bias": "{:.2f}%"}).highlight_min(color='lightgreen', subset=["MAE","MSE","meanMASE"]).apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['Forecast Bias'])

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,Naive,0.0882,0.045,1.09,-0.00%
1,Seasonal Naive,0.1292,0.0777,1.5819,-1.00%
2,Lasso Regression,0.0801,0.0271,1.0051,-0.30%
3,XGB Random Forest,0.0806,0.0306,1.0162,-2.48%
4,LightGBM,0.0771,0.0275,0.9774,0.03%


In [57]:
fig = px.histogram(metrics_df, 
                   x="MASE", 
                   color="Algorithm",
                   pattern_shape="Algorithm", 
                   marginal="box", 
                   nbins=500, 
                   barmode="overlay",
                   histnorm="probability density")
fig = format_plot(fig, xlabel="MASE", ylabel="Probability Density", title="Distribution of MASE in the dataset")
fig.update_layout(xaxis_range=[0,2.5])
fig.write_image("imgs/chapter_8/mase_dist.png")
fig.show()

In [58]:
fig = px.histogram(metrics_df, 
                   x="MAE", 
                   color="Algorithm",
                   pattern_shape="Algorithm", 
                   marginal="box", 
                   nbins=100, 
                   barmode="overlay",
                   histnorm="probability density")
fig = format_plot(fig, xlabel="MAE", ylabel="Probability Density", title="Distribution of MAE in the dataset")
fig.write_image("imgs/chapter_8/mae_dist.png")
fig.update_layout(xaxis_range=[0,0.4])
fig.show()

In [59]:
fig = px.histogram(metrics_df, 
                   x="MSE", 
                   color="Algorithm",
                   pattern_shape="Algorithm", 
                   marginal="box", 
                   nbins=500, 
                   barmode="overlay",
                   histnorm="probability density")
fig = format_plot(fig, xlabel="MSE", ylabel="Probability Density", title="Distribution of MSE in the dataset")
fig.update_layout(xaxis_range=[0,0.3])
fig.write_image("imgs/chapter_8/mse_dist.png")
fig.show()

In [60]:
fig = px.histogram(metrics_df, 
                   x="Forecast Bias", 
                   color="Algorithm",
                   pattern_shape="Algorithm", 
                   marginal="box", 
                   nbins=250,
                   barmode="overlay",
                   histnorm="probability density")
fig = format_plot(fig, xlabel="Forecast Bias", ylabel="Probability Density", title="Distribution of Forecast Bias in the dataset")
fig.update_layout(xaxis_range=[-50,30])
fig.write_image("imgs/chapter_8/bias_dist.png")
fig.show()

# Saving the Baseline Forecasts and Metrics

In [61]:
os.makedirs("data/london_smart_meters/output", exist_ok=True)
output = Path("data/london_smart_meters/output")

In [62]:
pred_df.to_pickle(output/"ml_single_step_prediction_val_df.pkl")
metrics_df.to_pickle(output/"ml_single_step_metrics_val_df.pkl")
agg_metrics_df.to_pickle(output/"ml_single_step_aggregate_metrics_val.pkl")

# Bonus: Using Exogenous Variables

We can run LightGBM, which was our best performing algorithm with exogenous variables

In [63]:
lcl_ids = sorted(train_df.LCLid.unique())
models_to_run = [
    ModelConfig(model = LGBMRegressor(random_state=42), name="LightGBM", normalize=False, fill_missing=False)
]

In [64]:
from sklearn.exceptions import DataConversionWarning

In [65]:
all_preds = []
all_metrics = []
#We can parallelize this loop to run this faster
for lcl_id in tqdm(lcl_ids):
    for model_config in models_to_run:
        model_config = model_config.clone()
        X_train, y_train, _ = feat_config.get_X_y(train_df.loc[train_df.LCLid==lcl_id,:], categorical=False, exogenous=True)
        X_test, y_test, _ = feat_config.get_X_y(test_df.loc[test_df.LCLid==lcl_id,:], categorical=False, exogenous=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # warnings.filterwarnings("ignore",category=DataConversionWarning)
            y_pred, metrics, feat_df = evaluate_model(model_config, feat_config, missing_value_config, X_train, y_train, X_test, y_test)
        y_pred.name = "predictions"
        y_pred = y_pred.to_frame()
        y_pred['LCLid'] = lcl_id
        y_pred['Algorithm'] = model_config.name+"_w_exog"
        metrics["LCLid"] = lcl_id
        metrics["Algorithm"] = model_config.name+"_w_exog"
        y_pred['energy_consumption'] = y_test.values
        all_preds.append(y_pred)
        all_metrics.append(metrics)

  0%|          | 0/150 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9973
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 67
[LightGBM] [Info] Start training from score 0.123399
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9973
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 67
[LightGBM] [Info] Start training from score 0.240939
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9973
[LightGBM] [Info] Number of data points in the train set: 35088, number of used features: 67
[LightGBM] [Info] Start tra

In [66]:
pred_w_ex_df = pd.concat(all_preds)
pred_w_ex_df.head()

Unnamed: 0_level_0,predictions,LCLid,Algorithm,energy_consumption
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01 00:00:00,0.119395,MAC000061,LightGBM_w_exog,0.165
2014-01-01 00:30:00,0.092152,MAC000061,LightGBM_w_exog,0.167
2014-01-01 01:00:00,0.098416,MAC000061,LightGBM_w_exog,0.15
2014-01-01 01:30:00,0.079487,MAC000061,LightGBM_w_exog,0.091
2014-01-01 02:00:00,0.061628,MAC000061,LightGBM_w_exog,0.047


In [67]:
metrics_w_ex_df = pd.DataFrame(all_metrics)
metrics_w_ex_df.head()

Unnamed: 0,Algorithm,MAE,MSE,MASE,Forecast Bias,LCLid
0,LightGBM_w_exog,0.031001,0.002622,0.887196,-2.63658,MAC000061
1,LightGBM_w_exog,0.073679,0.026372,0.978597,-1.7942,MAC000062
2,LightGBM_w_exog,0.039359,0.009795,1.019155,0.618826,MAC000066
3,LightGBM_w_exog,0.104071,0.019422,1.210884,-0.73892,MAC000086
4,LightGBM_w_exog,0.064432,0.016172,0.976878,1.823406,MAC000126


## Evaluation of ML Forecast with Exogenous

In [68]:
from src.utils import ts_utils

In [69]:
metrics = baseline_aggregate_metrics_df.reset_index().rename(columns={"index":"Algorithm"}).to_dict(orient="records")

In [70]:
metrics.append(agg_metrics_df.iloc[4].to_dict())

In [71]:

for model_config in models_to_run:
    pred_mask = pred_w_ex_df.Algorithm==model_config.name+"_w_exog"
    metric_mask = metrics_w_ex_df.Algorithm==model_config.name+"_w_exog"
    metrics.append({
    "Algorithm": model_config.name+"_w_exog",
    "MAE": ts_utils.mae(pred_w_ex_df.loc[pred_mask,"energy_consumption"], pred_w_ex_df.loc[pred_mask,"predictions"]),
    "MSE": ts_utils.mse(pred_w_ex_df.loc[pred_mask,"energy_consumption"], pred_w_ex_df.loc[pred_mask,"predictions"]),
    "meanMASE": metrics_w_ex_df.loc[metric_mask, "MASE"].mean(),
    "Forecast Bias": ts_utils.forecast_bias_aggregate(pred_w_ex_df.loc[pred_mask,"energy_consumption"], pred_w_ex_df.loc[pred_mask,"predictions"])
})

In [72]:
agg_metrics_w_ex_df = pd.DataFrame(metrics)
agg_metrics_w_ex_df.style.format({"MAE": "{:.3f}", 
                          "MSE": "{:.3f}", 
                          "meanMASE": "{:.3f}", 
                          "Forecast Bias": "{:.2f}%"}).highlight_min(color='lightgreen', subset=["MAE","MSE","meanMASE"]).apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['Forecast Bias'])

Unnamed: 0,Algorithm,MAE,MSE,meanMASE,Forecast Bias
0,Naive,0.088,0.045,1.09,-0.00%
1,Seasonal Naive,0.129,0.078,1.582,-1.00%
2,LightGBM,0.077,0.027,0.977,0.03%
3,LightGBM_w_exog,0.077,0.028,0.978,0.04%
