This experiment focuses specifically on modelling **space heating** as the target variable.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from skforecast.model_selection import backtesting_forecaster
from skforecast.ForecasterAutoreg import ForecasterAutoreg

from custom_utils import *

import plotly
import plotly.graph_objects as go


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Fix NumPy's global random state so that stochastic steps are repeatable
# when the notebook is executed top to bottom on a fresh kernel.
SEED = 90
np.random.seed(SEED)

In [4]:
# Location of the raw CSV, relative to the notebook's working directory.
data_path = ".././data/"
file_name = "data.csv"

# Read the raw file and hand it straight to the project's preprocessing helper
# (first_preprocess comes from custom_utils), binding `data` only once.
data = first_preprocess(pd.read_csv(f"{data_path}{file_name}"))
data.head()

Unnamed: 0_level_0,space_heating,hot_water,sockets,lighting,bld_engcons,car_chargers,weekend,bank_holiday,hour,day_of_week,...,month,year,forecast_temperature,forecast_feelslike,forecast_weathertype,forecast_windspeed,forecast_uvindex,forecast_precipitationprobability,forecast_winddirection,forecast_visibility
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 00:00:00+00:00,3.425193,0.0,0.049433,0.154873,3.629499,0.0,0,1,0,5,...,1,2021,3.0,-1.0,2.0,11.0,0.0,0.0,0.0,5.0
2021-01-01 01:00:00+00:00,4.012907,0.0,0.047753,0.154846,4.215506,0.0,0,1,1,5,...,1,2021,3.0,-1.0,2.0,11.0,0.0,0.0,0.0,5.0
2021-01-01 02:00:00+00:00,5.342417,0.444782,0.051869,0.154997,5.994066,0.0,0,1,2,5,...,1,2021,3.0,-1.0,2.0,11.0,0.0,1.0,0.0,5.0
2021-01-01 03:00:00+00:00,5.756598,0.0,0.043028,0.154626,5.954252,0.0,0,1,3,5,...,1,2021,2.0,-2.0,0.0,11.0,0.0,1.0,337.5,5.0
2021-01-01 04:00:00+00:00,5.415492,0.0,0.050306,0.154498,5.620296,0.0,0,1,4,5,...,1,2021,2.0,-2.0,0.0,11.0,0.0,1.0,337.5,5.0


In [5]:
# Weather-forecast covariates.
exo_columns = [
    'forecast_temperature',
    'forecast_feelslike',
    'forecast_weathertype',
    'forecast_windspeed',
    'forecast_uvindex',
    'forecast_precipitationprobability',
    'forecast_winddirection',
    'forecast_visibility',
]
# Calendar covariates.
endo_columns = [
    'weekend',
    'bank_holiday',
    'hour',
    'day_of_week',
    'day_of_month',
    'month',
    'year',
]
target_column = ["y"]
column_name = ["space_heating"] + exo_columns + endo_columns + target_column
test_date = '2023-04-01'

# Build the modelling frame: "y" is the space-heating column shifted down by
# one row; rows containing NaN (including the first row, whose shifted value
# is missing) are dropped.
sh_data = data.assign(y=data[column_name[0]].shift(1)).dropna(axis=0)

# Standardise the feature columns and the target separately.
# NOTE(review): create_std_scaler is defined in custom_utils; it appears to
# return (scaler, transformed frame) -- confirm there. The feature call passes
# a list of column names, the target call passes the bare string "y".
sh_data_scaler, sh_transformed_data = create_std_scaler(sh_data, column_name[:-1])
sh_target_scaler, sh_transformed_target_data = create_std_scaler(sh_data, column_name[-1])

# Re-attach the scaled target to the scaled features, aligning on the index.
sh_transformed_data = sh_transformed_data.merge(
    sh_transformed_target_data,
    left_index=True,
    right_index=True,
)

# Chronological split for space heating: rows before `test_date` are used for
# training, rows on or after it for testing.
before_cutoff = sh_transformed_data.index < test_date
from_cutoff = sh_transformed_data.index >= test_date
sh_train_data = sh_transformed_data.loc[before_cutoff, column_name]
sh_test_data = sh_transformed_data.loc[from_cutoff, column_name]
sh_train_data.shape, sh_test_data.shape

                              columns         mean     variance  scale_factor
0                       space_heating     0.926040     1.421735      1.192365
1                forecast_temperature    11.858021    21.553672      4.642593
2                  forecast_feelslike     9.363488    29.400424      5.422216
3                forecast_weathertype     6.341504    23.529575      4.850729
4                  forecast_windspeed    15.114042    51.377018      7.167776
5                    forecast_uvindex     1.066923     2.560091      1.600028
6   forecast_precipitationprobability    19.762928   740.609101     27.214134
7              forecast_winddirection   199.178534  9256.882536     96.212694
8                 forecast_visibility     4.676822     0.703229      0.838587
9                             weekend     0.287032     0.204644      0.452376
10                       bank_holiday     0.025174     0.024540      0.156654
11                               hour    11.536010    47.970193 

((19114, 17), (6587, 17))

#### Model without exogenous features

In [18]:
# Autoregressive baseline: a random forest fed only the previous 24 values of
# the target series (no exogenous features are passed to fit()).
#
# random_state is pinned on the estimator itself so that re-running this cell
# gives the same forest regardless of how much of NumPy's global random state
# earlier (or out-of-order) cell executions have consumed. The value matches
# the seed used for np.random.seed at the top of the notebook.
#
# NOTE: scikit-learn documents criterion="absolute_error" as significantly
# slower to train than the default "squared_error"; expect this cell to take
# a long time on the full training series.
forecaster = ForecasterAutoreg(
    regressor=RandomForestRegressor(
        n_estimators=150,
        criterion="absolute_error",
        n_jobs=-1,
        random_state=90,
    ),
    lags=24,
)
forecaster.fit(y=sh_train_data["y"])
forecaster

KeyboardInterrupt: 

In [None]:
# Backtest on the held-out period.
#
# The series handed to backtesting_forecaster must extend beyond
# `initial_train_size`: the first `initial_train_size` observations are used
# for training and the remainder is predicted in 24-step folds. Passing only
# the training series with initial_train_size equal to its length leaves
# nothing to evaluate, so the training and test targets are concatenated here
# and the training length marks the boundary between them.
sh_full_target = pd.concat([sh_train_data['y'], sh_test_data['y']])

metric, predictions = backtesting_forecaster(
                          forecaster         = forecaster,
                          y                  = sh_full_target,
                          steps              = 24,
                          metric             = 'mean_absolute_error',
                          initial_train_size = len(sh_train_data),
                          refit              = False,  # train once, then only predict
                          n_jobs             = 'auto',
                          verbose            = False,
                          show_progress      = True
                      )

print(f"Backtest error (MAE): {metric}")