In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from custom_utils import *

import plotly
import plotly.graph_objects as go


In [2]:
np.random.seed(90)

In [3]:
data_path = ".././data/"
file_name = "data.csv"
data = pd.read_csv(data_path + file_name)
data.head()

Unnamed: 0,time,Comms and Services,Space Heating,Hot Water,Sockets,Lighting,Bld_EngCons,Car Chargers,weekend,bank holiday,...,forecastperiod,forecast_temperature,forecast_feelslike,forecast_weathertype,forecast_windspeed,forecast_uvindex,forecast_precipitationprobability,forecast_winddirection,forecast_visibility,forecast_interval
0,2019-04-01 00:00:00+00,0.515253,0.856489,0.0,0.051467,0.124797,1.548006,0.032239,False,False,...,,,,,,,,,,
1,2019-04-01 01:00:00+00,0.687381,0.786147,0.085386,0.050931,0.151708,1.761553,0.042894,False,False,...,,,,,,,,,,
2,2019-04-01 02:00:00+00,0.687678,3.530669,0.099239,0.055706,0.151233,4.524525,0.0431,False,False,...,,,,,,,,,,
3,2019-04-01 03:00:00+00,0.690139,4.044003,0.098467,0.050019,0.151436,5.034064,0.043131,False,False,...,,,,,,,,,,
4,2019-04-01 04:00:00+00,0.687081,4.223769,0.091533,0.050142,0.151331,5.203856,0.043031,False,False,...,,,,,,,,,,


In [4]:
data = first_preprocess(data)

In [5]:
data["bld_engcons"] = data["bld_engcons"] - data["comms_and_services"]

In [6]:
data = data[["bld_engcons"]]
data.head()

Unnamed: 0_level_0,bld_engcons
time,Unnamed: 1_level_1
2021-01-01 00:00:00+00,3.629499
2021-01-01 01:00:00+00,4.215506
2021-01-01 02:00:00+00,5.994066
2021-01-01 03:00:00+00,5.954252
2021-01-01 04:00:00+00,5.620296


In [7]:
data_scaler, transformed_data = create_std_scaler(data, "bld_engcons")
transformed_data.head()

       columns      mean  variance  scale_factor
0  bld_engcons  1.427945  1.819023      1.348712


Unnamed: 0_level_0,bld_engcons
time,Unnamed: 1_level_1
2021-01-01 00:00:00+00,1.632338
2021-01-01 01:00:00+00,2.066833
2021-01-01 02:00:00+00,3.385543
2021-01-01 03:00:00+00,3.356023
2021-01-01 04:00:00+00,3.108412


In [8]:
transformed_data = transformed_data.rename(columns={"bld_engcons":  "x"})
transformed_data["x-1"] = transformed_data["x"].shift(-1)
transformed_data["x-2"] = transformed_data["x"].shift(-2)
transformed_data["x-3"] = transformed_data["x"].shift(-3)
transformed_data["x-4"] = transformed_data["x"].shift(-4)
transformed_data["x-5"] = transformed_data["x"].shift(-5)
transformed_data["y"] = transformed_data["x"].shift(-6)

In [9]:
transformed_data = transformed_data.dropna()
transformed_data.isna().sum()

x      0
x-1    0
x-2    0
x-3    0
x-4    0
x-5    0
y      0
dtype: int64

In [10]:
train_data = transformed_data[transformed_data.index < "2023-04-01"]
test_data = transformed_data[transformed_data.index >= "2023-04-01"]

train_data.shape, test_data.shape

((18979, 7), (6542, 7))

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
column_name = train_data.columns.tolist()

In [None]:
rf_reg = RandomForestRegressor(n_estimators=250, criterion="absolute_error")
rf_reg_fit = rf_reg.fit(train_data[column_name[:-1]], train_data[column_name[-1]])

In [None]:
test_data["predictions"] = rf_reg_fit.predict(test_data[column_name[:-1]])
test_data["predictions"].head()

In [None]:
pred_cols = ["predictions", "y"]
plot_data = test_data[pred_cols]
plot_data[pred_cols] = data_scaler.inverse_transform(plot_data[pred_cols])

In [None]:
cal_metrics(plot_data["y"].to_numpy(), plot_data["predictions"].to_numpy())

In [None]:
plot_data = plot_data.reset_index(drop=0)

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=plot_data["time"].to_numpy(), y=plot_data["y"].to_numpy(),
                    mode='lines+markers',
                    name='actual'))
fig.add_trace(go.Scatter(x=plot_data["time"].to_numpy(), y=plot_data["predictions"].to_numpy(),
                    mode='lines+markers',
                    name='linear regression predictions'))
fig.update_layout(title_text="Composite model  with lag variables : Test data")
fig.show()