##### Library import

In [81]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import numpy as np

In [82]:
df = pd.read_csv('../data/clean/processed_data.csv')
tscv = TimeSeriesSplit(n_splits=10)

##### Linear Regression

In [83]:
df_lr = df.copy()
y = df_lr['energy_demand']
X = df_lr
X['date'] = pd.to_datetime(X['date'])
X['month'] = X['date'].dt.month
X['day'] = X['date'].dt.day
X['hour'] = X['date'].dt.hour
X = X.drop(['energy_demand', 'date', 'is_weekend_False'], axis = 1).astype(float)
metrics = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')
    
    model = sm.OLS(y_train, X_train).fit()
    y_pred = model.predict(X_test)

    metrics.append({
        'RMSE': root_mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'MAPE': mean_absolute_percentage_error(y_test, y_pred)
    })

df_lr = pd.DataFrame(metrics).mean()
print(model.summary())
df_lr

                            OLS Regression Results                            
Dep. Variable:          energy_demand   R-squared:                       0.970
Model:                            OLS   Adj. R-squared:                  0.970
Method:                 Least Squares   F-statistic:                 2.235e+04
Date:                Sat, 17 Jan 2026   Prob (F-statistic):               0.00
Time:                        18:21:55   Log-Likelihood:            -1.3390e+05
No. Observations:               15950   AIC:                         2.679e+05
Df Residuals:                   15926   BIC:                         2.680e+05
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

RMSE    3782.177626
MAE     3044.660679
MAPE       0.076719
dtype: float64

In [84]:
vif = pd.DataFrame()
vif['variable'] = X.columns
vif['VIF'] = [
    variance_inflation_factor(X.values, i)
    for i in range(X.shape[1])
]

vif.sort_values('VIF', ascending=False)

Unnamed: 0,variable,VIF
0,atmospheric_pressure_sea_level,2738.000133
2,air_temp_dry_bulb,1671.429711
4,relative_humidity,1541.586683
3,dew_point_temp,950.089209
11,lag_24h,347.802815
8,lag_1h,303.873666
12,lag_48h,208.502291
10,lag_12h,165.410418
9,lag_6h,146.670338
14,lag_192h,133.741491


##### SARIMAX

In [1]:
df_sm = df.copy()
model = SARIMAX(df, order=(p, d, q), seasonal_order=seasonal_order, exog=exogenous_data)

NameError: name 'df' is not defined