In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from datetime import datetime
from sklearn.metrics import mean_squared_error, r2_score
import shap
import seaborn as sns 
import matplotlib.pyplot as plt

# Assuming df is your DataFrame after aggregating to daily data and merging with weather/building data
df = pd.read_csv("Data.csv")# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# Get the number of rows and columns 
# rows = len(df.axes[0]) 
# cols = len(df.axes[1]) 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41666 entries, 0 to 41665
Data columns (total 34 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   time                               41664 non-null  object 
 1   Comms and Services                 40812 non-null  float64
 2   Car Chargers                       40812 non-null  float64
 3   Space Heating                      40812 non-null  float64
 4   Hot Water                          40812 non-null  float64
 5   Sockets                            40812 non-null  float64
 6   Lighting                           40812 non-null  float64
 7   total_energy                       41664 non-null  object 
 8   datepart                           41664 non-null  float64
 9   weekend                            41664 non-null  object 
 10  bank holiday                       41664 non-null  object 
 11  hour                               41664 non-null  flo


Columns (9,10) have mixed types. Specify dtype option on import or set low_memory=False.



In [64]:
# Generate a list of columns to drop
columns_to_drop = [col for col in df.columns if col.startswith('observation')]

# Drop these columns from the DataFrame
df = df.drop(columns=columns_to_drop)
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40131 entries, 112 to 41663
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   time                               40131 non-null  object 
 1   Comms and Services                 40131 non-null  float64
 2   Car Chargers                       40131 non-null  float64
 3   Space Heating                      40131 non-null  float64
 4   Hot Water                          40131 non-null  float64
 5   Sockets                            40131 non-null  float64
 6   Lighting                           40131 non-null  float64
 7   total_energy                       40131 non-null  object 
 8   datepart                           40131 non-null  float64
 9   weekend                            40131 non-null  object 
 10  bank holiday                       40131 non-null  object 
 11  hour                               40131 non-null  float6

In [65]:
#Defining Function for turning features into cyclic 
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [66]:


# df['month'] = df.datetime.dt.month
df = encode(df, 'month', 12)
df =  encode(df, 'hour', 24)
# df['day'] = df.datetime.dt.day
# df = encode(df, 'day', 31)
df = encode(df, 'day of week', 7)
df = encode(df, 'day of month', 31)
df['working_hours'] = df['hour'].apply(lambda x: 8 <= x <= 17)
df['bank holiday'] = df['bank holiday'].astype(int)
df['weekend'] = df['weekend'].astype(int)


In [67]:
columns_to_convert = ['weekend','bank holiday']

# Convert 0 and 1 to boolean values in specified columns
df[columns_to_convert] = df[columns_to_convert].astype(bool)
# Sum specified columns to create 'total_aob_energy'
df['total_aob_energy'] = df[[ 'Hot Water', 'Sockets', 'Lighting','Comms and Services', 'Space Heating']].sum(axis=1, skipna=True)

# Drop the original columns except 'Car Chargers'
df.drop(['Comms and Services', 'Space Heating', 'Hot Water', 'Sockets', 'Lighting','day of week','day of month','hour','month' ], axis=1, inplace=True)

# Now, 'df' contains the new 'total_aob_energy' column and has the specified columns dropped, except 'Car Chargers'

In [68]:
df['time'] = pd.to_datetime(df['time'])

In [69]:
df = df.dropna()
df = df.select_dtypes(exclude=['object'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40131 entries, 112 to 41663
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype              
---  ------                             --------------  -----              
 0   time                               40131 non-null  datetime64[ns, UTC]
 1   Car Chargers                       40131 non-null  float64            
 2   datepart                           40131 non-null  float64            
 3   weekend                            40131 non-null  bool               
 4   bank holiday                       40131 non-null  bool               
 5   year                               40131 non-null  float64            
 6   forecast_temperature               40131 non-null  float64            
 7   forecast_feelslike                 40131 non-null  float64            
 8   forecast_weathertype               40131 non-null  float64            
 9   forecast_windspeed                 40131 non-null  fl

In [70]:
df.columns = [col.capitalize() for col in df.columns]
print(df.columns)
df.info()

Index(['Time', 'Car chargers', 'Datepart', 'Weekend', 'Bank holiday', 'Year',
       'Forecast_temperature', 'Forecast_feelslike', 'Forecast_weathertype',
       'Forecast_windspeed', 'Forecast_uvindex',
       'Forecast_precipitationprobability', 'Month_sin', 'Month_cos',
       'Hour_sin', 'Hour_cos', 'Day of week_sin', 'Day of week_cos',
       'Day of month_sin', 'Day of month_cos', 'Working_hours',
       'Total_aob_energy'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 40131 entries, 112 to 41663
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype              
---  ------                             --------------  -----              
 0   Time                               40131 non-null  datetime64[ns, UTC]
 1   Car chargers                       40131 non-null  float64            
 2   Datepart                           40131 non-null  float64            
 3   Weekend                            40131 non-null  b

In [71]:
df = df.drop(['Car chargers','Datepart'], axis = 1)

In [72]:
new_columns = []
for col in df.columns:
    # Remove "Forecast_" if it exists and capitalize the first letter of the remaining string
    if col.startswith('Forecast_'):
        new_name = col.replace('Forecast_', '')  # Remove 'Forecast_'
        new_name = new_name.capitalize()  # Capitalize the first letter
    else:
        new_name = col  # Keep the original name if it doesn't start with 'Forecast_'
    new_columns.append(new_name)

# Assign the modified column names back to the DataFrame
df.columns = new_columns


In [73]:
# Function to calculate the replacement value for zeros
# def replace_zero_with_rolling_average(series, window_size=24):
#     # Calculate rolling average, including the current row
#     rolling_avg = series.replace(0, np.nan).rolling(window=window_size*2 + 1, center=True, min_periods=1).mean()

#     # Replace zeros with the calculated rolling averages
#     return series.mask(series == 0, rolling_avg)

# # Apply the function to the column
# df['Total_aob_energy'] = replace_zero_with_rolling_average(df['Total_aob_energy'])



In [74]:
zero_rows = df[df['Total_aob_energy'] == 0]

# Display those rows
print(zero_rows)

                          Time  Weekend  Bank holiday    Year  Temperature  \
5995 2019-12-06 19:00:00+00:00    False         False  2019.0         10.0   
9360 2020-04-25 00:00:00+00:00     True         False  2020.0         11.0   

      Feelslike  Weathertype  Windspeed  Uvindex  Precipitationprobability  \
5995        7.0          8.0       22.0      0.0                      16.0   
9360       11.0          0.0        4.0      0.0                       0.0   

         Month_sin  Month_cos  Hour_sin  Hour_cos  Day of week_sin  \
5995 -2.449294e-16        1.0 -0.965926  0.258819        -0.974928   
9360  8.660254e-01       -0.5  0.000000  1.000000        -0.781831   

      Day of week_cos  Day of month_sin  Day of month_cos  Working_hours  \
5995        -0.222521          0.937752          0.347305          False   
9360         0.623490         -0.937752          0.347305          False   

      Total_aob_energy  
5995               0.0  
9360               0.0  


In [75]:
# Set 'date' column as the DataFrame index
df.set_index('Time', inplace=True)

In [76]:
df = df[df.index >= "2021-01-01"]

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 25470 entries, 2021-01-01 00:00:00+00:00 to 2023-12-31 23:00:00+00:00
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Weekend                   25470 non-null  bool   
 1   Bank holiday              25470 non-null  bool   
 2   Year                      25470 non-null  float64
 3   Temperature               25470 non-null  float64
 4   Feelslike                 25470 non-null  float64
 5   Weathertype               25470 non-null  float64
 6   Windspeed                 25470 non-null  float64
 7   Uvindex                   25470 non-null  float64
 8   Precipitationprobability  25470 non-null  float64
 9   Month_sin                 25470 non-null  float64
 10  Month_cos                 25470 non-null  float64
 11  Hour_sin                  25470 non-null  float64
 12  Hour_cos                  25470 non-null  float64
 13  Day of week_si

In [78]:
# # Split data into features and target
# X = df.drop(['Total_aob_energy','Datepart','Car chargers'], axis=1)  # Assuming 'energy_consumption' is the target variable
# y = df['Total_aob_energy']
# # X = df[['hour_sin','hour_cos','Car Chargers','forecast_temperature','forecast_feelslike','forecast_weathertype','forecast_precipitationprobability','forecast_windspeed', 'forecast_uvindex','observation_temperature','observation_windspeed','observation_pressure','hour' , 'day of week','day of month' ,'month_sin','month_cos',]] 
# # X = df[['forecast_temperature','forecast_feelslike','forecast_weathertype','forecast_precipitationprobability','forecast_windspeed', 'forecast_uvindex','observation_temperature','observation_windspeed','observation_pressure','hour' , 'day of week','day of month' ,'month_sin','month_cos',]] 
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
# # Model Training
# # Linear Regression
# lr_model = LinearRegression()
# lr_model.fit(X_train, y_train)

# # Random Forest
# rf_model = RandomForestRegressor(n_estimators=100)
# rf_model.fit(X_train, y_train)

# #gradient-boosted regression model
# gbr_model = GradientBoostingRegressor()
# gbr_model.fit(X_train, y_train)

# #Ridge Model 
# ridge_model = Ridge()
# ridge_model.fit(X_train, y_train)

# #Lasso Model
# lasso_model = Lasso()
# lasso_model.fit(X_train, y_train)

# #Support Vector Regression 
# svr_model = SVR()
# svr_model.fit(X_train, y_train)

# # Model Evaluation
# y_pred_lr = lr_model.predict(X_test)
# y_pred_rf = rf_model.predict(X_test)
# y_pred_gbr = gbr_model.predict(X_test)
# y_pred_ridge = ridge_model.predict(X_test)
# y_pred_lasso = lasso_model.predict(X_test)
# y_pred_svr = svr_model.predict(X_test)


# #https://stats.stackexchange.com/questions/255276/normalized-root-mean-square-error-nrmse-with-zero-mean-of-observed-value
# #https://stackoverflow.com/questions/17197492/is-there-a-library-function-for-root-mean-square-error-rmse-in-python
# # Calculate range of the target variable
# y_range = np.max(y_test) - np.min(y_test)
# # Calculate the mean of the target variable
# y_mean = np.mean(y_test)
# # Calculate RMSE for Linear Regression
# lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
# lr_rmse = mean_squared_error(y_test, y_pred_lr, squared=False)

# # Calculate Normalised RMSE for Linear Regression
# #https://www.marinedatascience.co/blog/2019/01/07/normalizing-the-rmse/
# lr_nrmse = lr_rmse / y_range
# # Calculate rRMSE (relative RMSE) for Linear Regression, normalized by the mean of y_test
# # https://search.r-project.org/CRAN/refmans/ehaGoF/html/gofRRMSE.html
# lr_rrmse = lr_rmse / y_mean
# # Calculate MAE
# lr_mae = np.mean(np.abs(y_test - y_pred_lr))
# print("Mean Absolute Error (MAE) for Linear Regression:", lr_mae)
# # Calculate MAPE
# # Note: We add a small number to the denominator to avoid division by zero in case y_test contains zeros.
# lr_mape = np.mean(np.abs((y_test - y_pred_lr) / (y_test + 1e-10))) * 100
# print("Mean Absolute Percentage Error (MAPE) for Linear Regression:", lr_mape, "%")

# print("Linear Regression RMSE:", lr_rmse)
# print("Linear Regression NRMSE:", lr_nrmse)
# print("Linear Regression rRMSE:", lr_rrmse)



# # Calculate normalised RMSE for Random Forest
# rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
# # Calculate Normalised RMSE for Random Forest
# rf_nrmse = rf_rmse / y_range
# # Calculate rRMSE (relative RMSE) for Random Forest, normalised by the mean of y_test
# rf_rrmse = rf_rmse / y_mean
# # Calculate MAE
# rf_mae = np.mean(np.abs(y_test - y_pred_rf))
# print("Mean Absolute Error (MAE) for Random Forest:", rf_mae)
# # Calculate MAPE
# # Note: We add a small number to the denominator to avoid division by zero in case y_test contains zeros.
# rf_mape = np.mean(np.abs((y_test - y_pred_rf) / (y_test + 1e-10))) * 100
# print("Mean Absolute Percentage Error (MAPE) for Random Forest:", rf_mape, "%")
# print("Random Forest RMSE:", rf_rmse)
# print("Random Forest NRMSE:", rf_nrmse)
# print("Random Forest rRMSE:", rf_rrmse)



# # Calculate nomrlaised RMSE for Gradient-boosted Regression
# gbr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_gbr))
# # Calculate Normalised RMSE for Gradient-boosted regression
# gbr_nrmse = gbr_rmse / y_range
# # Calculate rRMSE (relative RMSE) for Gradient boosting, normalised by the mean of y_test
# gbr_rrmse = gbr_rmse / y_mean
# # Calculate MAE
# gbr_mae = np.mean(np.abs(y_test - y_pred_gbr))
# print("Mean Absolute Error (MAE) for Gradient Boosting Regression:", gbr_mae)

# # Calculate MAPE
# # Note: We add a small number to the denominator to avoid division by zero in case y_test contains zeros.
# gbr_mape = np.mean(np.abs((y_test - y_pred_gbr) / (y_test + 1e-10))) * 100
# print("Mean Absolute Percentage Error (MAPE) for Gradient Boosting Regression:", gbr_mape, "%")
# print("Gradient Boosting Regression RMSE:", gbr_rmse)
# print("Gradient Boosting Regression NRMSE:", gbr_nrmse)
# print("Gradient Boosting Regression rRMSE:", gbr_rrmse)

# # Calculate nomrlaised RMSE for Ridge
# ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
# # Calculate Normalised RMSE for Ridge regression
# ridge_nrmse = ridge_rmse / y_range
# # Calculate rRMSE (relative RMSE) for Ridge, normalised by the mean of y_test
# ridge_rrmse = ridge_rmse / y_mean
# # Calculate MAE
# ridge_mae = np.mean(np.abs(y_test - y_pred_ridge))
# print("Mean Absolute Error (MAE) for Ridge:", ridge_mae)

# # Calculate MAPE
# # Note: We add a small number to the denominator to avoid division by zero in case y_test contains zeros.
# ridge_mape = np.mean(np.abs((y_test - y_pred_ridge) / (y_test + 1e-10))) * 100
# print("Mean Absolute Percentage Error (MAPE) for Ridge:", ridge_mape, "%")
# print("Ridge RMSE:", ridge_rmse)
# print("Ridge NRMSE:", ridge_nrmse)
# print("Ridge rRMSE:", ridge_rrmse)



# # Calculate nomrlaised RMSE for Lasso
# lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
# # Calculate Normalised RMSE for Lasso regression
# lasso_nrmse = lasso_rmse / y_range
# # Calculate rRMSE (relative RMSE) for Lasso, normalised by the mean of y_test
# lasso_rrmse = lasso_rmse / y_mean
# # Calculate MAE
# lasso_mae = np.mean(np.abs(y_test - y_pred_lasso))
# print("Mean Absolute Error (MAE) for Lasso:", lasso_mae)

# # Calculate MAPE
# # Note: We add a small number to the denominator to avoid division by zero in case y_test contains zeros.
# lasso_mape = np.mean(np.abs((y_test - y_pred_lasso) / (y_test + 1e-10))) * 100
# print("Mean Absolute Percentage Error (MAPE) for Lasso:", lasso_mape, "%")
# print("Lasso RMSE:", lasso_rmse)
# print("Lasso NRMSE:", lasso_nrmse)
# print("Lasso rRMSE:", lasso_rrmse)

# # Calculate nomrlaised RMSE for support vector regression
# svr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_svr))
# # Calculate Normalised RMSE for support vector regression regression
# svr_nrmse = svr_rmse / y_range
# # Calculate rRMSE (relative RMSE) for support vector regression, normalised by the mean of y_test
# svr_rrmse = svr_rmse / y_mean
# # Calculate MAE
# svr_mae = np.mean(np.abs(y_test - y_pred_svr))
# print("Mean Absolute Error (MAE) for SVR:", svr_mae)

# # Calculate MAPE
# # Note: We add a small number to the denominator to avoid division by zero in case y_test contains zeros.
# svr_mape = np.mean(np.abs((y_test - y_pred_svr) / (y_test + 1e-10))) * 100
# print("Mean Absolute Percentage Error (MAPE) for SVR:", svr_mape, "%")
# print("SVR RMSE:", svr_rmse)
# print("SVR NRMSE:", svr_nrmse)
# print("SVR rRMSE:", svr_rrmse)



In [79]:
# from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# # Split data into features and target
# X = df.drop(['Total_aob_energy','Datepart','Car chargers'], axis=1)
# y = df['Total_aob_energy']

# # Split the data into training and testing sets using time series split
# tscv = TimeSeriesSplit(n_splits=5)
# train_indices, test_indices = list(tscv.split(X))[-1]
# X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
# y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# # Model Training
# # Linear Regression
# lr_model = LinearRegression()
# lr_model.fit(X_train, y_train)

# # Random Forest
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# # Gradient-boosted regression model
# gbr_model = GradientBoostingRegressor(random_state=42)
# gbr_model.fit(X_train, y_train)

# # Ridge Model
# ridge_model = Ridge()
# ridge_model.fit(X_train, y_train)

# # Lasso Model
# lasso_model = Lasso()
# lasso_model.fit(X_train, y_train)

# # Support Vector Regression
# svr_model = SVR()
# svr_model.fit(X_train, y_train)

# # Model Evaluation
# models = {
#     "Linear Regression": lr_model,
#     "Random Forest": rf_model,
#     "Gradient Boosting": gbr_model,
#     "Ridge": ridge_model,
#     "Lasso": lasso_model,
#     "SVR": svr_model
# }

# for model_name, model in models.items():
#     # Predictions
#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)

#     # Training Errors
#     train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
#     train_mae = mean_absolute_error(y_train, y_train_pred)
#     train_mape = np.mean(np.abs((y_train - y_train_pred) / (y_train + 1e-10))) * 100

#     # Testing Errors
#     test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
#     test_mae = mean_absolute_error(y_test, y_test_pred)
#     test_mape = np.mean(np.abs((y_test - y_test_pred) / (y_test + 1e-10))) * 100

#     # Normalized RMSE
#     y_range = np.max(y_test) - np.min(y_test)
#     y_mean = np.mean(y_test)
#     test_nrmse = test_rmse / y_range
#     test_rrmse = test_rmse / y_mean

#     print(f"{model_name}:")
#     print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
#     print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
#     print(f"  Train MAPE: {train_mape:.2f}%, Test MAPE: {test_mape:.2f}%")
#     print(f"  Test NRMSE: {test_nrmse:.4f}, Test rRMSE: {test_rrmse:.4f}")


In [80]:
# # Hyperparameter Tuning for Random Forest
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=tscv, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_search_rf.fit(X_train, y_train)

# print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# # Evaluate the best model
# best_rf_model = grid_search_rf.best_estimator_
# y_train_pred_rf_best = best_rf_model.predict(X_train)
# y_test_pred_rf_best = best_rf_model.predict(X_test)

# # Best Random Forest Training Errors
# best_rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_rf_best))
# best_rf_train_mae = mean_absolute_error(y_train, y_train_pred_rf_best)
# best_rf_train_mape = np.mean(np.abs((y_train - y_train_pred_rf_best) / (y_train + 1e-10))) * 100

# # Best Random Forest Testing Errors
# best_rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_rf_best))
# best_rf_test_mae = mean_absolute_error(y_test, y_test_pred_rf_best)
# best_rf_test_mape = np.mean(np.abs((y_test - y_test_pred_rf_best) / (y_test + 1e-10))) * 100

# print("Best Random Forest Model:")
# print(f"  Train RMSE: {best_rf_train_rmse:.4f}, Test RMSE: {best_rf_test_rmse:.4f}")
# print(f"  Train MAE: {best_rf_train_mae:.4f}, Test MAE: {best_rf_test_mae:.4f}")
# print(f"  Train MAPE: {best_rf_train_mape:.2f}%, Test MAPE: {best_rf_test_mape:.2f}%")

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 25470 entries, 2021-01-01 00:00:00+00:00 to 2023-12-31 23:00:00+00:00
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Weekend                   25470 non-null  bool   
 1   Bank holiday              25470 non-null  bool   
 2   Year                      25470 non-null  float64
 3   Temperature               25470 non-null  float64
 4   Feelslike                 25470 non-null  float64
 5   Weathertype               25470 non-null  float64
 6   Windspeed                 25470 non-null  float64
 7   Uvindex                   25470 non-null  float64
 8   Precipitationprobability  25470 non-null  float64
 9   Month_sin                 25470 non-null  float64
 10  Month_cos                 25470 non-null  float64
 11  Hour_sin                  25470 non-null  float64
 12  Hour_cos                  25470 non-null  float64
 13  Day of week_si

In [82]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # Sample data loading (replace with actual data loading)
# # df = pd.read_csv('electricity_consumption.csv', parse_dates=['Date'], index_col='Date')
# # Assuming df has a DateTime index and a column named 'Total_aob_energy' for electricity consumption

# # Split data into features and target
# X = df.drop(['Total_aob_energy'], axis=1)
# y = df['Total_aob_energy']

# # Split the data into training and testing sets using time series split
# tscv = TimeSeriesSplit(n_splits=5)
# train_indices, test_indices = list(tscv.split(X))[-1]
# X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
# y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# # Model Training
# # Linear Regression
# lr_model = LinearRegression()
# lr_model.fit(X_train, y_train)

# # Random Forest
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# # Gradient-boosted regression model
# gbr_model = GradientBoostingRegressor(random_state=42)
# gbr_model.fit(X_train, y_train)

# # Ridge Model
# ridge_model = Ridge()
# ridge_model.fit(X_train, y_train)

# # Lasso Model
# lasso_model = Lasso()
# lasso_model.fit(X_train, y_train)

# # Support Vector Regression
# svr_model = SVR()
# svr_model.fit(X_train, y_train)

# # Model Evaluation
# models = {
#     "Linear Regression": lr_model,
#     "Random Forest": rf_model,
#     "Gradient Boosting": gbr_model,
#     "Ridge": ridge_model,
#     "Lasso": lasso_model,
#     "SVR": svr_model
# }

# def smape(y_true, y_pred):
#     return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

# for model_name, model in models.items():
#     # Predictions
#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)

#     # Training Errors
#     train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
#     train_mae = mean_absolute_error(y_train, y_train_pred)
#     train_mape = np.mean(np.abs((y_train - y_train_pred) / (y_train + 1e-10))) * 100
#     train_r2 = r2_score(y_train, y_train_pred)
#     train_smape = smape(y_train, y_train_pred)

#     # Testing Errors
#     test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
#     test_mae = mean_absolute_error(y_test, y_test_pred)
#     test_mape = np.mean(np.abs((y_test - y_test_pred) / (y_test + 1e-10))) * 100
#     test_r2 = r2_score(y_test, y_test_pred)
#     test_smape = smape(y_test, y_test_pred)

#     # Normalized RMSE
#     y_range = np.max(y_test) - np.min(y_test)
#     y_mean = np.mean(y_test)
#     test_nrmse = test_rmse / y_range
#     test_rrmse = test_rmse / y_mean

#     print(f"{model_name}:")
#     print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
#     print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
#     print(f"  Train MAPE: {train_mape:.2f}%, Test MAPE: {test_mape:.2f}%")
#     print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
#     print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
#     print(f"  Test NRMSE: {test_nrmse:.4f}, Test rRMSE: {test_rrmse:.4f}")

# # Hyperparameter Tuning for Random Forest
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=tscv, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_search_rf.fit(X_train, y_train)

# print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# # Evaluate the best model
# best_rf_model = grid_search_rf.best_estimator_
# y_train_pred_rf_best = best_rf_model.predict(X_train)
# y_test_pred_rf_best = best_rf_model.predict(X_test)

# # Best Random Forest Training Errors
# best_rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_rf_best))
# best_rf_train_mae = mean_absolute_error(y_train, y_train_pred_rf_best)
# best_rf_train_mape = np.mean(np.abs((y_train - y_train_pred_rf_best) / (y_train + 1e-10))) * 100
# best_rf_train_r2 = r2_score(y_train, y_train_pred_rf_best)
# best_rf_train_smape = smape(y_train, y_train_pred_rf_best)

# # Best Random Forest Testing Errors
# best_rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_rf_best))
# best_rf_test_mae = mean_absolute_error(y_test, y_test_pred_rf_best)
# best_rf_test_mape = np.mean(np.abs((y_test - y_test_pred_rf_best) / (y_test + 1e-10))) * 100
# best_rf_test_r2 = r2_score(y_test, y_test_pred_rf_best)
# best_rf_test_smape = smape(y_test, y_test_pred_rf_best)

# print("Best Random Forest Model:")
# print(f"  Train RMSE: {best_rf_train_rmse:.4f}, Test RMSE: {best_rf_test_rmse:.4f}")
# print(f"  Train MAE: {best_rf_train_mae:.4f}, Test MAE: {best_rf_test_mae:.4f}")
# print(f"  Train MAPE: {best_rf_train_mape:.2f}%, Test MAPE: {best_rf_test_mape:.2f}%")
# print(f"  Train R^2: {best_rf_train_r2:.4f}, Test R^2: {best_rf_test_r2:.4f}")
# print(f"  Train SMAPE: {best_rf_train_smape:.2f}%, Test SMAPE: {best_rf_test_smape:.2f}%")


In [83]:
# import pandas as pd
# import numpy as np
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.svm import SVR
# from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# # Sample data creation (replace with actual data)
# np.random.seed(42)
# date_range = pd.date_range(start='1/1/2021', periods=500, freq='H')
# df = pd.DataFrame(data={'Total_aob_energy': np.random.rand(500)}, index=date_range)

# # Feature Engineering: Create lag features
# for lag in range(1, 25):
#     df[f'lag_{lag}'] = df['Total_aob_energy'].shift(lag)

# # Feature Engineering: Create rolling window features
# df['rolling_mean_24'] = df['Total_aob_energy'].rolling(window=24).mean()
# df['rolling_std_24'] = df['Total_aob_energy'].rolling(window=24).std()

# # Drop rows with NaN values created by lag and rolling features
# df.dropna(inplace=True)

# # Split data into features and target
# X = df.drop(['Total_aob_energy'], axis=1)
# y = df['Total_aob_energy']

# # Split the data into training and testing sets using time series split
# tscv = TimeSeriesSplit(n_splits=5)
# train_indices, test_indices = list(tscv.split(X))[-1]
# X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
# y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# # Normalize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Apply PCA for dimensionality reduction
# pca = PCA(n_components=0.95)  # Retain 95% of the variance
# X_train_pca = pca.fit_transform(X_train_scaled)
# X_test_pca = pca.transform(X_test_scaled)

# # Model Training
# models = {
#     "Linear Regression": LinearRegression(),
#     "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
#     "Gradient Boosting": GradientBoostingRegressor(random_state=42),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "SVR": SVR()
# }

# for model_name, model in models.items():
#     model.fit(X_train_pca, y_train)
    
#     # Predictions
#     y_train_pred = model.predict(X_train_pca)
#     y_test_pred = model.predict(X_test_pca)

#     # Training Errors
#     train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
#     train_mae = mean_absolute_error(y_train, y_train_pred)
#     train_mape = np.mean(np.abs((y_train - y_train_pred) / (y_train + 1e-10))) * 100
#     train_r2 = r2_score(y_train, y_train_pred)
#     train_smape = np.mean(2 * np.abs(y_train_pred - y_train) / (np.abs(y_train_pred) + np.abs(y_train) + 1e-10)) * 100

#     # Testing Errors
#     test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
#     test_mae = mean_absolute_error(y_test, y_test_pred)
#     test_mape = np.mean(np.abs((y_test - y_test_pred) / (y_test + 1e-10))) * 100
#     test_r2 = r2_score(y_test, y_test_pred)
#     test_smape = np.mean(2 * np.abs(y_test_pred - y_test) / (np.abs(y_test_pred) + np.abs(y_test) + 1e-10)) * 100

#     # Normalized RMSE
#     y_range = np.max(y_test) - np.min(y_test)
#     y_mean = np.mean(y_test)
#     test_nrmse = test_rmse / y_range
#     test_rrmse = test_rmse / y_mean

#     print(f"{model_name}:")
#     print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
#     print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
#     print(f"  Train MAPE: {train_mape:.2f}%, Test MAPE: {test_mape:.2f}%")
#     print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
#     print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
#     print(f"  Test NRMSE: {test_nrmse:.4f}, Test rRMSE: {test_rrmse:.4f}")

# # Hyperparameter Tuning for Random Forest
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search_rf = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid_rf, cv=tscv, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_search_rf.fit(X_train_pca, y_train)

# print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# # Evaluate the best model
# best_rf_model = grid_search_rf.best_estimator_
# y_train_pred_rf_best = best_rf_model.predict(X_train_pca)
# y_test_pred_rf_best = best_rf_model.predict(X_test_pca)

# # Best Random Forest Training Errors
# best_rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_rf_best))
# best_rf_train_mae = mean_absolute_error(y_train, y_train_pred_rf_best)
# best_rf_train_mape = np.mean(np.abs((y_train - y_train_pred_rf_best) / (y_train + 1e-10))) * 100
# best_rf_train_r2 = r2_score(y_train, y_train_pred_rf_best)
# best_rf_train_smape = np.mean(2 * np.abs(y_train_pred_rf_best - y_train) / (np.abs(y_train_pred_rf_best) + np.abs(y_train) + 1e-10)) * 100

# # Best Random Forest Testing Errors
# best_rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_rf_best))
# best_rf_test_mae = mean_absolute_error(y_test, y_test_pred_rf_best)
# best_rf_test_mape = np.mean(np.abs((y_test - y_test_pred_rf_best) / (y_test + 1e-10))) * 100
# best_rf_test_r2 = r2_score(y_test, y_test_pred_rf_best)
# best_rf_test_smape = np.mean(2 * np.abs(y_test_pred_rf_best - y_test) / (np.abs(y_test_pred_rf_best) + np.abs(y_test) + 1e-10)) * 100

# print("Best Random Forest Model:")
# print(f"  Train RMSE: {best_rf_train_rmse:.4f}, Test RMSE: {best_rf_test_rmse:.4f}")
# print(f"  Train MAE: {best_rf_train_mae:.4f}, Test MAE: {best_rf_test_mae:.4f}")
# print(f"  Train MAPE: {best_rf_train_mape:.2f}%, Test MAPE: {best_rf_test_mape:.2f}%")
# print(f"  Train R^2")

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 25470 entries, 2021-01-01 00:00:00+00:00 to 2023-12-31 23:00:00+00:00
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Weekend                   25470 non-null  bool   
 1   Bank holiday              25470 non-null  bool   
 2   Year                      25470 non-null  float64
 3   Temperature               25470 non-null  float64
 4   Feelslike                 25470 non-null  float64
 5   Weathertype               25470 non-null  float64
 6   Windspeed                 25470 non-null  float64
 7   Uvindex                   25470 non-null  float64
 8   Precipitationprobability  25470 non-null  float64
 9   Month_sin                 25470 non-null  float64
 10  Month_cos                 25470 non-null  float64
 11  Hour_sin                  25470 non-null  float64
 12  Hour_cos                  25470 non-null  float64
 13  Day of week_si

In [85]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from boruta import BorutaPy
# from sklearn.preprocessing import StandardScaler
# import shap


# # Data Preprocessing: Remove rare/uncommon values (assuming they are below a threshold or beyond a threshold)
# df = df[(df['Total_aob_energy'] > df['Total_aob_energy'].quantile(0.01)) & 
#         (df['Total_aob_energy'] < df['Total_aob_energy'].quantile(0.99))]

# # # Feature Engineering: Create lag features
# # for lag in range(1, 25):
# #     df[f'lag_{lag}'] = df['Total_aob_energy'].shift(lag)

# # # Feature Engineering: Create rolling window features
# # df['rolling_mean_24'] = df['Total_aob_energy'].rolling(window=24).mean()
# # df['rolling_std_24'] = df['Total_aob_energy'].rolling(window=24).std()

# # # Drop rows with NaN values created by lag and rolling features
# # df.dropna(inplace=True)

# # Split data into features and target
# X = df.drop(['Total_aob_energy'], axis=1)
# y = df['Total_aob_energy']

# # Split the data into training and testing sets using time series split
# tscv = TimeSeriesSplit(n_splits=5)
# train_indices, test_indices = list(tscv.split(X))[-1]
# X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
# y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# # Feature Selection using Boruta
# np.int = np.int32
# np.float = np.float64
# np.bool = np.bool_
# rf = RandomForestRegressor(n_jobs=-1, random_state=42)
# boruta_selector = BorutaPy(rf, n_estimators=1000, random_state=1)

# # Ensure that the BorutaPy library is using int instead of np.int
# y_train_boruta = y_train.values.astype(int)
# boruta_selector.fit(X_train.values, y_train_boruta)
# X_train_selected = boruta_selector.transform(X_train.values)
# X_test_selected = boruta_selector.transform(X_test.values)

# # Standardize features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_selected)
# X_test_scaled = scaler.transform(X_test_selected)

# # Model Training
# models = {
#     "Linear Regression": LinearRegression(),
#     "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
#     "Gradient Boosting": GradientBoostingRegressor(random_state=42),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "SVR": SVR()
# }

# def smape(y_true, y_pred):
#     return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

# for model_name, model in models.items():
#     model.fit(X_train_scaled, y_train)
    
#     # Predictions
#     y_train_pred = model.predict(X_train_scaled)
#     y_test_pred = model.predict(X_test_scaled)

#     # Training Errors
#     train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
#     train_mae = mean_absolute_error(y_train, y_train_pred)
#     train_smape = smape(y_train, y_train_pred)
#     train_r2 = r2_score(y_train, y_train_pred)

#     # Testing Errors
#     test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
#     test_mae = mean_absolute_error(y_test, y_test_pred)
#     test_smape = smape(y_test, y_test_pred)
#     test_r2 = r2_score(y_test, y_test_pred)

#     # Normalized RMSE
#     y_mean = np.mean(y_test)
#     test_nrmse = test_rmse / y_mean

#     print(f"{model_name}:")
#     print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
#     print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
#     print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
#     print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
#     print(f"  Test NRMSE: {test_nrmse:.4f}")

# # Hyperparameter Tuning for Random Forest
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search_rf = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid_rf, cv=tscv, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_search_rf.fit(X_train_scaled, y_train)

# print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# # Evaluate the best model
# best_rf_model = grid_search_rf.best_estimator_
# y_train_pred_rf_best = best_rf_model.predict(X_train_scaled)
# y_test_pred_rf_best = best_rf_model.predict(X_test_scaled)

# # Best Random Forest Training Errors
# best_rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_rf_best))
# best_rf_train_mae = mean_absolute_error(y_train, y_train_pred_rf_best)
# best_rf_train_smape = smape(y_train, y_train_pred_rf_best)
# best_rf_train_r2 = r2_score(y_train, y_train_pred_rf_best)

# # Best Random Forest Testing Errors
# best_rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_rf_best))
# best_rf_test_mae = mean_absolute_error(y_test, y_test_pred_rf_best)
# best_rf_test_smape = smape(y_test, y_test_pred_rf_best)
# best_rf_test_r2 = r2_score(y_test, y_test_pred_rf_best)
# best_rf_test_nrmse = best_rf_test_rmse / y_mean

# print("Best Random Forest Model:")
# print(f"  Train RMSE: {best_rf_train_rmse:.4f}, Test RMSE: {best_rf_test_rmse:.4f}")
# print(f"  Train MAE: {best_rf_train_mae:.4f}, Test MAE: {best_rf_test_mae:.4f}")
# print(f"  Train SMAPE: {best_rf_train_smape:.2f}%, Test SMAPE: {best_rf_test_smape:.2f}%")
# print(f"  Train R^2: {best_rf_train_r2:.4f}, Test R^2: {best_rf_test_r2:.4f}")
# print(f"  Test NRMSE: {best_rf_test_nrmse:.4f}")

# # Feature Importance using SHAP
# explainer = shap.TreeExplainer(best_rf_model)
# shap_values = explainer.shap_values(X_train_scaled)

# shap.summary_plot(shap_values, X_train_scaled, feature_names=X.columns[boruta_selector.support_])


In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from boruta import BorutaPy
from sklearn.preprocessing import StandardScaler
import shap


# #Data Preprocessing: Remove rare/uncommon values (assuming they are below a threshold or beyond a threshold)
# df = df[(df['Total_aob_energy'] > df['Total_aob_energy'].quantile(0.01)) & 
#         (df['Total_aob_energy'] < df['Total_aob_energy'].quantile(0.99))]


# # # Split data into features and target
# # X = df.drop(['Total_aob_energy'], axis=1)
# # y = df['Total_aob_energy']

# # Split the data into training and testing sets using time series split
# # tscv = TimeSeriesSplit(n_splits=5)
# # train_indices, test_indices = list(tscv.split(X))[-1]
# # X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
# # y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
# # Define training and testing sets
# train_data = df[df.index < '2023-12-01']
# test_data = df[df.index >= '2023-12-01']

# X_train = train_data.drop(['Total_aob_energy'], axis=1)
# y_train = train_data['Total_aob_energy']
# X_test = test_data.drop(['Total_aob_energy'], axis=1)
# y_test = test_data['Total_aob_energy']
# # Feature Selection using Boruta
# np.int = np.int32
# np.float = np.float64
# np.bool = np.bool_
# rf = RandomForestRegressor(n_jobs=-1, random_state=42)
# boruta_selector = BorutaPy(rf, n_estimators=1000, random_state=1)

# # Ensure that the BorutaPy library is using int instead of np.int
# y_train_boruta = y_train.values.astype(int)
# boruta_selector.fit(X_train.values, y_train_boruta)
# X_train_selected = boruta_selector.transform(X_train.values)
# X_test_selected = boruta_selector.transform(X_test.values)

# # Standardize features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_selected)
# X_test_scaled = scaler.transform(X_test_selected)

# # Model Training
# models = {
#     "Linear Regression": LinearRegression(),
#     "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
#     "Gradient Boosting": GradientBoostingRegressor(random_state=42),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "SVR": SVR()
# }

# def smape(y_true, y_pred):
#     return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

# for model_name, model in models.items():
#     model.fit(X_train_scaled, y_train)
    
#     # Predictions
#     y_train_pred = model.predict(X_train_scaled)
#     y_test_pred = model.predict(X_test_scaled)

#     # Training Errors
#     train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
#     train_mae = mean_absolute_error(y_train, y_train_pred)
#     train_smape = smape(y_train, y_train_pred)
#     train_r2 = r2_score(y_train, y_train_pred)

#     # Testing Errors
#     test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
#     test_mae = mean_absolute_error(y_test, y_test_pred)
#     test_smape = smape(y_test, y_test_pred)
#     test_r2 = r2_score(y_test, y_test_pred)

#     # Normalized RMSE
#     y_mean = np.mean(y_test)
#     test_nrmse = test_rmse / y_mean

#     print(f"{model_name}:")
#     print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
#     print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
#     print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
#     print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
#     print(f"  Test NRMSE: {test_nrmse:.4f}")

# # Hyperparameter Tuning for Random Forest
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search_rf = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid_rf, cv=tscv, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_search_rf.fit(X_train_scaled, y_train)

# print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# # Evaluate the best model
# best_rf_model = grid_search_rf.best_estimator_
# y_train_pred_rf_best = best_rf_model.predict(X_train_scaled)
# y_test_pred_rf_best = best_rf_model.predict(X_test_scaled)

# # Best Random Forest Training Errors
# best_rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_rf_best))
# best_rf_train_mae = mean_absolute_error(y_train, y_train_pred_rf_best)
# best_rf_train_smape = smape(y_train, y_train_pred_rf_best)
# best_rf_train_r2 = r2_score(y_train, y_train_pred_rf_best)

# # Best Random Forest Testing Errors
# best_rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_rf_best))
# best_rf_test_mae = mean_absolute_error(y_test, y_test_pred_rf_best)
# best_rf_test_smape = smape(y_test, y_test_pred_rf_best)
# best_rf_test_r2 = r2_score(y_test, y_test_pred_rf_best)
# best_rf_test_nrmse = best_rf_test_rmse / y_mean

# print("Best Random Forest Model:")
# print(f"  Train RMSE: {best_rf_train_rmse:.4f}, Test RMSE: {best_rf_test_rmse:.4f}")
# print(f"  Train MAE: {best_rf_train_mae:.4f}, Test MAE: {best_rf_test_mae:.4f}")
# print(f"  Train SMAPE: {best_rf_train_smape:.2f}%, Test SMAPE: {best_rf_test_smape:.2f}%")
# print(f"  Train R^2: {best_rf_train_r2:.4f}, Test R^2: {best_rf_test_r2:.4f}")
# print(f"  Test NRMSE: {best_rf_test_nrmse:.4f}")

# # Feature Importance using SHAP
# explainer = shap.TreeExplainer(best_rf_model)
# shap_values = explainer.shap_values(X_train_scaled)

# shap.summary_plot(shap_values, X_train_scaled, feature_names=X.columns[boruta_selector.support_])


In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import shap
import mifs
from boruta import BorutaPy

# Ensure that the index is a datetime object
df.index = pd.to_datetime(df.index)

# Remove rare/uncommon values (assuming they are below a threshold or beyond a threshold)
df = df[(df['Total_aob_energy'] > df['Total_aob_energy'].quantile(0.01)) & 
        (df['Total_aob_energy'] < df['Total_aob_energy'].quantile(0.99))]

# Define the date ranges for training and testing
train_data_split1 = df[((df.index.year == 2021) | (df.index.year == 2022)) | 
                       ((df.index.year == 2023) & (df.index.day <= 14))]
test_data_split1 = df[(df.index.year == 2023) & (df.index.day > 14)]

# Split data into features and target
X_train = train_data_split1.drop(['Total_aob_energy'], axis=1)
y_train = train_data_split1['Total_aob_energy']
X_test = test_data_split1.drop(['Total_aob_energy'], axis=1)
y_test = test_data_split1['Total_aob_energy']

# # Train and evaluate the model for Split 1
# model_split1 = LinearRegression()
# model_split1.fit(X_train_split1, y_train_split1)
# y_pred_split1 = model_split1.predict(X_test_split1)
# rmse_split1 = mean_squared_error(y_test_split1, y_pred_split1, squared=False)
# print(f"Split 1 RMSE: {rmse_split1}")

# # Verify the date ranges
# print("Split 1 - Train set:", X_train_split1.index.min(), "to", X_train_split1.index.max())
# print("Split 1 - Test set:", X_test_split1.index.min(), "to", X_test_split1.index.max())
# Split the data into training and testing sets using time series split
# tscv = TimeSeriesSplit(n_splits=5)
# tscv = TimeSeriesSplit(n_splits=2)
# for train_index, test_index in tscv.split(X):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# print(X_test, y_test)
# X_train = train_data.drop(['Total_aob_energy'], axis=1)
# y_train = train_data['Total_aob_energy']
# X_test = test_data.drop(['Total_aob_energy'], axis=1)
# y_test = test_data['Total_aob_energy']

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [88]:
# # Define the date ranges for training and testing
# train_data_split2 = df[((df.index.year == 2021) | (df.index.year == 2022)) | 
#                        ((df.index.year == 2023) & (df.index.day > 14))]
# test_data_split2 = df[(df.index.year == 2023) & (df.index.day <= 14)]

# # Split data into features and target
# X_train_split2 = train_data_split2.drop(['Total_aob_energy'], axis=1)
# y_train_split2 = train_data_split2['Total_aob_energy']
# X_test_split2 = test_data_split2.drop(['Total_aob_energy'], axis=1)
# y_test_split2 = test_data_split2['Total_aob_energy']

# # Train and evaluate the model for Split 2
# model_split2 = LinearRegression()
# model_split2.fit(X_train_split2, y_train_split2)
# y_pred_split2 = model_split2.predict(X_test_split2)
# rmse_split2 = mean_squared_error(y_test_split2, y_pred_split2, squared=False)
# print(f"Split 2 RMSE: {rmse_split2}")

# # Verify the date ranges
# print("Split 2 - Train set:", X_train_split2.index.min(), "to", X_train_split2.index.max())
# print("Split 2 - Test set:", X_test_split2.index.min(), "to", X_test_split2.index.max())

In [89]:
def evaluate_models(X_train, y_train, X_test, y_test):
    models = {
        "Linear Regression": LinearRegression(),
        "Lasso": Lasso(),
        "Random Forest": RandomForestRegressor(),
        "SVR": SVR(),
        "Gradient Boosting": GradientBoostingRegressor()
    }
    
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        results[name] = rmse
        print(f"{name} RMSE: {rmse}")
    return results

In [90]:
print("Evaluating models on Split 1:")
results_split1 = evaluate_models(X_train_split1, y_train_split1, X_test_split1, y_test_split1)

print("\nEvaluating models on Split 2:")
results_split2 = evaluate_models(X_train_split2, y_train_split2, X_test_split2, y_test_split2)

# Compare the results
print("\nComparison of RMSE for different models on both splits:")
comparison = pd.DataFrame({'Split 1': results_split1, 'Split 2': results_split2})
print(comparison)

Evaluating models on Split 1:
Linear Regression RMSE: 0.5648288142239578
Lasso RMSE: 0.6481424795914933



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest RMSE: 0.5229638711440869



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



SVR RMSE: 0.7396391904427702



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting RMSE: 0.5292810845213765

Evaluating models on Split 2:
Linear Regression RMSE: 0.5738316002497975
Lasso RMSE: 0.6713133542459168



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest RMSE: 0.5229947999167903



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



SVR RMSE: 0.7580266804156399
Gradient Boosting RMSE: 0.5243038991582571

Comparison of RMSE for different models on both splits:
                    Split 1   Split 2
Linear Regression  0.564829  0.573832
Lasso              0.648142  0.671313
Random Forest      0.522964  0.522995
SVR                0.739639  0.758027
Gradient Boosting  0.529281  0.524304



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



In [91]:
# Split the data into training and testing sets without any specific split
# Here we assume training data includes all data up to 2022, and testing data includes all data from 2023
train_data_no_split = df[df.index.year <= 2022]
test_data_no_split = df[df.index.year == 2023]

# Split data into features and target for No Split scenario
X_train_no_split = train_data_no_split.drop(['Total_aob_energy'], axis=1)
y_train_no_split = train_data_no_split['Total_aob_energy']
X_test_no_split = test_data_no_split.drop(['Total_aob_energy'], axis=1)
y_test_no_split = test_data_no_split['Total_aob_energy']

In [92]:
print("Evaluating models on No Splitting Scenario:")
results_no_split = evaluate_models(X_train_no_split, y_train_no_split, X_test_no_split, y_test_no_split)

print("\nEvaluating models on Split 1 (Training on first 2 weeks of each month of 2023):")
results_split1 = evaluate_models(X_train_split1, y_train_split1, X_test_split1, y_test_split1)

# Compare the results
print("\nComparison of RMSE for different models on No Splitting and Split 1:")
comparison = pd.DataFrame({'No Split': results_no_split, 'Split 1': results_split1})
print(comparison)


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Evaluating models on No Splitting Scenario:
Linear Regression RMSE: 0.919279108987351
Lasso RMSE: 1.0117330935267714



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest RMSE: 0.9043626350805807



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



SVR RMSE: 1.3414626158466347



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting RMSE: 0.8912791543212577

Evaluating models on Split 1 (Training on first 2 weeks of each month of 2023):
Linear Regression RMSE: 0.5648288142239578
Lasso RMSE: 0.6481424795914933



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest RMSE: 0.5216630105288359



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



SVR RMSE: 0.7396391904427702
Gradient Boosting RMSE: 0.5292810845213765

Comparison of RMSE for different models on No Splitting and Split 1:
                   No Split   Split 1
Linear Regression  0.919279  0.564829
Lasso              1.011733  0.648142
Random Forest      0.904363  0.521663
SVR                1.341463  0.739639
Gradient Boosting  0.891279  0.529281



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



In [93]:

# Feature Selection with Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train)
lasso_features = (lasso.coef_ != 0)
print(lasso_features)

[ True  True  True False  True False  True  True False  True  True  True
  True  True False  True  True False]


In [94]:

# Feature Selection with Random Forest and Boruta
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(rf, n_estimators='auto', random_state=42)
boruta_selector.fit(X_train_scaled, y_train)
boruta_features = boruta_selector.support_
print(boruta_features)


[ True False  True  True  True False  True False  True  True  True  True
  True  True False  True  True False]


In [116]:
from sklearn.feature_selection import RFE
# Initialize the model
# Define the date ranges for training and testing for Split 1
train_data = df[((df.index.year == 2021) | (df.index.year == 2022)) | 
                       ((df.index.year == 2023) & (df.index.day <= 14))]
test_data = df[(df.index.year == 2023) & (df.index.day > 14)]

# Split data into features and target for Split 1
X_train_split1 = train_data.drop(['Total_aob_energy'], axis=1)
y_train_split1 = train_data['Total_aob_energy']
X_test_split1 = test_data.drop(['Total_aob_energy'], axis=1)
y_test_split1 = test_data['Total_aob_energy']

# Check the shape of the data before scaling
print(f"Shape of X_train_split1: {X_train.shape}")
print(f"Shape of y_train_split1: {y_train.shape}")
print(f"Shape of X_test_split1: {X_test.shape}")
print(f"Shape of y_test_split1: {y_test.shape}")

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split1)
X_test_scaled = scaler.transform(X_test_split1)

# Check the shape of the data after scaling
print(f"Shape of X_train_scaled: {X_train_scaled.shape}")
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")

# Initialize the model
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Initialize RFE with the model and step=1 to remove one feature at a time
rfe_selector = RFE(estimator=rf, n_features_to_select=5, step=1)

# Fit RFE
rfe_selector.fit(X_train_scaled, y_train_split1)

# Get the ranking of the features
rfe_ranking = rfe_selector.ranking_

# Get the support of the selected features
rfe_support = rfe_selector.support_

# Check the lengths of ranking and support arrays
print(f"Length of rfe_ranking: {len(rfe_ranking)}")
print(f"Length of rfe_support: {len(rfe_support)}")

# Ensure the lengths match
assert len(rfe_ranking) == X_train_split1.shape[1], "Mismatch in feature count and ranking length"
assert len(rfe_support) == X_train_split1.shape[1], "Mismatch in feature count and support length"

# Map the feature names to their RFE ranking
feature_names = X_train_split1.columns
rfe_ranking_named = {feature_names[i]: rfe_ranking[i] for i in range(len(feature_names))}
rfe_support_named = {feature_names[i]: rfe_support[i] for i in range(len(feature_names))}

print("RFE Ranking of Features:", rfe_ranking_named)
print("Selected Features:", rfe_support_named)

Shape of X_train_split1: (20320, 18)
Shape of y_train_split1: (20320,)
Shape of X_test_split1: (4640, 18)
Shape of y_test_split1: (4640,)
Shape of X_train_scaled: (20320, 18)
Shape of X_test_scaled: (4640, 18)
Length of rfe_ranking: 18
Length of rfe_support: 18
RFE Ranking of Features: {'Weekend': 8, 'Bank holiday': 16, 'Year': 9, 'Temperature': 2, 'Feelslike': 1, 'Weathertype': 12, 'Windspeed': 7, 'Uvindex': 15, 'Precipitationprobability': 1, 'Month_sin': 11, 'Month_cos': 4, 'Hour_sin': 3, 'Hour_cos': 6, 'Day of week_sin': 10, 'Day of week_cos': 13, 'Day of month_sin': 1, 'Day of month_cos': 5, 'Working_hours': 14}
Selected Features: {'Weekend': False, 'Bank holiday': False, 'Year': False, 'Temperature': False, 'Feelslike': True, 'Weathertype': False, 'Windspeed': False, 'Uvindex': False, 'Precipitationprobability': True, 'Month_sin': False, 'Month_cos': False, 'Hour_sin': False, 'Hour_cos': False, 'Day of week_sin': False, 'Day of week_cos': False, 'Day of month_sin': True, 'Day of m

In [None]:
estimator_SVR = SVR(kernel="linear")
estimator_lasso = Lasso(alpha=0.01)
estimator_rf =RandomForestRegressor(n_jobs=-1, random_state=42)
estimator_gradientvoosting = GradientBoostingRegressor()



# Initialize RFE with the model and step=1 to remove one feature at a time
rfe_selector = RFE(estimator=rf, n_features_to_select=5, step=1)

# Fit RFE
rfe_selector.fit(X_train_scaled, y_train_split1)

# Get the ranking of the features
rfe_ranking = rfe_selector.ranking_

# Get the support of the selected features
rfe_support = rfe_selector.support_

# Check the lengths of ranking and support arrays
print(f"Length of rfe_ranking: {len(rfe_ranking)}")
print(f"Length of rfe_support: {len(rfe_support)}")

# Ensure the lengths match
assert len(rfe_ranking) == X_train_split1.shape[1], "Mismatch in feature count and ranking length"
assert len(rfe_support) == X_train_split1.shape[1], "Mismatch in feature count and support length"

# Map the feature names to their RFE ranking
feature_names = X_train_split1.columns
rfe_ranking_named = {feature_names[i]: rfe_ranking[i] for i in range(len(feature_names))}
rfe_support_named = {feature_names[i]: rfe_support[i] for i in range(len(feature_names))}

print("RFE Ranking of Features:", rfe_ranking_named)
print("Selected Features:", rfe_support_named)

In [117]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split1)
X_test_scaled = scaler.transform(X_test_split1)

# List of models to evaluate
models = {
    "Random Forest": RandomForestRegressor(n_jobs=-1, random_state=42),
    "SVR": SVR(kernel="linear"),
    "Lasso": Lasso(alpha=0.01),
    "Gradient Boosting": GradientBoostingRegressor()
}

# Dictionary to store the top features for each model
top_features = {}

for model_name, model in models.items():
    # Initialize RFE with the model and step=1 to remove one feature at a time
    rfe_selector = RFE(estimator=model, n_features_to_select=1, step=1)

    # Fit RFE
    rfe_selector.fit(X_train_scaled, y_train_split1)

    # Get the ranking of the features
    rfe_ranking = rfe_selector.ranking_

    # Map the feature names to their RFE ranking
    feature_names = X_train_split1.columns
    rfe_ranking_named = {feature_names[i]: rfe_ranking[i] for i in range(len(feature_names))}

    # Sort features by ranking and get the top 10
    sorted_features = sorted(rfe_ranking_named.items(), key=lambda x: x[1])
    top_features[model_name] = sorted_features[:10]

    # Print the top features for the model
    print(f"Top 10 Features for {model_name}:")
    for feature, rank in top_features[model_name]:
        print(f"{feature}: {rank}")
    print("\n")

# Create a DataFrame to compare the top features for each model
comparison_df = pd.DataFrame({model: [feature for feature, rank in top_features[model]] for model in top_features})
print(comparison_df)

Top 10 Features for Random Forest:
Feelslike: 1
Precipitationprobability: 2
Day of month_sin: 3
Temperature: 4
Hour_sin: 5
Month_cos: 6
Day of month_cos: 7
Hour_cos: 8
Windspeed: 9
Weekend: 10


Top 10 Features for SVR:
Feelslike: 1
Hour_cos: 2
Temperature: 3
Windspeed: 4
Month_cos: 5
Month_sin: 6
Hour_sin: 7
Year: 8
Weekend: 9
Day of month_sin: 10


Top 10 Features for Lasso:
Feelslike: 1
Hour_cos: 2
Hour_sin: 3
Month_cos: 4
Weekend: 5
Day of month_sin: 6
Windspeed: 7
Uvindex: 8
Year: 9
Month_sin: 10


Top 10 Features for Gradient Boosting:
Feelslike: 1
Temperature: 2
Hour_sin: 3
Month_cos: 4
Weekend: 5
Hour_cos: 6
Year: 7
Month_sin: 8
Day of month_sin: 9
Day of week_sin: 10


              Random Forest               SVR             Lasso  \
0                 Feelslike         Feelslike         Feelslike   
1  Precipitationprobability          Hour_cos          Hour_cos   
2          Day of month_sin       Temperature          Hour_sin   
3               Temperature         Windspeed

In [95]:

# Feature Selection with SHAP and RandomForest
rf_shap = RandomForestRegressor(n_estimators=100, random_state=42)
rf_shap.fit(X_test_scaled, y_test)
explainer = shap.TreeExplainer(rf_shap)
shap_values = explainer.shap_values(X_test_scaled)
shap_sum = np.abs(shap_values).mean(axis=0)
shap_features = shap_sum > np.percentile(shap_sum, 75)  # Top 25% SHAP values
print(shap_features)


[ True False False  True  True False False False False False False  True
  True False False False False False]


In [96]:
from sklearn.svm import SVR

# Fit an SVR model
svr = SVR(kernel='linear')
svr.fit(X_train_scaled, y_train)

# Extract feature importance based on the coefficients
svr_features = np.abs(svr.coef_[0]) > np.percentile(np.abs(svr.coef_[0]), 75)  # Top 25% features
print("Features selected by SVR (Top 25%):", svr_features)

Features selected by SVR (Top 25%): [False False False  True  True False  True False False  True  True False
 False False False False False False]


In [97]:

# Feature Selection with mRMR
# define MI_FS feature selection method
# print('mifs')
# feat_selector = mifs.MutualInformationFeatureSelector()
# feat_selector.fit(X_train_scaled,y_train)
# # check selected features
# feat_selector._support_mask

# # check ranking of features
# feat_selector.ranking_

# # call transform() on X to filter it down to selected features
# X_filtered = feat_selector.transform(X_train_scaled)
# X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# selected_features_mrmr = pymrmr.mRMR(X_train_df, 'MIQ', 10)  # Top 10 features

# Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train_scaled[:, boruta_features], y_train)  # Change as needed for each FS method
    y_pred = model.predict(X_test_scaled[:, boruta_features])
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} with Boruta FS - RMSE: {rmse:.4f}, R²: {r2:.4f}")

# Compare and display selected features for each method
print("Features selected by Lasso:", X_train.columns[lasso_features].tolist())
print("Features selected by Boruta:", X_train.columns[boruta_features].tolist())
print("Features selected by SHAP (Top 25%):", X_train.columns[shap_features].tolist())
print("Features selected by SVR (Top 25%):", X_train.columns[svr_features].tolist())

# print("Features selected by mRMR (Top 10):", X_filtered)


Linear Regression with Boruta FS - RMSE: 0.8993, R²: 0.4058
Random Forest with Boruta FS - RMSE: 0.8302, R²: 0.4937
Gradient Boosting with Boruta FS - RMSE: 0.8298, R²: 0.4941
Features selected by Lasso: ['Weekend', 'Bank holiday', 'Year', 'Feelslike', 'Windspeed', 'Uvindex', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'Day of week_sin', 'Day of month_sin', 'Day of month_cos']
Features selected by Boruta: ['Weekend', 'Year', 'Temperature', 'Feelslike', 'Windspeed', 'Precipitationprobability', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'Day of week_sin', 'Day of month_sin', 'Day of month_cos']
Features selected by SHAP (Top 25%): ['Weekend', 'Temperature', 'Feelslike', 'Hour_sin', 'Hour_cos']
Features selected by SVR (Top 25%): ['Temperature', 'Feelslike', 'Windspeed', 'Month_sin', 'Month_cos']


In [118]:

def cv_rmse(y_true, y_pred):
    """
    Calculate the Coefficient of Variation of Root Mean Square Error (CV(RMSE))
    :param y_true: array-like of true values
    :param y_pred: array-like of predicted values
    :return: CV(RMSE) value
    """
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    mean_y_true = np.mean(y_true)
    cv_rmse_value = rmse / mean_y_true
    return cv_rmse_value

def md_mape(y_true, y_pred):
    """
    Calculate the Median Absolute Percentage Error (MD(MAPE))
    :param y_true: array-like of true values
    :param y_pred: array-like of predicted values
    :return: MD(MAPE) value
    """
    ape = np.abs((y_true - y_pred) / y_true) * 100
    md_mape_value = np.median(ape)
    return md_mape_value

In [98]:
def combine_feature_importances(X_columns, lasso_features, boruta_features, shap_features, svr_features, weights=None):
    if weights is None:
        weights = {'lasso': 1, 'boruta': 1, 'shap': 1, 'svr': 1}
    
    # Initialize scores array
    feature_scores = np.zeros(len(X_columns))

    # Assign scores based on feature selection outcomes
    feature_scores += weights['lasso'] * lasso_features
    feature_scores += weights['boruta'] * boruta_features
    feature_scores += weights['shap'] * shap_features
    feature_scores += weights['svr'] * svr_features

    # Normalize scores
    max_score = np.max(feature_scores)
    if max_score > 0:
        feature_scores /= max_score
    
    # Decide on a threshold to select features
    selected_features = feature_scores > 0.5  # Select features with a score above 0.5
   

    return selected_features

# Combine feature importances
combined_features = combine_feature_importances(
    X_columns=X_train.columns, 
    lasso_features=lasso_features,
    boruta_features=boruta_features,
    shap_features=shap_features,
    svr_features=svr_features
    #weights={'lasso': 1, 'boruta': 1.5, 'shap': 1.5, 'svr': 1}
)

print("Combined Feature Selection:")
print(X_train.columns[combined_features])

Combined Feature Selection:
Index(['Weekend', 'Temperature', 'Feelslike', 'Windspeed', 'Month_sin',
       'Month_cos', 'Hour_sin', 'Hour_cos'],
      dtype='object')


In [119]:
# Filter training and testing sets with combined features
X_train_selected = X_train.loc[:, combined_features]
X_test_selected = X_test.loc[:, combined_features]
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR()
}

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Training Errors
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_smape = smape(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_cv_rmse = cv_rmse(y_train, y_train_pred)
    train_mdmape = md_mape(y_train, y_train_pred)

    # Testing Errors
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_smape = smape(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_cv_rmse = cv_rmse(y_test, y_test_pred)
    test_mdmape = md_mape(y_test, y_test_pred)

    # Normalized RMSE
    y_mean = np.mean(y_test)
    test_nrmse = test_rmse / y_mean

    print(f"{model_name}:")
    print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
    print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
    print(f"  Test NRMSE: {test_nrmse:.4f}")
    print(f"  Train CV RMSE: {train_cv_rmse:.4f}, Test CV RMSE: {test_cv_rmse:.4f}")
    print(f"  Train Median Mape: {train_mdmape:.4f}, Test Median Mape: {test_mdmape:.4f}")


Linear Regression:
  Train RMSE: 0.9751, Test RMSE: 0.9113
  Train MAE: 0.7397, Test MAE: 0.6585
  Train SMAPE: 41.18%, Test SMAPE: 33.49%
  Train R^2: 0.3980, Test R^2: 0.3898
  Test NRMSE: 0.4400
  Train CV RMSE: 0.5238, Test CV RMSE: 0.4400
  Train Median Mape: 36.2305, Test Median Mape: 27.8013
Random Forest:
  Train RMSE: 0.3759, Test RMSE: 0.9656
  Train MAE: 0.2342, Test MAE: 0.6494
  Train SMAPE: 12.72%, Test SMAPE: 30.42%
  Train R^2: 0.9106, Test R^2: 0.3149
  Test NRMSE: 0.4662
  Train CV RMSE: 0.2019, Test CV RMSE: 0.4662
  Train Median Mape: 8.8152, Test Median Mape: 22.1923
Gradient Boosting:
  Train RMSE: 0.8252, Test RMSE: 0.8684
  Train MAE: 0.5782, Test MAE: 0.5940
  Train SMAPE: 29.92%, Test SMAPE: 28.13%
  Train R^2: 0.5689, Test R^2: 0.4460
  Test NRMSE: 0.4192
  Train CV RMSE: 0.4433, Test CV RMSE: 0.4192
  Train Median Mape: 24.3147, Test Median Mape: 21.3548
Ridge:
  Train RMSE: 0.9751, Test RMSE: 0.9113
  Train MAE: 0.7397, Test MAE: 0.6585
  Train SMAPE: 41.18

In [120]:
# Filter training and testing sets with combined features
X_train_selected = X_train.loc[:, shap_features]
X_test_selected = X_test.loc[:, shap_features]
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR()
}

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

   # Training Errors
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_smape = smape(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_cv_rmse = cv_rmse(y_train, y_train_pred)
    train_mdmape = md_mape(y_train, y_train_pred)

    # Testing Errors
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_smape = smape(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_cv_rmse = cv_rmse(y_test, y_test_pred)
    test_mdmape = md_mape(y_test, y_test_pred)

    # Normalized RMSE
    y_mean = np.mean(y_test)
    test_nrmse = test_rmse / y_mean

    print(f"{model_name}:")
    print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
    print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
    print(f"  Test NRMSE: {test_nrmse:.4f}")
    print(f"  Train CV RMSE: {train_cv_rmse:.4f}, Test CV RMSE: {test_cv_rmse:.4f}")
    print(f"  Train Median Mape: {train_mdmape:.4f}, Test Median Mape: {test_mdmape:.4f}")


Linear Regression:
  Train RMSE: 0.9938, Test RMSE: 0.9148
  Train MAE: 0.7566, Test MAE: 0.6567
  Train SMAPE: 42.52%, Test SMAPE: 32.29%
  Train R^2: 0.3747, Test R^2: 0.3851
  Test NRMSE: 0.4416
  Train CV RMSE: 0.5339, Test CV RMSE: 0.4416
  Train Median Mape: 37.5473, Test Median Mape: 27.4366
Random Forest:
  Train RMSE: 0.7441, Test RMSE: 0.8986
  Train MAE: 0.5001, Test MAE: 0.6125
  Train SMAPE: 25.78%, Test SMAPE: 28.75%
  Train R^2: 0.6494, Test R^2: 0.4067
  Test NRMSE: 0.4338
  Train CV RMSE: 0.3997, Test CV RMSE: 0.4338
  Train Median Mape: 19.8100, Test Median Mape: 21.9836
Gradient Boosting:
  Train RMSE: 0.8670, Test RMSE: 0.8454
  Train MAE: 0.6049, Test MAE: 0.5779
  Train SMAPE: 31.03%, Test SMAPE: 27.01%
  Train R^2: 0.5241, Test R^2: 0.4749
  Test NRMSE: 0.4081
  Train CV RMSE: 0.4657, Test CV RMSE: 0.4081
  Train Median Mape: 25.2373, Test Median Mape: 20.4758
Ridge:
  Train RMSE: 0.9938, Test RMSE: 0.9148
  Train MAE: 0.7566, Test MAE: 0.6567
  Train SMAPE: 42.5

In [121]:
# Filter training and testing sets with combined features
X_train_selected = X_train.loc[:, boruta_features]
X_test_selected = X_test.loc[:, boruta_features]
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR()
}

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

   # Training Errors
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_smape = smape(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_cv_rmse = cv_rmse(y_train, y_train_pred)
    train_mdmape = md_mape(y_train, y_train_pred)

    # Testing Errors
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_smape = smape(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_cv_rmse = cv_rmse(y_test, y_test_pred)
    test_mdmape = md_mape(y_test, y_test_pred)

    # Normalized RMSE
    y_mean = np.mean(y_test)
    test_nrmse = test_rmse / y_mean

    print(f"{model_name}:")
    print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
    print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
    print(f"  Test NRMSE: {test_nrmse:.4f}")
    print(f"  Train CV RMSE: {train_cv_rmse:.4f}, Test CV RMSE: {test_cv_rmse:.4f}")
    print(f"  Train Median Mape: {train_mdmape:.4f}, Test Median Mape: {test_mdmape:.4f}")


Linear Regression:
  Train RMSE: 0.9628, Test RMSE: 0.8993
  Train MAE: 0.7267, Test MAE: 0.6507
  Train SMAPE: 40.67%, Test SMAPE: 32.77%
  Train R^2: 0.4131, Test R^2: 0.4058
  Test NRMSE: 0.4341
  Train CV RMSE: 0.5172, Test CV RMSE: 0.4341
  Train Median Mape: 35.5204, Test Median Mape: 27.3019
Random Forest:
  Train RMSE: 0.2249, Test RMSE: 0.8302
  Train MAE: 0.1493, Test MAE: 0.5689
  Train SMAPE: 8.39%, Test SMAPE: 25.74%
  Train R^2: 0.9680, Test R^2: 0.4937
  Test NRMSE: 0.4008
  Train CV RMSE: 0.1208, Test CV RMSE: 0.4008
  Train Median Mape: 6.6519, Test Median Mape: 20.4822
Gradient Boosting:
  Train RMSE: 0.7858, Test RMSE: 0.8298
  Train MAE: 0.5455, Test MAE: 0.5746
  Train SMAPE: 28.14%, Test SMAPE: 26.19%
  Train R^2: 0.6090, Test R^2: 0.4941
  Test NRMSE: 0.4006
  Train CV RMSE: 0.4221, Test CV RMSE: 0.4006
  Train Median Mape: 23.4377, Test Median Mape: 22.1568
Ridge:
  Train RMSE: 0.9628, Test RMSE: 0.8993
  Train MAE: 0.7267, Test MAE: 0.6507
  Train SMAPE: 40.67%

In [122]:
# Filter training and testing sets with combined features
X_train_selected = X_train.loc[:, svr_features]
X_test_selected = X_test.loc[:, svr_features]
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR()
}

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Training Errors
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_smape = smape(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_cv_rmse = cv_rmse(y_train, y_train_pred)
    train_mdmape = md_mape(y_train, y_train_pred)

    # Testing Errors
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_smape = smape(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_cv_rmse = cv_rmse(y_test, y_test_pred)
    test_mdmape = md_mape(y_test, y_test_pred)

    # Normalized RMSE
    y_mean = np.mean(y_test)
    test_nrmse = test_rmse / y_mean

    print(f"{model_name}:")
    print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
    print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
    print(f"  Test NRMSE: {test_nrmse:.4f}")
    print(f"  Train CV RMSE: {train_cv_rmse:.4f}, Test CV RMSE: {test_cv_rmse:.4f}")
    print(f"  Train Median Mape: {train_mdmape:.4f}, Test Median Mape: {test_mdmape:.4f}")


Linear Regression:
  Train RMSE: 1.0233, Test RMSE: 0.9808
  Train MAE: 0.7718, Test MAE: 0.6763
  Train SMAPE: 41.34%, Test SMAPE: 32.11%
  Train R^2: 0.3371, Test R^2: 0.2932
  Test NRMSE: 0.4735
  Train CV RMSE: 0.5497, Test CV RMSE: 0.4735
  Train Median Mape: 36.8746, Test Median Mape: 25.2924
Random Forest:
  Train RMSE: 0.8103, Test RMSE: 1.1085
  Train MAE: 0.5580, Test MAE: 0.7534
  Train SMAPE: 29.09%, Test SMAPE: 35.00%
  Train R^2: 0.5843, Test R^2: 0.0973
  Test NRMSE: 0.5351
  Train CV RMSE: 0.4353, Test CV RMSE: 0.5351
  Train Median Mape: 23.9746, Test Median Mape: 26.4015
Gradient Boosting:
  Train RMSE: 0.9297, Test RMSE: 0.9971
  Train MAE: 0.6595, Test MAE: 0.6549
  Train SMAPE: 34.28%, Test SMAPE: 29.51%
  Train R^2: 0.4528, Test R^2: 0.2695
  Test NRMSE: 0.4814
  Train CV RMSE: 0.4994, Test CV RMSE: 0.4814
  Train Median Mape: 31.8002, Test Median Mape: 20.4599
Ridge:
  Train RMSE: 1.0233, Test RMSE: 0.9808
  Train MAE: 0.7719, Test MAE: 0.6763
  Train SMAPE: 41.3

In [123]:
# Filter training and testing sets with combined features
X_train_selected = X_train.loc[:, lasso_features]
X_test_selected = X_test.loc[:, lasso_features]
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR()
}

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-10))

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Training Errors
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_smape = smape(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_cv_rmse = cv_rmse(y_train, y_train_pred)
    train_mdmape = md_mape(y_train, y_train_pred)

    # Testing Errors
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_smape = smape(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_cv_rmse = cv_rmse(y_test, y_test_pred)
    test_mdmape = md_mape(y_test, y_test_pred)

    # Normalized RMSE
    y_mean = np.mean(y_test)
    test_nrmse = test_rmse / y_mean

    print(f"{model_name}:")
    print(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    print(f"  Train SMAPE: {train_smape:.2f}%, Test SMAPE: {test_smape:.2f}%")
    print(f"  Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
    print(f"  Test NRMSE: {test_nrmse:.4f}")
    print(f"  Train CV RMSE: {train_cv_rmse:.4f}, Test CV RMSE: {test_cv_rmse:.4f}")
    print(f"  Train Median Mape: {train_mdmape:.4f}, Test Median Mape: {test_mdmape:.4f}")


Linear Regression:
  Train RMSE: 0.9640, Test RMSE: 0.8968
  Train MAE: 0.7307, Test MAE: 0.6490
  Train SMAPE: 41.29%, Test SMAPE: 32.97%
  Train R^2: 0.4117, Test R^2: 0.4091
  Test NRMSE: 0.4329
  Train CV RMSE: 0.5178, Test CV RMSE: 0.4329
  Train Median Mape: 35.8688, Test Median Mape: 27.6887
Random Forest:
  Train RMSE: 0.2247, Test RMSE: 0.8398
  Train MAE: 0.1480, Test MAE: 0.5756
  Train SMAPE: 8.30%, Test SMAPE: 26.05%
  Train R^2: 0.9680, Test R^2: 0.4819
  Test NRMSE: 0.4054
  Train CV RMSE: 0.1207, Test CV RMSE: 0.4054
  Train Median Mape: 6.5730, Test Median Mape: 20.6916
Gradient Boosting:
  Train RMSE: 0.7869, Test RMSE: 0.8224
  Train MAE: 0.5456, Test MAE: 0.5715
  Train SMAPE: 28.12%, Test SMAPE: 26.24%
  Train R^2: 0.6079, Test R^2: 0.5031
  Test NRMSE: 0.3970
  Train CV RMSE: 0.4227, Test CV RMSE: 0.3970
  Train Median Mape: 23.3417, Test Median Mape: 22.7441
Ridge:
  Train RMSE: 0.9640, Test RMSE: 0.8968
  Train MAE: 0.7307, Test MAE: 0.6490
  Train SMAPE: 41.29%

In [104]:
from matplotlib.dates import DateFormatter, HourLocator
# Create a DataFrame for easier plotting
df_error = pd.DataFrame({
    'Actual Consumption': y_test,
    'Predicted Consumption': y_test_pred
})


# # Ensure the DataFrame is sorted by date if it's not already sorted
# df_error.sort_index(inplace=True)

# # Set the theme for seaborn plots
# sns.set_theme(style="whitegrid")

# # Determine the number of one-week periods in the test data
# one_week = pd.Timedelta(weeks=1)
# start_date = df_error.index.min()
# end_date = df_error.index.max()
# num_plots = (end_date - start_date) // one_week + 1

# # Generate subplots for each one-week period
# fig, axes = plt.subplots(num_plots, 1, figsize=(14, 5*num_plots), sharey=True)

# for i in range(num_plots):
#     start_period = start_date + i * one_week
#     end_period = min(start_date + (i+1) * one_week, end_date)
    
#     # Filter data for the current one-week period
#     df_period = df_error[start_period:end_period]

#     # Plot actual vs predicted consumption for the current period
#     axes[i].plot(df_period.index, df_period['Actual Consumption'], label='Actual Consumption', color='blue', linewidth=2)
#     axes[i].plot(df_period.index, df_period['Predicted Consumption'], label='Predicted Consumption', color='red', linestyle='--', linewidth=2)

#     axes[i].set_title(f'Energy Consumption: {start_period.strftime("%b %d, %Y")} - {end_period.strftime("%b %d, %Y")}')
#     axes[i].set_ylabel('Energy Consumption')
#     axes[i].legend()
    
#     # Set major and minor ticks for the x-axis
#     axes[i].xaxis.set_major_locator(HourLocator(interval=4))  # Major ticks every 4 hours
#     axes[i].xaxis.set_minor_locator(HourLocator())  # Minor ticks for each hour
#     axes[i].xaxis.set_major_formatter(DateFormatter('%b %d\n%H:%M'))  # Month Day and Hour:Minute
#     axes[i].tick_params(axis='x', which='both', rotation=90)

# plt.tight_layout()  # Adjust the layout to make room for rotated date labels
# plt.show()



In [105]:
import plotly.graph_objects as go

In [106]:
df_error = df_error.reset_index("Time")

In [107]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_error["Time"].to_numpy(), y=df_error["Actual Consumption"].to_numpy(),
                    mode='lines+markers',
                    name='Actual Consumption'))
fig.add_trace(go.Scatter(x=df_error["Time"].to_numpy(), y=df_error["Predicted Consumption"].to_numpy(),
                    mode='lines+markers',
                    name='Predicted Consumption'))


In [108]:
import plotly
plotly.offline.plot(fig, filename='comparison_chart.html')

'comparison_chart.html'