In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Silence every warning for the rest of the kernel session: no category or
# module filter is passed, so the 'ignore' action applies to all warnings.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries — consider narrowing the filter to the specific warning intended.
warnings.filterwarnings('ignore')

In [16]:
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
dataset_path = "Energy_consumption_dataset.csv"
df = pd.read_csv(dataset_path)

In [17]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,Month,Hour,DayOfWeek,Holiday,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,EnergyConsumption
0,1,0,Saturday,No,25.139433,43.431581,1565.693999,5,On,Off,2.774699,75.364373
1,1,1,Saturday,No,27.731651,54.225919,1411.064918,1,On,On,21.831384,83.401855
2,1,2,Saturday,No,28.704277,58.907658,1755.715009,2,Off,Off,6.764672,78.270888
3,1,3,Saturday,No,20.080469,50.371637,1452.316318,1,Off,On,8.623447,56.51985
4,1,4,Saturday,No,23.097359,51.401421,1094.130359,9,On,Off,3.071969,70.811732


In [18]:
def feature_conversion(df):
    """Encode the raw energy-consumption frame into model-ready features.

    Steps:
      1. Cast ``DayOfWeek``, ``Holiday``, ``HVACUsage`` and ``LightingUsage``
         to categoricals with a fixed, known set of levels.
      2. Add ``TotalUsage`` = HVAC code + lighting code (``Off`` -> 0,
         ``On`` -> 1, missing -> -1 per pandas' ``cat.codes`` convention).
      3. One-hot encode every categorical/object column with
         ``drop_first=True``.

    The category levels are pinned instead of being inferred from the rows
    that happen to be present. With inferred levels, a frame that contains
    only a subset of the labels (for example a single row at prediction
    time) gets different integer codes and loses dummy columns, so its
    output no longer lines up with the columns produced for the full
    dataset. The pinned levels are listed alphabetically, which is the
    order pandas infers on the full dataset, so the output for a frame
    containing every level is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four categorical columns named above. Any other
        columns are passed through (object/category ones are one-hot
        encoded by ``pd.get_dummies``).

    Returns
    -------
    pandas.DataFrame
        A new encoded frame. The input ``df`` is not modified.

    Raises
    ------
    KeyError
        If one of the four categorical columns is missing from ``df``.
    """
    category_levels = {
        'DayOfWeek': ['Friday', 'Monday', 'Saturday', 'Sunday',
                      'Thursday', 'Tuesday', 'Wednesday'],
        'Holiday': ['No', 'Yes'],
        'HVACUsage': ['Off', 'On'],
        'LightingUsage': ['Off', 'On'],
    }

    # Work on a copy so the caller's frame keeps its original dtypes/columns.
    frame = df.copy()

    # Convert categorical columns to fixed-level categorical dtypes
    for col, known in category_levels.items():
        observed = frame[col].dropna().unique()
        # Unexpected labels are appended after the known levels rather than
        # silently turned into NaN; known levels keep stable codes either way.
        extras = sorted((v for v in observed if v not in known), key=str)
        frame[col] = frame[col].astype(pd.CategoricalDtype(categories=known + extras))

    # Create a new feature: TotalUsage
    frame['TotalUsage'] = frame['HVACUsage'].cat.codes + frame['LightingUsage'].cat.codes

    # Convert categorical variables to dummy variables; the first level of each
    # categorical (Friday / No / Off) is the dropped baseline.
    df_encoded = pd.get_dummies(frame, drop_first=True)

    return df_encoded


In [19]:
# Run the raw frame through the encoding step defined above
df_encoded = df.pipe(feature_conversion)

In [20]:

# Separate the regression target from the predictor columns
target_col = 'EnergyConsumption'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

X

Unnamed: 0,Month,Hour,Temperature,Humidity,SquareFootage,Occupancy,RenewableEnergy,TotalUsage,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,Holiday_Yes,HVACUsage_On,LightingUsage_On
0,1,0,25.139433,43.431581,1565.693999,5,2.774699,1,False,True,False,False,False,False,False,True,False
1,1,1,27.731651,54.225919,1411.064918,1,21.831384,2,False,True,False,False,False,False,False,True,True
2,1,2,28.704277,58.907658,1755.715009,2,6.764672,0,False,True,False,False,False,False,False,False,False
3,1,3,20.080469,50.371637,1452.316318,1,8.623447,1,False,True,False,False,False,False,False,False,True
4,1,4,23.097359,51.401421,1094.130359,9,3.071969,1,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,12,6,26.338718,52.580000,1563.567259,7,20.591717,2,False,False,True,False,False,False,True,True,True
4996,12,17,20.007565,42.765607,1999.982252,5,7.536319,1,True,False,False,False,False,False,False,False,True
4997,12,13,26.226253,30.015975,1999.982252,5,28.162193,1,False,False,False,True,False,False,True,False,True
4998,12,8,24.673206,50.223939,1240.811298,2,20.918483,2,False,True,False,False,False,False,True,True,True


In [21]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a random forest with default hyperparameters (seeded for repeatability)
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows: RMSE is the square root of the mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

np.float64(7.997264011113966)

In [23]:
# Display the feature matrix again (same frame that was shown when X was defined)
X

Unnamed: 0,Month,Hour,Temperature,Humidity,SquareFootage,Occupancy,RenewableEnergy,TotalUsage,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,Holiday_Yes,HVACUsage_On,LightingUsage_On
0,1,0,25.139433,43.431581,1565.693999,5,2.774699,1,False,True,False,False,False,False,False,True,False
1,1,1,27.731651,54.225919,1411.064918,1,21.831384,2,False,True,False,False,False,False,False,True,True
2,1,2,28.704277,58.907658,1755.715009,2,6.764672,0,False,True,False,False,False,False,False,False,False
3,1,3,20.080469,50.371637,1452.316318,1,8.623447,1,False,True,False,False,False,False,False,False,True
4,1,4,23.097359,51.401421,1094.130359,9,3.071969,1,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,12,6,26.338718,52.580000,1563.567259,7,20.591717,2,False,False,True,False,False,False,True,True,True
4996,12,17,20.007565,42.765607,1999.982252,5,7.536319,1,True,False,False,False,False,False,False,False,True
4997,12,13,26.226253,30.015975,1999.982252,5,28.162193,1,False,False,False,True,False,False,True,False,True
4998,12,8,24.673206,50.223939,1240.811298,2,20.918483,2,False,True,False,False,False,False,True,True,True


In [24]:
# Put the RMSE in context by comparing it with the average held-out target value
mean_consumption = y_test.mean()
print("Mean Energy Consumption:", mean_consumption)
print("RMSE:", rmse)
print("Relative RMSE:", rmse / mean_consumption)

Mean Energy Consumption: 76.661094427204
RMSE: 7.997264011113966
Relative RMSE: 0.10431972137715857


In [25]:
import joblib

In [26]:
# Persist the fitted random forest to disk.
joblib.dump(model, 'energy_model.pkl')
# Persist the preprocessing function alongside the model.
# NOTE(review): pickle serialises plain functions by qualified name, not by
# value, so this file presumably holds only a reference to
# `feature_conversion` as defined in this notebook session. Loading it from
# another script will likely fail unless a function with the same name is
# defined in that script's main module — confirm in the consumer, or move
# the function into an importable .py module.
joblib.dump(feature_conversion, 'conversion.pkl')

['conversion.pkl']

In [27]:
# Save the exact column order the model was fitted on
training_columns = X_train.columns.tolist()
joblib.dump(training_columns, 'training_columns.pkl')

['training_columns.pkl']