In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# ============================================
# 1. LOAD DATA
# ============================================
df = pd.read_csv("../dataset/preprocessed_energy_data.csv")

# ============================================
# 2. FEATURE ENGINEERING
# ============================================

# Additional lags (1, 2, 3, 6, 12, 24 hours)
for lag in [1, 2, 3, 6, 12, 24]:
    df[f'lag_{lag}'] = df['Energy_Usage (kWh)'].shift(lag)

# Rolling means & std smoothing
for w in [3, 6, 12]:
    df[f'roll_mean_{w}'] = df['Energy_Usage (kWh)'].rolling(window=w).mean()
    df[f'roll_std_{w}']  = df['Energy_Usage (kWh)'].rolling(window=w).std()

# Seasonal Fourier features (captures patterns very well)
df["sin_hour"] = np.sin(2 * np.pi * df["Hour"] / 24)
df["cos_hour"] = np.cos(2 * np.pi * df["Hour"] / 24)

df["sin_day"] = np.sin(2 * np.pi * df["DayOfWeek"] / 7)
df["cos_day"] = np.cos(2 * np.pi * df["DayOfWeek"] / 7)

# Drop NaNs created from lags
df.dropna(inplace=True)

# ============================================
# 3. OUTLIER HANDLING (CAPPING, NOT DROPPING)
# ============================================
Q1 = df['Energy_Usage (kWh)'].quantile(0.25)
Q3 = df['Energy_Usage (kWh)'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df['Energy_Usage (kWh)'] = np.where(df['Energy_Usage (kWh)'] > upper, upper,
                             np.where(df['Energy_Usage (kWh)'] < lower, lower,
                                      df['Energy_Usage (kWh)']))

# ============================================
# 4. TRAIN / TEST SPLIT
# ============================================
X = df.drop(columns=["Timestamp","Building_ID", "Energy_Usage (kWh)"])
y = df["Energy_Usage (kWh)"]
print("+"*100)
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, shuffle=False   # smaller test = more training data
)

# ============================================
# 5. SAME RANDOM FOREST MODEL (
# ============================================
model = RandomForestRegressor(
    n_estimators=400,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# ============================================
# 6. EVALUATION
# ============================================
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n===== FINAL MODEL PERFORMANCE =====")
print(f"MAE  : {mae:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R²   : {r2:.4f}")

# Feature importance
importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("\n===== TOP FEATURES =====")
print(importances.head(15))

print("Saving the model")
joblib.dump(model,"../models/model.pkl")
# Save feature columns for Streamlit
feature_cols = X.columns.tolist()
joblib.dump(feature_cols, "../models/feature_cols.pkl")



++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Index(['Temperature (°C)', 'Humidity (%)', 'Building_Type', 'Occupancy_Level',
       'Hour', 'DayOfWeek', 'Month', 'lag_1', 'lag_2', 'rolling_mean_3',
       'lag_3', 'lag_6', 'lag_12', 'lag_24', 'roll_mean_3', 'roll_std_3',
       'roll_mean_6', 'roll_std_6', 'roll_mean_12', 'roll_std_12', 'sin_hour',
       'cos_hour', 'sin_day', 'cos_day'],
      dtype='object')

===== FINAL MODEL PERFORMANCE =====
MAE  : 10.5455
RMSE : 13.5429
R²   : 0.9886

===== TOP FEATURES =====
             Feature  Importance
7              lag_1    0.217297
9     rolling_mean_3    0.206414
8              lag_2    0.201819
14       roll_mean_3    0.200178
15        roll_std_3    0.161745
17        roll_std_6    0.001197
19       roll_std_12    0.001035
12            lag_12    0.000984
0   Temperature (°C)    0.000976
13            lag_24    0.000941
11             lag_6    0.000924
10             lag_3    0.0

['../models/feature_cols.pkl']