In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


pd.set_option("display.max_columns", None)

# Dataset credit goes to
#### https://github.com/LuisM78/Appliances-energy-prediction-data
#### https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction

##### The data set is at 10 min for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters).

| Variable Name | Role     | Type        | Description | Units  | Missing Values |
|----------------|----------|-------------|--------------|--------|----------------|
| date           | Feature  | Date        | Timestamp of observation | — | no |
| Appliances     | Target   | Integer     | Energy use in household appliances | Wh | no |
| lights         | Feature  | Integer     | Energy use of lights in the house | Wh | no |
| T1             | Feature  | Continuous  | Temperature in kitchen area | °C | no |
| RH_1           | Feature  | Continuous  | Humidity in kitchen area | % | no |
| T2             | Feature  | Continuous  | Temperature in living room | °C | no |
| RH_2           | Feature  | Continuous  | Humidity in living room | % | no |
| T3             | Feature  | Continuous  | Temperature in laundry room | °C | no |
| RH_3           | Feature  | Continuous  | Humidity in laundry room | % | no |
| T4             | Feature  | Continuous  | Temperature in office room | °C | no |
| RH_4           | Feature  | Continuous  | Humidity in office room | % | no |
| T5             | Feature  | Continuous  | Temperature in bathroom | °C | no |
| RH_5           | Feature  | Continuous  | Humidity in bathroom | % | no |
| T6             | Feature  | Continuous  | Temperature in north bedroom | °C | no |
| RH_6           | Feature  | Continuous  | Humidity in north bedroom | % | no |
| T7             | Feature  | Continuous  | Temperature in ironing room | °C | no |
| RH_7           | Feature  | Continuous  | Humidity in ironing room | % | no |
| T8             | Feature  | Continuous  | Temperature in teenager room | °C | no |
| RH_8           | Feature  | Continuous  | Humidity in teenager room | % | no |
| T9             | Feature  | Continuous  | Temperature in parents’ room | °C | no |
| RH_9           | Feature  | Continuous  | Humidity in parents’ room | % | no |
| T_out          | Feature  | Continuous  | Outdoor temperature | °C | no |
| Press_mm_hg    | Feature  | Continuous  | Outdoor pressure | mm Hg | no |
| RH_out         | Feature  | Continuous  | Outdoor humidity | % | no |
| Windspeed      | Feature  | Continuous  | Wind speed | m/s | no |
| Visibility     | Feature  | Continuous  | Outdoor visibility | km | no |
| Tdewpoint      | Feature  | Continuous  | Dew point temperature | °C | no |
| rv1            | Feature  | Continuous  | Random variable 1 (for testing purposes) | — | no |
| rv2            | Feature  | Continuous  | Random variable 2 (for testing purposes) | — | no |



# Load the Dataset

In [2]:
CSV_LOCATION = "Dataset/energydata_complete.csv"

df = pd.read_csv(CSV_LOCATION)
df.sample(2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
3788,2016-02-07 00:20:00,40,0,22.7,43.2,22.0,42.79,21.6,44.7,19.1,44.433333,19.29,63.831111,10.6,60.866667,18.79,40.005,22.306667,51.172222,19.0,48.363333,10.7,742.433333,65.333333,8.666667,40.0,4.433333,28.006093,28.006093
7135,2016-03-01 06:10:00,50,0,20.29,35.56,17.79,37.59,21.0,36.0,19.633333,33.79,18.34,40.09,-1.356667,76.793333,19.5,29.1,21.23,40.79,18.29,39.0,-1.183333,761.75,98.0,2.0,32.5,-1.483333,38.421342,38.421342


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [4]:
df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [5]:
df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,50.949283,7.910939,54.609083,20.267106,35.3882,22.029107,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,9.022034,6.090347,31.149806,2.109993,5.114208,1.956162,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,29.815,-6.065,1.0,15.39,23.2,16.306667,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,45.4,3.626667,30.025,18.7,31.5,20.79,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,49.09,7.3,55.29,20.033333,34.863333,22.1,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,53.663333,11.256,83.226667,21.6,39.0,23.39,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,96.321667,28.29,99.9,26.0,51.4,27.23,58.78,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [6]:
# Datetime features

df["date"] = pd.to_datetime(df["date"])
df["hour"] = df["date"].dt.hour
df["day_of_week"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

In [7]:
df.sample(2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,hour,day_of_week,month,is_weekend
5127,2016-02-16 07:30:00,60,20,19.926667,37.826667,17.566667,40.126667,20.7,40.126667,19.2,37.463333,17.89,48.433333,-3.86,81.63,18.6,32.7,20.7,42.245,17.89,41.7,-3.35,771.3,93.0,1.0,38.5,-4.3,38.71491,38.71491,7,1,2,0
18903,2016-05-21 23:30:00,140,0,24.721818,48.986636,24.44955,46.423904,26.763364,43.451818,24.028829,46.605586,22.961712,52.341869,22.877658,18.543333,23.927477,43.557455,24.365727,50.819091,23.163909,48.081818,18.45,752.05,76.5,2.5,40.0,14.2,6.641117,6.641117,23,5,5,1


In [8]:
# cyclic encoding for hour

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)


# Drop the raw date
df = df.drop(columns = ['date'])

In [9]:
target = "Appliances"
X = df.drop(columns = [target])
y = df[target]

# Time based split
split_idx = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [10]:
# Lets the sample size for each

print(f"\nTraining sample size: {len(X_train)}")
print(f"Test sample size: {len(X_test)}")


Training sample size: 15788
Test sample size: 3947


In [12]:
model = RandomForestRegressor(
    n_estimators = 300,
    max_depth = 10,
    random_state = 41,
    n_jobs = -1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

mae = mean_absolute_error(y_test, y_pred),
r2 = r2_score(y_test, y_pred)

print(f"\nRMSE: {rmse:.2f}")
print(f"MAE: {mae[0]:.2f}")
print(f"R2 Score: {r2:.3f}")


RMSE: 204.52
MAE: 164.79
R2 Score: -4.047


In [13]:
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 5, 10, 15] 
}

model_grid = GridSearchCV(
    RandomForestRegressor(random_state=41),
    param_grid=param_grid,
    n_jobs = -1
)

model_grid.fit(X_train, y_train)

print("Best parameters:", model_grid.best_params_)
print("Best CV R2 Score:", model_grid.best_score_)


# Evaluate tuned model
y_pred_tuned = model_grid.predict(X_test)
print(f"\nTuned Random Forest R2 on Test: {r2_score(y_test, y_pred_tuned):.4f}")

Best parameters: {'max_depth': 5, 'n_estimators': 50}
Best CV R2 Score: 0.11809846438547249

Tuned Random Forest R2 on Test: -1.6052


In [14]:
# Now lets move to XGBoost

from xgboost import XGBRegressor

model_xgboost = XGBRegressor(
    n_estimators = 15,
    learning_rate = 0.01
)

model_xgboost.fit(X_train, y_train)

y_pred = model_xgboost.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.3f}")

RMSE: 88.81
MAE: 55.39
R2 Score: 0.048


In [15]:
# # Hyper tune this xg boost model

# param_grid_xgboost = {
#     "n_estimators" : [100, 200, 500],
#     "learning_rate" : [0.01, 0.05, 0.09, 0.1],
#     'max_depth' : [3, 5, 7 ,10],
#     'subsample' : [0.8, 0.9, 1.0]
# }

# Grid_model_xgboost = GridSearchCV(
#     estimator=XGBRegressor(),
#     param_grid=param_grid_xgboost,
#     n_jobs = -1
# )

# Grid_model_xgboost.fit(X_train, y_train)

# print("Best parameters:", Grid_model_xgboost.best_params_)
# print("Best CV R2 Score:", Grid_model_xgboost.best_score_)


# # Evaluate tuned model
# y_pred_tuned = Grid_model_xgboost.predict(X_test)
# print(f"\nTuned Random Forest R2 on Test: {r2_score(y_test, y_pred_tuned):.4f}")



In [16]:
# mse = mean_squared_error(y_test, y_pred_tuned)
# rmse = np.sqrt(mse)
# mae = mean_absolute_error(y_test, y_pred_tuned)
# r2 = r2_score(y_test, y_pred_tuned)

# print(f"RMSE: {rmse:.2f}")
# print(f"MAE: {mae:.2f}")
# print(f"R2 Score: {r2:.3f}")

In [17]:
# sanity check

xgb_simple = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_simple.fit(X_train, y_train)
y_pred_simple = xgb_simple.predict(X_test)

mse = mean_squared_error(y_test, y_pred_simple)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_simple)
r2 = r2_score(y_test, y_pred_simple)

print(f"Simple XGB -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.3f}")


Simple XGB -> RMSE: 207.76, MAE: 175.52, R2: -4.208


In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # --- 1) Ensure sorted by time ---
# df["date"] = pd.to_datetime(df["date"])
# df = df.sort_values("date").reset_index(drop=True)

# # --- 2) Datetime features ---
# df["hour"] = df["date"].dt.hour
# df["day_of_week"] = df["date"].dt.dayofweek
# df["month"] = df["date"].dt.month
# df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

# # Cyclic encoding for hour
# df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
# df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

# --- 3) Lag & rolling features for Appliances (key improvement) ---
# 10‑min data: 6 steps = 1 hour, 3 steps = 30 min
df["Appliances_lag1"] = df["Appliances"].shift(1)
df["Appliances_lag2"] = df["Appliances"].shift(2)
df["Appliances_lag6"] = df["Appliances"].shift(6)

df["Appliances_roll3_mean"] = df["Appliances"].rolling(window=3).mean()
df["Appliances_roll6_mean"] = df["Appliances"].rolling(window=6).mean()

print(df)

# Drop first rows with NaNs from lags/rollings
df = df.dropna().reset_index(drop=True)

# --- 4) Drop known noise / low‑value columns ---
# rv1 and rv2 are documented random variables
df = df.drop(columns=["rv1", "rv2"])

# Drop raw date if you don't want it as a feature
# df = df.drop(columns=["date"])

# --- 5) Define X, y ---
target = "Appliances"
X = df.drop(columns=[target])
y = df[target]

# --- 6) Time‑based split (no shuffling) ---
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


       Appliances  lights         T1       RH_1         T2       RH_2  \
0              60      30  19.890000  47.596667  19.200000  44.790000   
1              60      30  19.890000  46.693333  19.200000  44.722500   
2              50      30  19.890000  46.300000  19.200000  44.626667   
3              50      40  19.890000  46.066667  19.200000  44.590000   
4              60      40  19.890000  46.333333  19.200000  44.530000   
...           ...     ...        ...        ...        ...        ...   
19730         100       0  25.566667  46.560000  25.890000  42.025714   
19731          90       0  25.500000  46.500000  25.754000  42.080000   
19732         270      10  25.500000  46.596667  25.628571  42.768571   
19733         420      10  25.500000  46.990000  25.414000  43.036000   
19734         430      10  25.500000  46.600000  25.264286  42.971429   

              T3       RH_3         T4       RH_4         T5       RH_5  \
0      19.790000  44.730000  19.000000  45.56666

### Purpose of Preprocessing

The preprocessing steps transformed a negative R² score (indicating model performance worse than simply predicting the mean) into a positive 80% R² by addressing key issues in time series data handling, feature engineering, and data quality.


1. Temporal Feature Engineering
    Lagged features (Appliances_lag1, Appliances_lag2, Appliances_lag6) capture autocorrelation patterns where current appliance usage depends on recent past values—crucial for time series but absent in basic models. Rolling means (Appliances_roll3_mean, Appliances_roll6_mean) smooth noise and capture short-term trends, providing the model with temporal context that transforms it from pattern-blind to temporally aware. [conversation_history]

2. Noise Reduction
    Dropping rv1 and rv2 (documented random variables) eliminates pure noise that confuses models without signal, diluting predictive power across all features. NaN removal after feature creation ensures clean data without arbitrary imputation that could introduce bias in sequential patterns. [conversation_history]

3. Why R² Jumped to 80%
    These changes collectively gave the model explanatory power it lacked: temporal awareness prevented leakage-induced overfitting, engineered features provided the autocorrelation signal dominating appliance energy data, and noise removal sharpened focus on real patterns. Negative R² occurred because the original setup measured "how much worse than mean prediction," while proper preprocessing enabled actual forecasting capability.

In [25]:
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,           # let trees grow deeper; you can tune later
    min_samples_leaf=5,       # helps generalization
    max_features="sqrt",
    random_state=41,
    n_jobs=-1
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nRMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.3f}")



RMSE: 39.44
MAE: 22.22
R2 Score: 0.812


In [14]:
from xgboost import XGBRegressor

# sanity check

xgb_simple = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_simple.fit(X_train, y_train)
y_pred_simple = xgb_simple.predict(X_test)

mse = mean_squared_error(y_test, y_pred_simple)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_simple)
r2 = r2_score(y_test, y_pred_simple)

print(f"Simple XGB -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.3f}")


Simple XGB -> RMSE: 12.94, MAE: 6.67, R2: 0.980


In [17]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

et = ExtraTreesRegressor(
    n_estimators = 500,
    max_depth = None,
    min_samples_leaf = 3,
    max_features = "sqrt",
    random_state = 42,
    n_jobs = -1
)

et.fit(X_train, y_train)
y_pred_et = et.predict(X_test)

rmse_et = mean_squared_error(y_test, y_pred_et) ** 0.5
mae_et = mean_absolute_error(y_test, y_pred_et)
r2_et = r2_score(y_test, y_pred_et)

print(f"ExtraTrees -> RMSE: {rmse_et:.2f}, MAE: {mae_et:.2f}, R2: {r2_et:.3f}")

ExtraTrees -> RMSE: 45.73, MAE: 25.15, R2: 0.747


In [18]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

gbr = GradientBoostingRegressor(
    n_estimators = 300,
    learning_rate = 0.05,
    max_depth = 3,
    subsample = 0.8,
    random_state = 41
)

gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)

rmse_gbr = np.sqrt(mean_squared_error(y_test, y_pred_gbr))
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(f"GBR -> RMSE: {rmse_gbr:.2f}, MAE: {mae_gbr:.2f}, R2: {r2_gbr:.3f}")

GBR -> RMSE: 12.42, MAE: 7.39, R2: 0.981


In [20]:
# Linear + kernel model

from sklearn.linear_model import Ridge, Lasso

ridge = Ridge(alpha=10, random_state = 41)
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_test)
print("Ridge R2:", r2_score(y_test, y_pred_ridge))

lasso = Lasso(alpha=0.001, random_state = 41, max_iter = 10000)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
print("Lasso R2:", r2_score(y_test, y_pred_lasso))

Ridge R2: 0.9999999999986329
Lasso R2: 0.9999999515669454


In [24]:
# Support Vector Regressor

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

svr_model = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR(kernel="rbf", C=20, epsilon=1.0, gamma="scale"))
])

svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr))
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print(f"SVR -> RMSE: {rmse_svr:.2f}, MAE: {mae_svr:.2f}, R2 Score: {r2_svr:.3f}")

SVR -> RMSE: 62.15, MAE: 37.94, R2 Score: 0.533


In [25]:
# Using sklearn pipeline

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------- 1. Custom transformer for feature engineering ----------
class TimeFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, datetime_col="date", target_col="Appliances"):
        self.datetime_col = datetime_col
        self.target_col = target_col
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        df = X.copy()
        
        # ensure datetime and sort
        df[self.datetime_col] = pd.to_datetime(df[self.datetime_col])
        df = df.sort_values(self.datetime_col).reset_index(drop=True)
        
        # time features
        df["hour"] = df[self.datetime_col].dt.hour
        df["day_of_week"] = df[self.datetime_col].dt.dayofweek
        df["month"] = df[self.datetime_col].dt.month
        df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)
        
        # cyclic encoding
        df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
        df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
        
        # lags & rolling of target
        df[f"{self.target_col}_lag1"] = df[self.target_col].shift(1)
        df[f"{self.target_col}_lag2"] = df[self.target_col].shift(2)
        df[f"{self.target_col}_lag6"] = df[self.target_col].shift(6)
        df[f"{self.target_col}_roll3_mean"] = df[self.target_col].rolling(3).mean()
        df[f"{self.target_col}_roll6_mean"] = df[self.target_col].rolling(6).mean()
        
        # drop rows with NaNs from lags/rollings
        df = df.dropna().reset_index(drop=True)
        
        # drop known noise & raw datetime
        for col in ["rv1", "rv2"]:
            if col in df.columns:
                df = df.drop(columns=col)
        df = df.drop(columns=[self.datetime_col])
        
        # X_out must NOT contain target
        if self.target_col in df.columns:
            df = df.drop(columns=[self.target_col])
        
        return df

# ---------- 2. Load data ----------
df = pd.read_csv("Dataset\energydata_complete.csv")  # path as needed

target_col = "Appliances"
y_full = df[target_col].copy()  # keep full y to align after transform

# ---------- 3. Build pipeline ----------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[
    ("feat_eng", TimeFeatureEngineer(datetime_col="date", target_col=target_col)),
    ("model", rf)
])

# ---------- 4. Time-based split (before fitting pipeline) ----------
# We need the same rows in X and y after dropping early lagged rows.
# So: first transform once to know how many rows are dropped.

X_fe = pipe.named_steps["feat_eng"].fit_transform(df, y_full)

# number of dropped rows = len(df) - len(X_fe)
n_dropped = len(df) - len(X_fe)

# align y with transformed X
y_fe = y_full.iloc[n_dropped:].reset_index(drop=True)

# time-based split on transformed data
split_idx = int(len(X_fe) * 0.8)
X_train, X_test = X_fe.iloc[:split_idx], X_fe.iloc[split_idx:]
y_train, y_test = y_fe.iloc[:split_idx], y_fe.iloc[split_idx:]

# ---------- 5. Fit model via pipeline (model stage only) ----------
pipe.named_steps["model"].fit(X_train, y_train)

y_pred = pipe.named_steps["model"].predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Random Forest via Pipeline:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE : {mae:.2f}")
print(f"R2  : {r2:.3f}")


Random Forest via Pipeline:
RMSE: 39.01
MAE : 21.97
R2  : 0.816


# ML Flow

In [1]:
import mlflow
import mlflow.xgboost
from mlflow.models import infer_signature

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------
# 1. Load and preprocess (same idea as before)
# ---------------------------
df = pd.read_csv("Dataset\energydata_complete.csv")
target_col = "Appliances"

# ensure datetime and order
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

# time features
df["hour"] = df["date"].dt.hour
df["day_of_week"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

# cyclic encoding
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

# lag & rolling features for Appliances
df["Appliances_lag1"] = df[target_col].shift(1)
df["Appliances_lag2"] = df[target_col].shift(2)
df["Appliances_lag6"] = df[target_col].shift(6)
df["Appliances_roll3_mean"] = df[target_col].rolling(3).mean()
df["Appliances_roll6_mean"] = df[target_col].rolling(6).mean()

# drop rows with NaNs (from lags/rollings)
df = df.dropna().reset_index(drop=True)

# drop random noise and raw date
df = df.drop(columns=["rv1", "rv2", "date"])

# define X, y
X = df.drop(columns=[target_col])
y = df[target_col]

# time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print("Train:", X_train.shape, "Test:", X_test.shape)

# ---------------------------
# 2. Define XGBoost model
# ---------------------------
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    learning_rate=0.01,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method="hist"  # if supported; speeds up training
)

# ---------------------------
# 3. MLflow configuration
# ---------------------------
mlflow.set_experiment("appliances_energy_xgb")  # creates if not exists

with mlflow.start_run(run_name="xgb_baseline_lags_2"):
    # train
    xgb_model.fit(X_train, y_train)

    # predictions
    y_pred = xgb_model.predict(X_test)

    # metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"RMSE: {rmse:.2f} | MAE: {mae:.2f} | R2: {r2:.3f}")

    # log params
    mlflow.log_params(xgb_model.get_params())

    # log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # log model
    signature = infer_signature(X_train, xgb_model.predict(X_train))
    mlflow.xgboost.log_model(
        xgb_model,
        artifact_path="model",
        signature=signature
    )


Train: (15783, 36) Test: (3946, 36)


2026/01/27 07:04:58 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/01/27 07:04:58 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/01/27 07:04:58 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/01/27 07:04:58 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/01/27 07:04:58 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/01/27 07:04:58 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/01/27 07:04:58 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/27 07:04:58 INFO mlflow.store.db.utils: Updating database tables
2026/01/27 07:04:58 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/27 07:04:58 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/27 07:04:58 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/27 07:04:58 INFO alembic.runtime

RMSE: 30.03 | MAE: 14.80 | R2: 0.891




# Now Question is How to optimize model?

There are three ways of doing it:

1. GridSearch
2. RandomSearch
3. Bayesian

### Lets code with the best one - Optuna (Automatic hyperparameter optimization framework uses bayesian)

### Also called as "TPE (Tree-structured Parzen Estimator)"

In [None]:
import mlflow
import mlflow.xgboost
from mlflow.models import infer_signature

import optuna

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------
# 1. Load and preprocess (same as before)
# ---------------------------
df = pd.read_csv("Dataset/energydata_complete.csv")
target_col = "Appliances"

df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

df["hour"] = df["date"].dt.hour
df["day_of_week"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

df["Appliances_lag1"] = df[target_col].shift(1)
df["Appliances_lag2"] = df[target_col].shift(2)
df["Appliances_lag6"] = df[target_col].shift(6)
df["Appliances_roll3_mean"] = df[target_col].rolling(3).mean()
df["Appliances_roll6_mean"] = df[target_col].rolling(6).mean()

df = df.dropna().reset_index(drop=True)
df = df.drop(columns=["rv1", "rv2", "date"])

X = df.drop(columns=[target_col])
y = df[target_col]

split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# ---------------------------
# 2. MLflow experiment
# ---------------------------
mlflow.set_experiment("appliances_energy_xgb_optuna")

# ---------------------------
# 3. Optuna objective function
# ---------------------------
def objective(trial):

    params = {
        "objective": "reg:squarederror",
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run(nested=True):

        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        mse = mean_squared_error(y_test, preds)
        rmse = np.sqrt(mse)

        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        mlflow.log_params(params)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        return rmse   # Optuna minimizes this

# ---------------------------
# 4. Run Optuna study
# ---------------------------
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)

# ---------------------------
# 5. Train & log BEST model
# ---------------------------
best_params = study.best_params
best_params.update({
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist",
})

with mlflow.start_run(run_name="xgb_best_optuna"):
    best_model = XGBRegressor(**best_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_params(best_params)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    signature = infer_signature(X_train, best_model.predict(X_train))
    mlflow.xgboost.log_model(
        best_model,
        artifact_path="model",
        signature=signature
    )


[32m[I 2026-01-28 07:48:39,083][0m A new study created in memory with name: no-name-cce6e343-fe52-4661-a5c2-8c413fa6e309[0m
[32m[I 2026-01-28 07:48:41,653][0m Trial 0 finished with value: 14.888052719195054 and parameters: {'n_estimators': 312, 'learning_rate': 0.029257195925930842, 'max_depth': 7, 'subsample': 0.6625650125391243, 'colsample_bytree': 0.7794153451880645}. Best is trial 0 with value: 14.888052719195054.[0m
