In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


pd.set_option("display.max_columns", None)

# Dataset credit goes to
#### https://github.com/LuisM78/Appliances-energy-prediction-data
#### https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction

##### The data set is at 10 min for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters).

| Variable Name | Role     | Type        | Description | Units  | Missing Values |
|----------------|----------|-------------|--------------|--------|----------------|
| date           | Feature  | Date        | Timestamp of observation | — | no |
| Appliances     | Target   | Integer     | Energy use in household appliances | Wh | no |
| lights         | Feature  | Integer     | Energy use of lights in the house | Wh | no |
| T1             | Feature  | Continuous  | Temperature in kitchen area | °C | no |
| RH_1           | Feature  | Continuous  | Humidity in kitchen area | % | no |
| T2             | Feature  | Continuous  | Temperature in living room | °C | no |
| RH_2           | Feature  | Continuous  | Humidity in living room | % | no |
| T3             | Feature  | Continuous  | Temperature in laundry room | °C | no |
| RH_3           | Feature  | Continuous  | Humidity in laundry room | % | no |
| T4             | Feature  | Continuous  | Temperature in office room | °C | no |
| RH_4           | Feature  | Continuous  | Humidity in office room | % | no |
| T5             | Feature  | Continuous  | Temperature in bathroom | °C | no |
| RH_5           | Feature  | Continuous  | Humidity in bathroom | % | no |
| T6             | Feature  | Continuous  | Temperature in north bedroom | °C | no |
| RH_6           | Feature  | Continuous  | Humidity in north bedroom | % | no |
| T7             | Feature  | Continuous  | Temperature in ironing room | °C | no |
| RH_7           | Feature  | Continuous  | Humidity in ironing room | % | no |
| T8             | Feature  | Continuous  | Temperature in teenager room | °C | no |
| RH_8           | Feature  | Continuous  | Humidity in teenager room | % | no |
| T9             | Feature  | Continuous  | Temperature in parents’ room | °C | no |
| RH_9           | Feature  | Continuous  | Humidity in parents’ room | % | no |
| T_out          | Feature  | Continuous  | Outdoor temperature | °C | no |
| Press_mm_hg    | Feature  | Continuous  | Outdoor pressure | mm Hg | no |
| RH_out         | Feature  | Continuous  | Outdoor humidity | % | no |
| Windspeed      | Feature  | Continuous  | Wind speed | m/s | no |
| Visibility     | Feature  | Continuous  | Outdoor visibility | km | no |
| Tdewpoint      | Feature  | Continuous  | Dew point temperature | °C | no |
| rv1            | Feature  | Continuous  | Random variable 1 (for testing purposes) | — | no |
| rv2            | Feature  | Continuous  | Random variable 2 (for testing purposes) | — | no |



# Load the Dataset

In [15]:
CSV_LOCATION = "Dataset/energydata_complete.csv"

df = pd.read_csv(CSV_LOCATION)
df.sample(2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
4677,2016-02-13 04:30:00,50,0,20.7,38.09,18.6,39.09,20.926667,39.79,18.89,39.09,19.038889,61.973889,1.463333,89.0,19.2,34.427778,22.39,45.78,18.5,42.363333,0.9,739.95,89.5,2.5,23.5,-0.7,15.496133,15.496133
16198,2016-05-03 04:40:00,50,0,21.79,39.29,19.7,41.7,23.0,37.79,22.0,37.79,19.89,48.966667,7.245,50.09,20.89,39.59,22.89,45.663333,19.89,42.29,8.2,760.6,91.0,3.0,27.0,6.8,42.84086,42.84086


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [17]:
df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [18]:
df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,50.949283,7.910939,54.609083,20.267106,35.3882,22.029107,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,9.022034,6.090347,31.149806,2.109993,5.114208,1.956162,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,29.815,-6.065,1.0,15.39,23.2,16.306667,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,45.4,3.626667,30.025,18.7,31.5,20.79,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,49.09,7.3,55.29,20.033333,34.863333,22.1,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,53.663333,11.256,83.226667,21.6,39.0,23.39,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,96.321667,28.29,99.9,26.0,51.4,27.23,58.78,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [19]:
# Datetime features

df["date"] = pd.to_datetime(df["date"])
df["hour"] = df["date"].dt.hour
df["day_of_week"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

In [20]:
df.sample(2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,hour,day_of_week,month,is_weekend
15676,2016-04-29 13:40:00,380,0,21.0,38.53,19.79,38.466667,23.29,35.8,19.6,37.53,18.79,41.4,10.666667,38.363333,19.0,33.2,20.39,38.5,18.6,37.0,7.866667,755.3,87.333333,3.333333,40.0,5.833333,30.713049,30.713049,13,4,4,0
13153,2016-04-12 01:10:00,50,0,22.2,43.466667,20.1,44.7,23.426667,40.826667,22.29,40.126667,21.0,45.29,7.966667,50.2,21.79,36.5,23.0,44.79,20.79,43.326667,8.883333,751.683333,86.5,3.666667,38.166667,6.683333,47.533516,47.533516,1,1,4,0


In [21]:
# cyclic encoding for hour

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)


# Drop the raw date
df = df.drop(columns = ['date'])

In [22]:
target = "Appliances"
X = df.drop(columns = [target])
y = df[target]

# Time based split
split_idx = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [23]:
# Lets the sample size for each

print(f"\nTraining sample size: {len(X_train)}")
print(f"Test sample size: {len(X_test)}")


Training sample size: 15788
Test sample size: 3947


In [12]:
model = RandomForestRegressor(
    n_estimators = 300,
    max_depth = 10,
    random_state = 41,
    n_jobs = -1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

mae = mean_absolute_error(y_test, y_pred),
r2 = r2_score(y_test, y_pred)

print(f"\nRMSE: {rmse:.2f}")
print(f"MAE: {mae[0]:.2f}")
print(f"R2 Score: {r2:.3f}")


RMSE: 204.52
MAE: 164.79
R2 Score: -4.047


In [13]:
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 5, 10, 15] 
}

model_grid = GridSearchCV(
    RandomForestRegressor(random_state=41),
    param_grid=param_grid,
    n_jobs = -1
)

model_grid.fit(X_train, y_train)

print("Best parameters:", model_grid.best_params_)
print("Best CV R2 Score:", model_grid.best_score_)


# Evaluate tuned model
y_pred_tuned = model_grid.predict(X_test)
print(f"\nTuned Random Forest R2 on Test: {r2_score(y_test, y_pred_tuned):.4f}")

Best parameters: {'max_depth': 5, 'n_estimators': 50}
Best CV R2 Score: 0.11809846438547249

Tuned Random Forest R2 on Test: -1.6052


In [14]:
# Now lets move to XGBoost

from xgboost import XGBRegressor

model_xgboost = XGBRegressor(
    n_estimators = 15,
    learning_rate = 0.01
)

model_xgboost.fit(X_train, y_train)

y_pred = model_xgboost.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.3f}")

RMSE: 88.81
MAE: 55.39
R2 Score: 0.048


In [15]:
# # Hyper tune this xg boost model

# param_grid_xgboost = {
#     "n_estimators" : [100, 200, 500],
#     "learning_rate" : [0.01, 0.05, 0.09, 0.1],
#     'max_depth' : [3, 5, 7 ,10],
#     'subsample' : [0.8, 0.9, 1.0]
# }

# Grid_model_xgboost = GridSearchCV(
#     estimator=XGBRegressor(),
#     param_grid=param_grid_xgboost,
#     n_jobs = -1
# )

# Grid_model_xgboost.fit(X_train, y_train)

# print("Best parameters:", Grid_model_xgboost.best_params_)
# print("Best CV R2 Score:", Grid_model_xgboost.best_score_)


# # Evaluate tuned model
# y_pred_tuned = Grid_model_xgboost.predict(X_test)
# print(f"\nTuned Random Forest R2 on Test: {r2_score(y_test, y_pred_tuned):.4f}")



In [16]:
# mse = mean_squared_error(y_test, y_pred_tuned)
# rmse = np.sqrt(mse)
# mae = mean_absolute_error(y_test, y_pred_tuned)
# r2 = r2_score(y_test, y_pred_tuned)

# print(f"RMSE: {rmse:.2f}")
# print(f"MAE: {mae:.2f}")
# print(f"R2 Score: {r2:.3f}")

In [17]:
# sanity check

xgb_simple = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_simple.fit(X_train, y_train)
y_pred_simple = xgb_simple.predict(X_test)

mse = mean_squared_error(y_test, y_pred_simple)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_simple)
r2 = r2_score(y_test, y_pred_simple)

print(f"Simple XGB -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.3f}")


Simple XGB -> RMSE: 207.76, MAE: 175.52, R2: -4.208


In [24]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # --- 1) Ensure sorted by time ---
# df["date"] = pd.to_datetime(df["date"])
# df = df.sort_values("date").reset_index(drop=True)

# # --- 2) Datetime features ---
# df["hour"] = df["date"].dt.hour
# df["day_of_week"] = df["date"].dt.dayofweek
# df["month"] = df["date"].dt.month
# df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

# # Cyclic encoding for hour
# df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
# df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

# --- 3) Lag & rolling features for Appliances (key improvement) ---
# 10‑min data: 6 steps = 1 hour, 3 steps = 30 min
df["Appliances_lag1"] = df["Appliances"].shift(1)
df["Appliances_lag2"] = df["Appliances"].shift(2)
df["Appliances_lag6"] = df["Appliances"].shift(6)

df["Appliances_roll3_mean"] = df["Appliances"].rolling(window=3).mean()
df["Appliances_roll6_mean"] = df["Appliances"].rolling(window=6).mean()

# Drop first rows with NaNs from lags/rollings
df = df.dropna().reset_index(drop=True)

# --- 4) Drop known noise / low‑value columns ---
# rv1 and rv2 are documented random variables
df = df.drop(columns=["rv1", "rv2"])

# Drop raw date if you don't want it as a feature
# df = df.drop(columns=["date"])

# --- 5) Define X, y ---
target = "Appliances"
X = df.drop(columns=[target])
y = df[target]

# --- 6) Time‑based split (no shuffling) ---
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (15783, 36) Test shape: (3946, 36)


In [25]:
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,           # let trees grow deeper; you can tune later
    min_samples_leaf=5,       # helps generalization
    max_features="sqrt",
    random_state=41,
    n_jobs=-1
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nRMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.3f}")



RMSE: 39.44
MAE: 22.22
R2 Score: 0.812
