In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [2]:
df = pd.read_csv(
    "household_power_consumption.txt",
    sep=";",                 
    na_values="?",          
    low_memory=False
)

In [3]:
df

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.360,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
...,...,...,...,...,...,...,...,...,...
2075254,26/11/2010,20:58:00,0.946,0.000,240.43,4.0,0.0,0.0,0.0
2075255,26/11/2010,20:59:00,0.944,0.000,240.00,4.0,0.0,0.0,0.0
2075256,26/11/2010,21:00:00,0.938,0.000,239.82,3.8,0.0,0.0,0.0
2075257,26/11/2010,21:01:00,0.934,0.000,239.70,3.8,0.0,0.0,0.0


In [4]:
df = df.reset_index()

df["datetime"] = pd.to_datetime(
    df["Date"] + " " + df["Time"],
    format="%d/%m/%Y %H:%M:%S"
)

df = df.drop(columns=["Date","Time"])

df = df.set_index("datetime")

df = df.sort_index()

In [5]:
numeric_cols = [
    "Global_active_power","Global_reactive_power",
    "Voltage","Global_intensity",
    "Sub_metering_1","Sub_metering_2","Sub_metering_3"
]

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

hourly = df[numeric_cols].resample("h").mean()

print(hourly.head())

                     Global_active_power  Global_reactive_power     Voltage  \
datetime                                                                      
2006-12-16 17:00:00             4.222889               0.229000  234.643889   
2006-12-16 18:00:00             3.632200               0.080033  234.580167   
2006-12-16 19:00:00             3.400233               0.085233  233.232500   
2006-12-16 20:00:00             3.268567               0.075100  234.071500   
2006-12-16 21:00:00             3.056467               0.076667  237.158667   

                     Global_intensity  Sub_metering_1  Sub_metering_2  \
datetime                                                                
2006-12-16 17:00:00         18.100000             0.0        0.527778   
2006-12-16 18:00:00         15.600000             0.0        6.716667   
2006-12-16 19:00:00         14.503333             0.0        1.433333   
2006-12-16 20:00:00         13.916667             0.0        0.000000   
2006-12-

In [6]:
df = hourly.copy()
df = df.dropna()

df['target'] = df['Global_active_power'].shift(-1)

In [7]:
q_low = df['target'].quantile(0.01)
q_high = df['target'].quantile(0.99)

df = df[(df['target'] > q_low) & (df['target'] < q_high)]

In [8]:
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['month'] = df.index.month

df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

df['dow_sin'] = np.sin(2*np.pi*df['dayofweek']/7)
df['dow_cos'] = np.cos(2*np.pi*df['dayofweek']/7)

In [9]:
# Short term memory
df['lag_1'] = df['Global_active_power'].shift(1)
df['lag_2'] = df['Global_active_power'].shift(2)
df['lag_24'] = df['Global_active_power'].shift(24)
df['lag_168'] = df['Global_active_power'].shift(168)


In [10]:
df['rolling_mean_24'] = df['Global_active_power'].shift(1).rolling(24).mean()

In [11]:
df = df.dropna()

In [12]:
df

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,target,hour,dayofweek,month,hour_sin,hour_cos,dow_sin,dow_cos,lag_1,lag_2,lag_24,lag_168,rolling_mean_24
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-12-24 05:00:00,1.631900,0.108467,245.835667,7.533333,0.0,0.466667,0.000000,1.895200,5,6,12,0.965926,2.588190e-01,-0.781831,0.623490,1.576100,1.606767,1.496800,4.222889,2.867040
2006-12-24 06:00:00,1.895200,0.050667,245.590833,8.446667,0.0,0.000000,0.000000,1.476400,6,6,12,1.000000,6.123234e-17,-0.781831,0.623490,1.631900,1.576100,3.938167,3.632200,2.872669
2006-12-24 07:00:00,1.476400,0.099667,242.795333,6.970000,0.0,0.516667,0.000000,1.427767,7,6,12,0.965926,-2.588190e-01,-0.781831,0.623490,1.895200,1.631900,4.548667,3.400233,2.787546
2006-12-24 08:00:00,1.427767,0.052267,241.698667,6.596667,0.0,0.000000,0.000000,2.725667,8,6,12,0.866025,-5.000000e-01,-0.781831,0.623490,1.476400,1.895200,3.065067,3.268567,2.659535
2006-12-24 09:00:00,2.725667,0.093900,239.889833,11.810000,0.0,0.383333,14.383333,2.675333,9,6,12,0.707107,-7.071068e-01,-0.781831,0.623490,1.427767,1.476400,2.419200,3.056467,2.591314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-11-26 16:00:00,1.067933,0.215633,240.457833,4.610000,0.0,0.950000,0.000000,1.725900,16,4,11,-0.866025,-5.000000e-01,-0.433884,-0.900969,0.876433,1.407767,0.541800,1.622867,1.221832
2010-11-26 17:00:00,1.725900,0.061400,237.069667,7.216667,0.0,0.000000,12.866667,1.573467,17,4,11,-0.965926,-2.588190e-01,-0.433884,-0.900969,1.067933,0.876433,1.480100,1.469700,1.243754
2010-11-26 18:00:00,1.573467,0.053700,237.531833,6.620000,0.0,0.000000,0.000000,1.659333,18,4,11,-1.000000,-1.836970e-16,-0.433884,-0.900969,1.725900,1.067933,2.211600,0.824267,1.253996
2010-11-26 19:00:00,1.659333,0.060033,236.741000,7.056667,0.0,0.066667,0.000000,1.163700,19,4,11,-0.965926,2.588190e-01,-0.433884,-0.900969,1.573467,1.725900,2.330467,1.018467,1.227407


In [13]:
train_size = int(len(df) * 0.8)

train = df.iloc[:train_size]
test = df.iloc[train_size:]

X_train = train.drop(columns=['target'])
y_train = train['target']

X_test = test.drop(columns=['target'])
y_test = test['target']

In [14]:
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [15]:
model = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.2,
    reg_alpha=0.5,
    reg_lambda=2,
    random_state=42,
    eval_metric="rmse"
)

In [16]:
model.fit(
    X_train,
    y_train_log,
    eval_set=[(X_test, y_test_log)],
    verbose=False
)

In [17]:
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)

In [18]:
y_pred

array([0.41852924, 0.7777443 , 1.2951758 , ..., 2.2256052 , 2.137791  ,
       1.655482  ], dtype=float32)

In [19]:
epsilon = 1e-5

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + epsilon))) * 100
r2 = r2_score(y_test, y_pred)

results = pd.DataFrame({
    "Metric": ["RMSE", "MAE", "MAPE (%)", "R2 Score"],
    "Value": [rmse, mae, mape, r2]
})

results["Value"] = results["Value"].round(4)

results.style.background_gradient(cmap="Blues")

Unnamed: 0,Metric,Value
0,RMSE,0.4315
1,MAE,0.2944
2,MAPE (%),35.5286
3,R2 Score,0.6391


In [20]:
smape = np.mean(
    2 * np.abs(y_test - y_pred) /
    (np.abs(y_test) + np.abs(y_pred) + 1e-5)
) * 100

print("SMAPE:", smape)

SMAPE: 30.708029353063914


### Interpretation
The XGBoost model achieves a MAPE of approximately 35% and an SMAPE of around 31%, indicating moderate predictive accuracy for next-hour household electricity demand. With an R² score of 0.64, the model explains about 64% of the variability in consumption, demonstrating that it captures a substantial portion of the underlying temporal patterns. The relatively high percentage error reflects the inherent volatility and appliance-driven fluctuations characteristic of household electricity usage. Given the noisy and unpredictable nature of the data, this level of performance is considered sufficient for next-hour forecasting when relying solely on historical consumption information.

### save model

In [21]:
import pickle

with open("xgb_energy_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully!")

Model saved successfully!


In [22]:
feature_columns = X_train.columns

with open("feature_columns.pkl", "wb") as f:
    pickle.dump(feature_columns, f)