# Energy Consumption Prediction Project

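This notebook downloads the UCI "Individual household electric power consumption" dataset, resamples it to hourly averages, engineers calendar and lag features, and compares a persistence baseline against an XGBoost model for one-hour-ahead forecasting of global active power.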

In [None]:
# DATA DOWNLOAD & LOAD
import os
import zipfile
import pandas as pd
from urllib.request import urlretrieve

# Local cache directory for the downloaded archive and the extracted text file.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
zip_path = os.path.join(DATA_DIR, "household_power_consumption.zip")
csv_name = "household_power_consumption.txt"
data_path = os.path.join(DATA_DIR, csv_name)

if not os.path.exists(zip_path):
    print("Downloading dataset...")
    # Download under a temporary name and rename only on success, so an
    # interrupted download never leaves a truncated archive that a later
    # run would mistake for a complete one.
    tmp_zip_path = zip_path + ".part"
    urlretrieve(url, tmp_zip_path)
    os.replace(tmp_zip_path, zip_path)
else:
    print("Dataset already downloaded.")

# Only unpack when the text file is missing; re-extracting on every run is wasted work.
if not os.path.exists(data_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(DATA_DIR)

# '?' marks missing readings in the raw file; read them as NaN.
df = pd.read_csv(data_path, sep=';', na_values=['?'], low_memory=False)

# Build the timestamp explicitly instead of relying on read_csv's
# `parse_dates={...}` / `infer_datetime_format`, both deprecated in pandas 2.x.
# The raw dates are day-first (e.g. 16/12/2006), so the format is spelled out.
df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'],
                                format='%d/%m/%Y %H:%M:%S')
df = df.drop(columns=['Date', 'Time']).set_index('datetime').sort_index()
df.head()

In [None]:
# Resample to hourly average
# Use the lowercase 'h' alias: the uppercase 'H' spelling is deprecated in
# pandas >= 2.2 and emits a FutureWarning there.
df_hour = df[['Global_active_power']].resample('h').mean()
# Hours with no valid readings come out as NaN; forward-fill them, then
# back-fill any NaNs left at the very start. Note this fills gaps of any length.
df_hour = df_hour.ffill().bfill()
df_hour = df_hour.rename(columns={'Global_active_power':'GAP_kW'})
df_hour.head()

In [None]:
# Feature Engineering
# Calendar features are derived from the hourly DatetimeIndex; `assign`
# returns a new frame, so df_hour itself is left untouched.
df_feat = df_hour.assign(
    hour=lambda frame: frame.index.hour,
    dayofweek=lambda frame: frame.index.dayofweek,
    month=lambda frame: frame.index.month,
    # dayofweek 5 and 6 are Saturday and Sunday.
    is_weekend=lambda frame: frame['dayofweek'].isin([5, 6]).astype(int),
)

# Lagged targets: the value one hour earlier and the value 24 hours earlier.
for lag in (1, 24):
    df_feat[f'lag_{lag}'] = df_feat['GAP_kW'].shift(periods=lag)

# The first rows have no lag history (NaN after the shift), so drop them.
df_feat = df_feat.dropna(axis=0, how='any')
df_feat.head()

In [None]:
# Train-test split
# Chronological split: everything strictly before split_date is training data,
# everything from split_date onward is test data.
# Label slicing with a date string (`.loc[:split_date]` / `.loc[split_date:]`)
# is inclusive on both sides, which would put every hourly row of the split
# day in BOTH sets. Boolean masks against a Timestamp keep the sets disjoint.
split_date = "2010-01-01"
split_ts = pd.Timestamp(split_date)
train = df_feat.loc[df_feat.index < split_ts].copy()
test = df_feat.loc[df_feat.index >= split_ts].copy()
assert train.index.intersection(test.index).empty, "train and test overlap"

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, positionally aligned with ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries where
        ``y_true`` is non-zero, or NaN when there are no such entries.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    # A zero target makes the percentage error undefined (division by zero),
    # which would otherwise turn the whole mean into inf/nan; skip those rows.
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')
    return np.mean(np.abs((actual[nonzero] - forecast[nonzero]) / actual[nonzero])) * 100

# Baseline persistence
# The naive forecast for each hour is the previous hour's value (lag_1).
y_test = test['GAP_kW']
y_pred_persistence = test['lag_1']

print("Persistence MAE:", mean_absolute_error(y_test, y_pred_persistence))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print("Persistence RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_persistence)))
print("Persistence MAPE:", mape(y_test, y_pred_persistence))

In [None]:
# XGBoost Model
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# Every column except the target is used as a model input.
FEATURES = [c for c in train.columns if c != 'GAP_kW']
X_train = train[FEATURES]; y_train = train['GAP_kW']
X_test = test[FEATURES]; y_test = test['GAP_kW']

# Fit the scaler on the training split only, then apply the same transform
# to the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Fixed random_state so repeated runs give the same model.
model_xgb = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.05,
                         random_state=42)
model_xgb.fit(X_train_s, y_train)
y_pred_xgb = model_xgb.predict(X_test_s)

print("XGB MAE:", mean_absolute_error(y_test, y_pred_xgb))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print("XGB RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print("XGB MAPE:", mape(y_test, y_pred_xgb))