In [2]:
# ==============================================
# Phase 1: Forecasting ML Pipeline with Prophet
# Personalized Home Energy Saver (Global Model, Per-Appliance)
# ==============================================

# ------------------------
# 0) Imports & Setup
# ------------------------
import os
import json
import math
import pickle
import numpy as np
import pandas as pd
from typing import Dict, Any
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Paths and configs
DATA_PATH = "./data/smart_home_energy_consumption.csv"  # Update if needed
ARTIFACT_DIR = "./artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ------------------------
# 1) Load & Type Coercion
# ------------------------
EXPECTED_COLS = [
    "Home ID", "Appliance Type", "Energy Consumption (kWh)",
    "Time", "Date", "Outdoor Temperature (°C)",
    "Season", "Household Size"
]
df_raw = pd.read_csv(DATA_PATH)
missing = [c for c in EXPECTED_COLS if c not in df_raw.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Standardize column names
df = df_raw.rename(columns={
    "Home ID": "home_id",
    "Appliance Type": "appliance",
    "Energy Consumption (kWh)": "kwh",
    "Time": "time",
    "Date": "date",
    "Outdoor Temperature (°C)": "temp_c",
    "Season": "season",
    "Household Size": "hh_size",
})

# Enforce dtypes
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["time"] = pd.to_datetime(df["time"], format="%H:%M", errors="coerce").dt.time
df["kwh"] = pd.to_numeric(df["kwh"], errors="coerce")
df["temp_c"] = pd.to_numeric(df["temp_c"], errors="coerce")
df["hh_size"] = pd.to_numeric(df["hh_size"], errors="coerce")

# Drop unusable rows
df = df.dropna(subset=["date", "time", "kwh", "appliance"]).copy()

In [7]:
df[(df['home_id']==94) & (df['appliance']=='Fridge')]

Unnamed: 0,home_id,appliance,kwh,time,date,temp_c,season,hh_size
0,94,Fridge,0.2,21:12:00,2023-12-02,-1.0,Fall,2
1476,94,Fridge,0.33,18:44:00,2023-05-30,27.8,Spring,4
6020,94,Fridge,0.12,03:45:00,2023-01-24,-5.8,Winter,1
7784,94,Fridge,0.21,21:27:00,2023-09-17,8.1,Summer,5
16358,94,Fridge,0.39,07:03:00,2023-11-15,23.2,Fall,5
19654,94,Fridge,0.49,22:14:00,2023-12-04,14.6,Fall,4
21888,94,Fridge,0.18,12:58:00,2023-03-01,-3.8,Winter,5
27674,94,Fridge,0.45,13:58:00,2023-05-31,9.3,Spring,5
31750,94,Fridge,0.48,21:26:00,2023-07-27,7.1,Summer,5
34154,94,Fridge,0.38,03:07:00,2023-01-27,-5.5,Winter,4


In [8]:

# ------------------------
# 2) Combine to Timestamp & Basic Cleaning
# ------------------------
# Timestamp
df["timestamp"] = pd.to_datetime(
    df["date"].dt.strftime("%Y-%m-%d") + " " + df["time"].astype(str), errors="coerce"
)
df = df.dropna(subset=["timestamp"]).copy()

# Sanity bounds
df = df[(df["kwh"] >= 0) & (df["kwh"] <= 50)].copy()
df = df[df["hh_size"].between(1, 20, inclusive="both") | df["hh_size"].isna()].copy()

# Season imputation (if missing)
month_to_season = {
    12: "Winter",1:  "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall",
}
df["month"] = df["date"].dt.month
df["season"] = df["season"].fillna(df["month"].map(month_to_season))

In [9]:
df

Unnamed: 0,home_id,appliance,kwh,time,date,temp_c,season,hh_size,timestamp,month
0,94,Fridge,0.20,21:12:00,2023-12-02,-1.0,Fall,2,2023-12-02 21:12:00,12
1,435,Oven,0.23,20:11:00,2023-08-06,31.1,Summer,5,2023-08-06 20:11:00,8
2,466,Dishwasher,0.32,06:39:00,2023-11-21,21.3,Fall,3,2023-11-21 06:39:00,11
3,496,Heater,3.92,21:56:00,2023-01-21,-4.2,Winter,1,2023-01-21 21:56:00,1
4,137,Microwave,0.44,04:31:00,2023-08-26,34.5,Summer,5,2023-08-26 04:31:00,8
...,...,...,...,...,...,...,...,...,...,...
99995,124,Microwave,0.42,09:56:00,2023-09-28,20.5,Summer,1,2023-09-28 09:56:00,9
99996,184,Computer,0.71,12:48:00,2023-05-27,-5.4,Spring,2,2023-05-27 12:48:00,5
99997,101,Dishwasher,0.25,05:45:00,2023-02-18,35.6,Winter,3,2023-02-18 05:45:00,2
99998,423,Air Conditioning,2.69,12:39:00,2023-04-20,3.7,Spring,1,2023-04-20 12:39:00,4


In [10]:
# ------------------------
# 3) Daily Aggregation (per home, appliance, date)
# ------------------------
agg = (
    df
    .groupby(["home_id", "appliance", "date"], as_index=False)
    .agg(
        daily_kwh=("kwh", "sum"),
        avg_temp=("temp_c", "mean"),
        min_temp=("temp_c", "min"),
        max_temp=("temp_c", "max"),
        hh_size=("hh_size", "max"),     # latest/max for the day
        events=("kwh", "count")
    )
)

# Calendar features for later use
agg["dow"] = agg["date"].dt.dayofweek                # 0=Mon
agg["is_weekend"] = agg["dow"].isin([5, 6]).astype(int)
agg["month"] = agg["date"].dt.month
agg["season"] = agg["month"].map(month_to_season)


In [11]:
agg

Unnamed: 0,home_id,appliance,date,daily_kwh,avg_temp,min_temp,max_temp,hh_size,events,dow,is_weekend,month,season
0,1,Air Conditioning,2023-01-07,2.77,-3.3,-3.3,-3.3,3,1,5,1,1,Winter
1,1,Air Conditioning,2023-01-14,4.80,2.1,2.1,2.1,4,1,5,1,1,Winter
2,1,Air Conditioning,2023-01-15,3.13,1.0,1.0,1.0,2,1,6,1,1,Winter
3,1,Air Conditioning,2023-01-19,3.10,32.4,32.4,32.4,4,1,3,0,1,Winter
4,1,Air Conditioning,2023-01-29,4.16,34.6,34.6,34.6,3,1,6,1,1,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97417,500,Washing Machine,2023-11-04,1.20,-4.1,-4.1,-4.1,4,1,5,1,11,Fall
97418,500,Washing Machine,2023-11-05,0.74,13.6,13.6,13.6,3,1,6,1,11,Fall
97419,500,Washing Machine,2023-11-18,1.02,2.6,2.6,2.6,5,1,5,1,11,Fall
97420,500,Washing Machine,2023-12-24,1.66,12.7,12.7,12.7,4,1,6,1,12,Winter


In [12]:
# ------------------------
# 4) Cross-Home Appliance-Level Daily Series (Prophet-ready)
# ------------------------
# Aggregate across homes to learn generalized appliance behavior
appliance_daily = (
    agg
    .groupby(["appliance", "date"], as_index=False)
    .agg(
        y=("daily_kwh", "mean"),          # target series
        avg_temp=("avg_temp", "mean"),
        hh_size=("hh_size", "mean"),
        is_weekend=("is_weekend", "max")
    )
    .rename(columns={"date": "ds"})
    .sort_values(["appliance", "ds"])
)

# Drop rows lacking core fields
appliance_daily = appliance_daily.dropna(subset=["ds", "y", "avg_temp"])

In [13]:
appliance_daily

Unnamed: 0,appliance,ds,y,avg_temp,hh_size,is_weekend
0,Air Conditioning,2023-01-01,3.733448,15.896552,3.034483,1
1,Air Conditioning,2023-01-02,3.922308,17.776923,3.269231,0
2,Air Conditioning,2023-01-03,3.571905,10.247619,2.952381,0
3,Air Conditioning,2023-01-04,3.718333,14.780000,3.233333,0
4,Air Conditioning,2023-01-05,3.623750,14.743750,2.925000,0
...,...,...,...,...,...,...
3655,Washing Machine,2023-12-28,1.206957,13.878261,2.478261,0
3656,Washing Machine,2023-12-29,1.109655,15.262069,2.827586,0
3657,Washing Machine,2023-12-30,1.240000,18.755000,3.400000,1
3658,Washing Machine,2023-12-31,1.181667,11.820000,3.266667,1


In [14]:
# ------------------------
# 5) Time-Aware Split
# ------------------------
all_ds = np.sort(appliance_daily["ds"].unique())
cut_idx = int(0.8 * len(all_ds))
split_date = all_ds[cut_idx] if cut_idx < len(all_ds) else all_ds[-1]
print(f"Time split: Train <= {pd.to_datetime(split_date).date()} | Valid > split")

Time split: Train <= 2023-10-20 | Valid > split


In [15]:
# ------------------------
# 6) Metrics (sMAPE)
# ------------------------
def smape(y_true, y_pred) -> float:
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    denom = np.where(denom == 0, 1.0, denom)
    return float(np.mean(diff / denom) * 100.0)

In [16]:
# ------------------------
# 7) Prophet Factory
# ------------------------
def make_prophet():
    """
    Daily/weekly/yearly seasonalities with external regressors.
    """
    m = Prophet(
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=True,
        seasonality_mode="additive"   # try "multiplicative" later if level-scaled seasonality
    )
    m.add_regressor("avg_temp", standardize=True)
    m.add_regressor("hh_size", standardize=True)
    m.add_regressor("is_weekend")     # binary; Prophet handles as-is
    return m


In [17]:
# ------------------------
# 8) Train, Validate, Save per-appliance models
# ------------------------
model_registry: Dict[str, Dict[str, Any]] = {}
scores = []

for app in sorted(appliance_daily["appliance"].unique()):
    df_app = appliance_daily[appliance_daily["appliance"] == app].copy()
    if df_app["ds"].nunique() < 60:
        print(f"Skipping '{app}': insufficient history (<60 days).")
        continue

    train_df = df_app[df_app["ds"] <= split_date][["ds","y","avg_temp","hh_size","is_weekend"]].dropna()
    valid_df = df_app[df_app["ds"] >  split_date][["ds","y","avg_temp","hh_size","is_weekend"]].dropna()

    if len(valid_df) < 7:
        print(f"Skipping '{app}': too small validation window.")
        continue

    m = make_prophet()
    m.fit(train_df)

    fc_valid = m.predict(valid_df[["ds","avg_temp","hh_size","is_weekend"]])
    y_true = valid_df["y"].values
    y_pred = fc_valid["yhat"].values

    mae  = mean_absolute_error(y_true, y_pred)
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    s    = smape(y_true, y_pred)

    print(f"[{app}] N={len(y_true)}  MAE={mae:.4f} kWh  RMSE={rmse:.4f} kWh  sMAPE={s:.2f}%")
    scores.append({"appliance": app, "n": len(y_true), "mae": mae, "rmse": rmse, "smape": s})

    # Save model
    safe_app = app.replace(" ", "_").lower()
    model_path = os.path.join(ARTIFACT_DIR, f"prophet_{safe_app}.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(m, f)

    # Register
    model_registry[app] = {
        "model_path": model_path,
        "regressors": ["avg_temp", "hh_size", "is_weekend"],
        "split_date": str(pd.to_datetime(split_date).date())
    }

# Weighted metrics across appliances
if scores:
    total_n = sum(s["n"] for s in scores)
    w_mae   = sum(s["mae"]  * s["n"] for s in scores) / total_n
    w_rmse  = sum(s["rmse"] * s["n"] for s in scores) / total_n
    w_smape = sum(s["smape"]* s["n"] for s in scores) / total_n
    print("\n=== Weighted Validation Metrics ===")
    print(f"MAE={w_mae:.4f} kWh | RMSE={w_rmse:.4f} kWh | sMAPE={w_smape:.2f}%")

# ------------------------
# 9) Save Clean Registry
# ------------------------
REGISTRY_PATH = os.path.join(ARTIFACT_DIR, "prophet_registry.json")

# Normalize paths before saving
clean_registry = {}
for app, meta in model_registry.items():
    raw_path = meta["model_path"]
    # Ensure forward slashes and remove redundant folder prefixes
    clean_path = raw_path.replace("\\", "/").replace("artifacts/", "").replace("./", "")
    clean_registry[app] = {**meta, "model_path": clean_path}

with open(REGISTRY_PATH, "w") as f:
    json.dump(clean_registry, f, indent=2)

print(f"\n✅ Saved cleaned model registry → {REGISTRY_PATH}")



08:25:39 - cmdstanpy - INFO - Chain [1] start processing
08:25:39 - cmdstanpy - INFO - Chain [1] done processing
08:25:39 - cmdstanpy - INFO - Chain [1] start processing
08:25:39 - cmdstanpy - INFO - Chain [1] done processing


[Air Conditioning] N=73  MAE=0.1571 kWh  RMSE=0.2011 kWh  sMAPE=4.41%
[Computer] N=73  MAE=0.1139 kWh  RMSE=0.1406 kWh  sMAPE=10.07%


08:25:40 - cmdstanpy - INFO - Chain [1] start processing
08:25:40 - cmdstanpy - INFO - Chain [1] done processing
08:25:40 - cmdstanpy - INFO - Chain [1] start processing
08:25:40 - cmdstanpy - INFO - Chain [1] done processing


[Dishwasher] N=73  MAE=0.1539 kWh  RMSE=0.1804 kWh  sMAPE=13.82%
[Fridge] N=73  MAE=0.0223 kWh  RMSE=0.0270 kWh  sMAPE=7.35%


08:25:40 - cmdstanpy - INFO - Chain [1] start processing
08:25:40 - cmdstanpy - INFO - Chain [1] done processing
08:25:40 - cmdstanpy - INFO - Chain [1] start processing
08:25:40 - cmdstanpy - INFO - Chain [1] done processing


[Heater] N=73  MAE=0.2502 kWh  RMSE=0.3035 kWh  sMAPE=7.20%
[Lights] N=73  MAE=0.5814 kWh  RMSE=0.6604 kWh  sMAPE=60.99%


08:25:40 - cmdstanpy - INFO - Chain [1] start processing
08:25:40 - cmdstanpy - INFO - Chain [1] done processing
08:25:40 - cmdstanpy - INFO - Chain [1] start processing
08:25:40 - cmdstanpy - INFO - Chain [1] done processing


[Microwave] N=73  MAE=0.1120 kWh  RMSE=0.1332 kWh  sMAPE=10.08%
[Oven] N=73  MAE=0.0862 kWh  RMSE=0.1099 kWh  sMAPE=7.73%


08:25:41 - cmdstanpy - INFO - Chain [1] start processing
08:25:41 - cmdstanpy - INFO - Chain [1] done processing
08:25:41 - cmdstanpy - INFO - Chain [1] start processing
08:25:41 - cmdstanpy - INFO - Chain [1] done processing


[TV] N=73  MAE=0.1014 kWh  RMSE=0.1284 kWh  sMAPE=8.94%
[Washing Machine] N=73  MAE=0.1249 kWh  RMSE=0.1577 kWh  sMAPE=11.26%

=== Weighted Validation Metrics ===
MAE=0.1703 kWh | RMSE=0.2042 kWh | sMAPE=14.19%

✅ Saved cleaned model registry → ./artifacts\prophet_registry.json


In [18]:
from pathlib import Path

# ------------------------
# 10) Inference Helper (Next-Day)
# ------------------------
def predict_next_day(
    appliance: str,
    ds_next: str,           # "YYYY-MM-DD"
    avg_temp: float,
    hh_size: float,
    is_weekend: int         # 0/1
) -> Dict[str, Any]:
    """
    Load the per-appliance Prophet model and forecast next-day kWh.
    Supports registry model paths saved as relative (e.g., prophet_air_conditioning.pkl)
    or absolute paths.
    """
    # Load registry
    with open(REGISTRY_PATH) as f:
        reg = json.load(f)

    if appliance not in reg:
        raise ValueError(f"No Prophet model registered for appliance: '{appliance}'")

    # Resolve path to model
    raw_path = reg[appliance]["model_path"]
    model_path = Path(raw_path)

    # If the model path is not absolute, assume it's relative to ARTIFACT_DIR
    if not model_path.is_absolute():
        model_path = ARTIFACT_DIR / model_path

    if not model_path.exists():
        raise FileNotFoundError(f"Model file not found: {model_path}")

    # Load Prophet model
    with open(model_path, "rb") as f:
        m: Prophet = pickle.load(f)

    # Build future row with required regressors
    future = pd.DataFrame([{
        "ds": pd.to_datetime(ds_next),
        "avg_temp": float(avg_temp),
        "hh_size": float(hh_size),
        "is_weekend": int(is_weekend)
    }])

    fc = m.predict(future)
    return {
        "appliance": appliance,
        "date": pd.to_datetime(ds_next).strftime("%Y-%m-%d"),
        "yhat": float(fc.loc[0, "yhat"]),
        "yhat_lower": float(fc.loc[0, "yhat_lower"]),
        "yhat_upper": float(fc.loc[0, "yhat_upper"])
    }


In [19]:
# ------------------------
# 11) Example Inference
# ------------------------
ex = predict_next_day(
    appliance="Washing Machine",
    ds_next=str((pd.Timestamp.today().normalize() + pd.Timedelta(days=1)).date()),
    avg_temp=10.0,
    hh_size=3,
    is_weekend=1
)
ex

{'appliance': 'Washing Machine',
 'date': '2025-11-13',
 'yhat': 1.0527026292690926,
 'yhat_lower': 0.9128864729453346,
 'yhat_upper': 1.1888741792228854}