In [6]:
# === Cell 1: Imports & load dataset ===
import pandas as pd
import numpy as np
from pathlib import Path

DATA = Path("D:/AIT ML/Deployment_Energy404/Rooftop-Solar-Potential-Predictor/dataset/dataset.parquet")
df = pd.read_parquet(DATA)

print("Shape:", df.shape)
print("Unique cities:", df["City"].nunique())
print("Unique building types:", df["BuildingType"].nunique())
print("\nCounts per BuildingType:\n", df["BuildingType"].value_counts())


Shape: (249428, 12)
Unique cities: 20
Unique building types: 10

Counts per BuildingType:
 BuildingType
commercial                   40000
industrial                   40000
single family residential    40000
multifamily residential      40000
public sector                40000
peri-urban settlement        16960
schools                      14596
public health facilities      8009
hotels                        7493
small commercial              2370
Name: count, dtype: int64


In [7]:
# === Cell 2: Feature setup, target clipping, interactions ===

TARGET = "kWh_per_m2"
CAT = ["BuildingType"]
BASE_NUM = [
    "tilt","tilt2","tilt_sin","tilt_cos",
    "GHI_kWh_per_m2_day","AvgTemp_C",
    "ClearnessIndex","Precip_mm_per_day"
]

# Start with base numeric features
df_feat = df.copy()

# --- Target clipping (outlier control) ---
y_raw = df_feat[TARGET].astype(float)
low_q, high_q = y_raw.quantile([0.01, 0.99])
print(f"Target clip range: [{low_q:.2f}, {high_q:.2f}] kWh/mÂ²/yr")

df_feat[TARGET] = y_raw.clip(low_q, high_q)

# --- Simple interaction features (no new external data) ---
df_feat["tilt_x_GHI"]      = df_feat["tilt"] * df_feat["GHI_kWh_per_m2_day"]
df_feat["temp_sq"]         = df_feat["AvgTemp_C"] ** 2
df_feat["clear_x_tiltcos"] = df_feat["ClearnessIndex"] * df_feat["tilt_cos"]
df_feat["precip_x_clear"]  = df_feat["Precip_mm_per_day"] * (1.0 - df_feat["ClearnessIndex"])

NUM = BASE_NUM + ["tilt_x_GHI", "temp_sq", "clear_x_tiltcos", "precip_x_clear"]

print("Using numeric features:\n", NUM)


Target clip range: [210.48, 346.72] kWh/mÂ²/yr
Using numeric features:
 ['tilt', 'tilt2', 'tilt_sin', 'tilt_cos', 'GHI_kWh_per_m2_day', 'AvgTemp_C', 'ClearnessIndex', 'Precip_mm_per_day', 'tilt_x_GHI', 'temp_sq', 'clear_x_tiltcos', 'precip_x_clear']


In [8]:
# === Cell 3: Build X, y, encoded X ===

X = df_feat[NUM + CAT].copy()
y = df_feat[TARGET].copy()

# Categorical for LightGBM
for c in CAT:
    X[c] = X[c].astype("category")

# Encoded version for XGB / RF / ExtraTrees
X_encoded = X.copy()
X_encoded["BuildingType"] = X_encoded["BuildingType"].cat.codes

# Log-transform target (after clipping)
y_log = np.log1p(y)

print("X shape:", X.shape)
print("Encoded X shape:", X_encoded.shape)


X shape: (249428, 13)
Encoded X shape: (249428, 13)


In [9]:
# === Cell 4: Define model parameters ===
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Slightly tuned parameters (more expressive than your earlier baseline)
lgb_base_params = dict(
    objective='mae',
    n_estimators=3000,
    learning_rate=0.02,
    num_leaves=31,
    min_child_samples=60,
    lambda_l1=1.0,
    lambda_l2=1.0,
    subsample=0.8,
    colsample_bytree=0.8
)

xgb_base_params = dict(
    objective='reg:squarederror',
    n_estimators=2500,
    learning_rate=0.02,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
)

rf_params = dict(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)

et_params = dict(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)

# Seeds for bagging LGB & XGB
SEEDS = [42, 1337]


In [11]:
# === Cell 5: 5-fold GroupKFold stacking ensemble with bagging ===

cv = GroupKFold(n_splits=5)
mae_scores = []
oof = []

for fold, (tr, va) in enumerate(cv.split(X, y_log, groups=df_feat["City"]), 1):
    print(f"\n===== Fold {fold} =====")
    X_tr_lgb, X_va_lgb = X.iloc[tr], X.iloc[va]
    X_tr_enc, X_va_enc = X_encoded.iloc[tr], X_encoded.iloc[va]
    y_tr, y_va = y_log.iloc[tr], y_log.iloc[va]

    # --- Bagged LightGBM & XGB ---
    pred_lgb_list = []
    pred_xgb_list = []

    for seed in SEEDS:
        # LightGBM
        lgb = LGBMRegressor(random_state=seed, **lgb_base_params)
        lgb.fit(
            X_tr_lgb, y_tr,
            eval_set=[(X_va_lgb, y_va)],
            callbacks=[early_stopping(stopping_rounds=200, verbose=False)]
        )
        pred_lgb_list.append(np.expm1(lgb.predict(X_va_lgb)))

        # XGBoost
        xgb = XGBRegressor(random_state=seed, **xgb_base_params)
        xgb.fit(
            X_tr_enc, y_tr,
            eval_set=[(X_va_enc, y_va)],
            verbose=False
        )
        pred_xgb_list.append(np.expm1(xgb.predict(X_va_enc)))

    # Average over seeds
    pred_lgb = np.mean(pred_lgb_list, axis=0)
    pred_xgb = np.mean(pred_xgb_list, axis=0)

    # --- RF & ExtraTrees (single seed) ---
    rf = RandomForestRegressor(**rf_params)
    et = ExtraTreesRegressor(**et_params)

    rf.fit(X_tr_enc, y_tr)
    et.fit(X_tr_enc, y_tr)

    pred_rf = np.expm1(rf.predict(X_va_enc))
    pred_et = np.expm1(et.predict(X_va_enc))

    # --- Meta-learner (Ridge) ---
    meta_X = np.column_stack([pred_lgb, pred_xgb, pred_rf, pred_et])
    y_va_lin = np.expm1(y_va)  # back to original scale for meta

    meta = Ridge(alpha=0.5)
    meta.fit(meta_X, y_va_lin)
    stacked = meta.predict(meta_X)

    y_true = y_va_lin
    mae = mean_absolute_error(y_true, stacked)
    mae_scores.append(mae)
    print(f"Fold {fold} MAE = {mae:.3f}")

    # Store OOF for analysis
    oof.append(pd.DataFrame({
        "City": df_feat.loc[va, "City"].values,
        "BuildingType": df_feat.loc[va, "BuildingType"].values,
        "y_true": y_true,
        "y_pred": stacked
    }))

print(f"\nðŸŽ¯ Final Stacked Ensemble MAE (5-fold, tuned + bagged): {np.mean(mae_scores):.3f} Â± {np.std(mae_scores):.3f}")

oof_df = pd.concat(oof, ignore_index=True)
print("Overall OOF MAE:", mean_absolute_error(oof_df["y_true"], oof_df["y_pred"]))



===== Fold 1 =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1642
[LightGBM] [Info] Number of data points in the train set: 202186, number of used features: 13
[LightGBM] [Info] Start training from score 5.610222
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1642
[LightGBM] [Info] Number of data points in the train set: 202186, number of used features: 13
[LightGBM] [Info] Start training from score 5.610222
Fold 1 MAE = 8.662

===== Fold 2 =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1636
[LightGBM] [Info] Number of data points in the train set: 1

KeyboardInterrupt: 

In [None]:
# === Cell 6: MAE by BuildingType ===
type_mae = (
    oof_df.groupby("BuildingType", group_keys=False)
          .apply(lambda d: mean_absolute_error(d["y_true"], d["y_pred"]))
          .round(2)
)
print("\nMAE by BuildingType:")
print(type_mae)



MAE by BuildingType:
BuildingType
commercial                   13.51
hotels                       10.65
industrial                   10.86
multifamily residential      12.20
peri-urban settlement         7.70
public health facilities     13.13
public sector                12.39
schools                      14.01
single family residential    15.06
small commercial             13.67
dtype: float64


  .apply(lambda d: mean_absolute_error(d["y_true"], d["y_pred"]))


In [12]:
import pandas as pd

df = pd.read_parquet("D:/AIT ML/Deployment_Energy404/Rooftop-Solar-Potential-Predictor/dataset/top20_balanced_by_type.parquet")
df["City"].unique()


array(['Colombo', 'Maldives', 'Karachi', 'Beirut', 'Antigua', 'Izmir',
       'Honduras', 'Panama', 'Nairobi', 'Lagos', 'LagosState',
       'Samarkand', 'Accra', 'Mexico City', 'SouthAfrica', 'DarEsSalaam',
       'Almaty', 'Manila', 'GreatDhakaRegion', 'Grenada'], dtype=object)

In [13]:
# === Cell 7: Train final ensemble on FULL dataset for deployment ===
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
import numpy as np

# Reuse your tuned parameters
lgb_base_params = dict(
    objective='mae',
    n_estimators=3000,
    learning_rate=0.02,
    num_leaves=31,
    min_child_samples=60,
    lambda_l1=1.0,
    lambda_l2=1.0,
    subsample=0.8,
    colsample_bytree=0.8
)
xgb_base_params = dict(
    objective='reg:squarederror',
    n_estimators=2500,
    learning_rate=0.02,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
)
rf_params = dict(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)
et_params = dict(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)

# Train each base model on the full log-transformed target
print("ðŸ”§ Training final models on full dataset...")
lgb_models = []
xgb_models = []
rf_models = []
et_models = []

for seed in [42, 1337]:
    lgb = LGBMRegressor(random_state=seed, **lgb_base_params)
    lgb.fit(X, y_log)
    lgb_models.append(lgb)

    xgb = XGBRegressor(random_state=seed, **xgb_base_params)
    xgb.fit(X_encoded, y_log)
    xgb_models.append(xgb)

rf = RandomForestRegressor(**rf_params)
rf.fit(X_encoded, y_log)
rf_models.append(rf)

et = ExtraTreesRegressor(**et_params)
et.fit(X_encoded, y_log)
et_models.append(et)

# Prepare meta learner on full data (using stacked predictions)
print("ðŸ”§ Training meta-learner (Ridge)...")
pred_lgb_full = np.mean([np.expm1(m.predict(X)) for m in lgb_models], axis=0)
pred_xgb_full = np.mean([np.expm1(m.predict(X_encoded)) for m in xgb_models], axis=0)
pred_rf_full  = np.expm1(rf.predict(X_encoded))
pred_et_full  = np.expm1(et.predict(X_encoded))

meta_X_full = np.column_stack([pred_lgb_full, pred_xgb_full, pred_rf_full, pred_et_full])
meta_y_full = np.expm1(y_log)

meta_model = Ridge(alpha=0.5)
meta_model.fit(meta_X_full, meta_y_full)

print("âœ… Finished training all components for deployment.")


ðŸ”§ Training final models on full dataset...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1658
[LightGBM] [Info] Number of data points in the train set: 249428, number of used features: 13
[LightGBM] [Info] Start training from score 5.628721
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1658
[LightGBM] [Info] Number of data points in the train set: 249428, number of used features: 13
[LightGBM] [Info] Start training from score 5.628721
ðŸ”§ Training meta-learner (Ridge)...
âœ… Finished training all components for deployment.


In [14]:
# === Optional: sanity check MAE on training data (not for validation) ===
from sklearn.metrics import mean_absolute_error

# Generate predictions from final ensemble
pred_lgb_full = np.mean([np.expm1(m.predict(X)) for m in lgb_models], axis=0)
pred_xgb_full = np.mean([np.expm1(m.predict(X_encoded)) for m in xgb_models], axis=0)
pred_rf_full  = np.expm1(rf_models[0].predict(X_encoded))
pred_et_full  = np.expm1(et_models[0].predict(X_encoded))

meta_X_full = np.column_stack([pred_lgb_full, pred_xgb_full, pred_rf_full, pred_et_full])
pred_final = meta_model.predict(meta_X_full)

mae_check = mean_absolute_error(np.expm1(y_log), pred_final)
print(f"ðŸ§© Sanity check MAE (on training data, not validation): {mae_check:.3f} kWh/mÂ²")


ðŸ§© Sanity check MAE (on training data, not validation): 5.251 kWh/mÂ²


In [15]:
# === Cell 8: Export trained ensemble models for deployment ===
import joblib
from pathlib import Path

models_dir = Path("..") / "models"
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(lgb_models, models_dir / "lgb_models.pkl")
joblib.dump(xgb_models, models_dir / "xgb_models.pkl")
joblib.dump(rf_models,  models_dir / "rf_models.pkl")
joblib.dump(et_models,  models_dir / "et_models.pkl")
joblib.dump(meta_model, models_dir / "meta_model.pkl")

config = {
    "NUM": NUM,
    "CAT": CAT,
    "BuildingType_categories": list(X["BuildingType"].cat.categories),
}
joblib.dump(config, models_dir / "feature_config.pkl")

print("âœ… All model components saved to:", models_dir.resolve())


âœ… All model components saved to: D:\AIT ML\Deployment_Energy404\Rooftop-Solar-Potential-Predictor\models
