In [9]:
# Blending for PEMFC with V as target
# ------------------------------------------------------------
# Models: LightGBM, XGBoost, GradientBoosting, RandomForest
# Robust to categorical columns + missing values
# Falls back gracefully if LightGBM/XGBoost are unavailable
# ------------------------------------------------------------

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Optional models (import if available)
HAS_LGBM = True
HAS_XGB  = True
try:
    import lightgbm as lgb
except Exception:
    HAS_LGBM = False

try:
    from xgboost import XGBRegressor
except Exception:
    HAS_XGB = False

RANDOM_STATE = 42
TEST_SIZE = 0.2
N_ESTIMATORS = 800  # safe default; you can tune

# -----------------------------
# 1) Load data
# -----------------------------
CSV_PATH = r"C:\D-drive\Coding\Major-Project\new_\data\preprocessed_data.csv"
df = pd.read_csv(CSV_PATH)

if "V" not in df.columns:
    raise ValueError(
        f"'V' column not found in {CSV_PATH}. "
        f"Available columns: {list(df.columns)}"
    )

# Drop rows with missing target; coerce to numeric if needed
df = df.copy()
df["V"] = pd.to_numeric(df["V"], errors="coerce")
df = df.dropna(subset=["V"])

# Features/Target
X = df.drop(columns=["V"])
y = df["V"].astype(float)

# Identify column types
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

# -----------------------------
# 2) Preprocessor
# -----------------------------
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

if len(cat_cols) > 0:
    cat_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Using dense output for compatibility with sklearn ensembles
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_transformer, num_cols),
            ("cat", cat_transformer, cat_cols),
        ]
    )
else:
    # Only numeric columns -> simple numeric pipeline
    preprocessor = num_transformer

# -----------------------------
# 3) Define models (pipelines)
# -----------------------------
models = {}

# Gradient Boosting (sklearn)
models["GradientBoosting"] = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", GradientBoostingRegressor(
        random_state=RANDOM_STATE,
        n_estimators=N_ESTIMATORS,
        learning_rate=0.05,
        max_depth=3
    ))
])

# Random Forest (sklearn)
models["RandomForest"] = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestRegressor(
        random_state=RANDOM_STATE,
        n_estimators=N_ESTIMATORS,
        max_depth=None,
        n_jobs=-1
    ))
])

# LightGBM (optional)
if HAS_LGBM:
    models["LightGBM"] = Pipeline(steps=[
        ("prep", preprocessor),
        ("model", lgb.LGBMRegressor(
            random_state=RANDOM_STATE,
            n_estimators=N_ESTIMATORS,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            n_jobs=-1
        ))
    ])

# XGBoost (optional)
if HAS_XGB:
    models["XGBoost"] = Pipeline(steps=[
        ("prep", preprocessor),
        ("model", XGBRegressor(
            random_state=RANDOM_STATE,
            n_estimators=N_ESTIMATORS,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.8,
            max_depth=8,
            reg_lambda=1.0,
            tree_method="hist",
            n_jobs=-1
        ))
    ])

if len(models) == 0:
    raise RuntimeError(
        "No models available. Install at least scikit-learn. "
        "Optional: pip install lightgbm xgboost"
    )

print(f"Using base models: {list(models.keys())}")

# -----------------------------
# 4) Train/Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# -----------------------------
# 5) Fit models & predict
# -----------------------------
preds = {}
fitted = {}

for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    fitted[name] = pipe
    preds[name] = pipe.predict(X_test)

# -----------------------------
# 6) Equal-weight blending
# -----------------------------
# stack predictions into shape (n_models, n_samples)
pred_matrix = np.vstack([preds[name] for name in models.keys()])
blend_pred = pred_matrix.mean(axis=0)

# -----------------------------
# 7) Evaluation helpers
# -----------------------------
def metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)   # older sklearn doesn't support squared=
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, r2, mae

# Blended metrics
b_rmse, b_r2, b_mae = metrics(y_test, blend_pred)

print("\n===== Blended Model (Equal Weights) =====")
print(f"RMSE: {b_rmse:.6f}")
print(f"R²:   {b_r2:.6f}")
print(f"MAE:  {b_mae:.6f}")

# Individual model metrics
print("\n===== Individual Base Models =====")
for name in models.keys():
    rmse, r2, mae = metrics(y_test, preds[name])
    print(f"{name:16s} RMSE={rmse:.6f}  R²={r2:.6f}  MAE={mae:.6f}")

# -----------------------------
# 8) Quick CV stability check (OOF equal-weight blend)
# -----------------------------
print("\n===== 5-Fold CV (OOF) — Equal-Weight Blend =====")
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

oof_blend = np.zeros(len(y))
for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    fold_preds = []
    for name, proto in models.items():
        # clone-like rebuild to avoid leakage of fitted state
        # re-create pipeline with the same parameters
        # (simple refit of the same object also works here since we overwrite fit)
        pipe = proto
        pipe.fit(X_tr, y_tr)
        fold_preds.append(pipe.predict(X_va))

    fold_pred_matrix = np.vstack(fold_preds)
    oof_blend[va_idx] = fold_pred_matrix.mean(axis=0)

cv_rmse, cv_r2, cv_mae = metrics(y, oof_blend)
print(f"CV RMSE: {cv_rmse:.6f}")
print(f"CV R²:   {cv_r2:.6f}")
print(f"CV MAE:  {cv_mae:.6f}")


Using base models: ['GradientBoosting', 'RandomForest', 'LightGBM', 'XGBoost']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1192, number of used features: 8
[LightGBM] [Info] Start training from score 415.424670





===== Blended Model (Equal Weights) =====
RMSE: 0.690940
R²:   0.999865
MAE:  0.320501

===== Individual Base Models =====
GradientBoosting RMSE=0.821444  R²=0.999810  MAE=0.412600
RandomForest     RMSE=1.096253  R²=0.999661  MAE=0.455239
LightGBM         RMSE=0.808763  R²=0.999815  MAE=0.370419
XGBoost          RMSE=0.947344  R²=0.999747  MAE=0.391671

===== 5-Fold CV (OOF) — Equal-Weight Blend =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1192, number of used features: 8
[LightGBM] [Info] Start training from score 415.424670




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1193, number of used features: 8
[LightGBM] [Info] Start training from score 416.701113




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1193, number of used features: 8
[LightGBM] [Info] Start training from score 415.146387




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1193, number of used features: 8
[LightGBM] [Info] Start training from score 415.083492




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1193, number of used features: 8
[LightGBM] [Info] Start training from score 417.539125




CV RMSE: 0.648372
CV R²:   0.999891
CV MAE:  0.269187
