In [1]:
# -*- coding: utf-8 -*-
"""
Production Quantity Modeling — Multi-Scenario by Process Stages (FINAL+SAVES)
- 요구사항: 시나리오별 학습/평가 + 모든 산출물 저장
"""

import os, re, json, warnings, inspect
warnings.filterwarnings("ignore")

from typing import Optional, Dict, Tuple, List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.base import clone

from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

import joblib

# ===== Optional boosters =====
have_xgb = have_lgbm = have_cat = False
try:
    from xgboost import XGBRegressor
    have_xgb = True
except Exception:
    pass
try:
    from lightgbm import LGBMRegressor
    have_lgbm = True
except Exception:
    pass
try:
    from catboost import CatBoostRegressor
    have_cat = True
except Exception:
    pass

# ===== PyTorch (AI 모델 1개) =====
have_torch = False
try:
    import torch
    import torch.nn as nn
    have_torch = True
except Exception:
    pass

# =====================
# 0) CONFIG
# =====================
# Input CSV, target column, RNG seed and output directory shared by every step below.
FILE_PATH   = r"Final Results Extended.csv"   # ← change this to the path of your CSV
TARGET_COL  = "c_TotalProducts"
RANDOM_SEED = 42
# NOTE: the output directory is created as a side effect when this line runs.
OUT_DIR     = "outputs"; os.makedirs(OUT_DIR, exist_ok=True)
PROGRESS_PERIOD = 50  # logging period (in boosting iterations) for the boosters

# Prefilter thresholds
# A numeric feature is dropped when its missing fraction is >= DROP_MISSING_PCT,
# when its zero fraction (missing values counted as zero) is >= DROP_ZERO_PCT,
# or when its variance is <= DROP_NZV_VAR. Categorical features only use the
# missing-fraction threshold.
DROP_MISSING_PCT = 0.95
DROP_ZERO_PCT    = 0.95
DROP_NZV_VAR     = 1e-8

def log(*args, **kwargs):
    """Print like the built-in ``print`` but flush by default.

    Any keyword accepted by ``print`` may be passed; an explicit ``flush``
    value supplied by the caller wins over the default of ``True``.
    """
    if 'flush' not in kwargs:
        kwargs['flush'] = True
    print(*args, **kwargs)

def save_fig(path):
    """Save the current matplotlib figure to *path* and close it.

    The parent directory is created when *path* contains one. A bare file
    name (no directory component) is written to the current working
    directory; ``os.makedirs('')`` would raise, so that case is skipped.
    The figure is closed even when saving fails so that figures do not
    accumulate across the many plots produced per scenario.
    """
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    try:
        plt.tight_layout()
        plt.savefig(path, dpi=150)
    finally:
        plt.close()

# =====================
# 1) LOAD & BASIC CLEAN
# =====================
# Load the raw CSV; low_memory=False makes pandas read each column in one pass
# instead of chunk-wise dtype guessing.
df = pd.read_csv(FILE_PATH, low_memory=False)
# Normalise headers: strip surrounding whitespace and collapse runs of two or
# more underscores into a single underscore.
df.columns = [re.sub(r"__+", "_", c.strip()) for c in df.columns]

# Fallback when the configured target is absent: pick the numeric column with
# the largest variance and warn the user (the message below is printed in Korean
# and asks the user to set the real target).
if TARGET_COL not in df.columns:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        raise ValueError("No numeric columns and target not found.")
    TARGET_COL = df[num_cols].var().sort_values(ascending=False).index[0]
    print(f"[WARN] '{TARGET_COL}'을(를) 타깃으로 가정합니다. 실제 타깃을 설정하세요.")

# =====================
# 2) DATETIME DETECTION
# =====================
def is_datetime_like(s: pd.Series, sample: int = 500) -> bool:
    """Return True when at least 80% of the sampled values parse as datetimes.

    Parameters
    ----------
    s : pd.Series
        Column to test.
    sample : int
        Maximum number of leading non-null values that are parsed.

    Notes
    -----
    * Numeric columns are never treated as datetimes. The caller also probes
      columns whose *name* contains "time"/"date"; without this guard numeric
      columns with such names would have their values stringified and offered
      to the date parser. Dates stored as plain integers (e.g. 20240131)
      must therefore be converted explicitly before calling this helper.
    * ``infer_datetime_format`` is no longer passed: it is deprecated since
      pandas 2.0, where format inference is the default behaviour.
    """
    if pd.api.types.is_numeric_dtype(s):
        return False
    vals = s.dropna().astype(str).head(sample)
    if vals.empty:
        return False
    parsed = pd.to_datetime(vals, errors="coerce")
    # bool(...) so callers get a plain Python bool rather than numpy.bool_.
    return bool(parsed.notna().mean() >= 0.8)

# Pick the first column (other than the target) that looks like a datetime.
# Only columns whose name hints at a date/time, or whose dtype is object, are
# probed. The winning column is converted in place and its name is kept in
# `dt_col`; `dt_col` stays None when nothing qualifies.
dt_col: Optional[str] = None
for c in df.columns:
    if c == TARGET_COL:
        continue
    hint = bool(re.search(r"(date|time|timestamp)", c, re.I))
    if hint or df[c].dtype == "object":
        if is_datetime_like(df[c]):
            dt_col = c
            # `infer_datetime_format` is deprecated since pandas 2.0 (format
            # inference is now the default), so it is no longer passed.
            df[c] = pd.to_datetime(df[c], errors="coerce")
            break

# Derive simple calendar features from the detected datetime column.
if dt_col is not None:
    parsed = df[dt_col]  # already datetime64 after the conversion above
    df["year"]      = parsed.dt.year
    df["month"]     = parsed.dt.month
    df["day"]       = parsed.dt.day
    df["dayofweek"] = parsed.dt.dayofweek

# =====================
# 3) PREFILTER
# =====================
def prefilter_features(frame: pd.DataFrame, target: str):
    """Drop near-empty / near-constant features and write a per-column report.

    Parameters
    ----------
    frame : pd.DataFrame
        Full data set including the target column.
    target : str
        Name of the target column; it is removed before the features are analysed.

    Returns
    -------
    (pd.DataFrame, list)
        The feature frame without the dropped columns, and the list of dropped
        column names (numeric columns first, then categorical, each in frame
        order, so the list is reproducible between runs).

    Side effects
    ------------
    Writes ``prefilter_report.csv`` into ``OUT_DIR`` and prints a summary line.

    Drop rules
    ----------
    * numeric: missing fraction >= DROP_MISSING_PCT, at most one distinct
      value, zero fraction >= DROP_ZERO_PCT (missing values count as zero), or
      variance <= DROP_NZV_VAR;
    * non-numeric: missing fraction >= DROP_MISSING_PCT or at most one
      distinct value.
    """
    feats = frame.drop(columns=[target])
    report = []
    dropped: List[str] = []  # a list (not a set) keeps the order deterministic

    num_cols = feats.select_dtypes(include=[np.number]).columns
    cat_cols = feats.select_dtypes(exclude=[np.number]).columns

    # Numeric
    for c in num_cols:
        s = pd.to_numeric(feats[c], errors='coerce')
        miss = float(s.isna().mean())
        zero = float((s.fillna(0) == 0).mean())
        uniq = int(s.nunique(dropna=True))
        var = float(s.var()) if len(s) > 1 else 0.0
        sug = (miss >= DROP_MISSING_PCT) or (uniq <= 1) or (zero >= DROP_ZERO_PCT) or (var <= DROP_NZV_VAR)
        if sug:
            dropped.append(c)
        report.append({'column': c, 'dtype': 'numeric', 'missing_pct': miss, 'zero_pct': zero,
                       'unique': uniq, 'variance': var, 'suggest_drop': bool(sug)})

    # Categorical
    for c in cat_cols:
        s = feats[c].astype('object')
        miss = float(s.isna().mean())
        uniq = int(s.nunique(dropna=True))
        sug = (miss >= DROP_MISSING_PCT) or (uniq <= 1)
        if sug:
            dropped.append(c)
        report.append({'column': c, 'dtype': 'categorical', 'missing_pct': miss, 'zero_pct': np.nan,
                       'unique': uniq, 'variance': np.nan, 'suggest_drop': bool(sug)})

    # Explicit columns keep sort_values() working when there are no features at all.
    report_cols = ['column', 'dtype', 'missing_pct', 'zero_pct', 'unique', 'variance', 'suggest_drop']
    rep_df = pd.DataFrame(report, columns=report_cols).sort_values(
        ['suggest_drop', 'missing_pct'], ascending=[False, False])
    report_path = os.path.join(OUT_DIR, 'prefilter_report.csv')
    rep_df.to_csv(report_path, index=False)
    # Report the real path so the message stays correct when OUT_DIR is changed.
    print(f"Prefilter: {len(dropped)} columns dropped. Report → {report_path}")

    return feats.drop(columns=dropped), dropped

# Run the prefilter, then build the global feature matrix / target vector.
X_prefilt, dropped_cols = prefilter_features(df, TARGET_COL)
X_all = X_prefilt.copy()
y_all = pd.to_numeric(df[TARGET_COL], errors="coerce")

# Remove constant / empty columns. An all-NaN column has zero distinct
# non-null values, so the single "at most one distinct value" test covers both.
_distinct_counts = X_all.nunique(dropna=True)
bad_cols = _distinct_counts.index[_distinct_counts <= 1].tolist()
if len(bad_cols) > 0:
    dropped_cols.extend(bad_cols)
    X_all = X_all.drop(columns=bad_cols)

# =====================
# 3.5) LEAKAGE SCAN
# =====================
# When True, columns flagged by the leakage scan are removed from every scenario.
AUTO_DROP_LEAKS   = True
# A feature is suspect when |Pearson correlation with the target| >= this value.
LEAK_CORR_THRESH  = 0.999
# A feature is "equal to the target" when the largest absolute difference
# (ignoring NaN) is <= this tolerance.
EQUAL_TOL         = 1e-9

def leakage_scan(Xdf: pd.DataFrame, yser: pd.Series):
    """Flag features that are (numerically) identical or almost perfectly correlated to the target.

    For every column two statistics are computed after coercing to numeric:
    whether the largest absolute difference to the target (NaN ignored) is
    within ``EQUAL_TOL``, and the Pearson correlation with the target after
    median-filling missing values on both sides (only when both have more than
    three non-null values). Any failure while computing a statistic leaves it
    at its neutral value (``False`` / ``NaN``).

    The full report is written to ``leakage_report_overall.csv`` in ``OUT_DIR``.
    Returns the list of suspect column names in the column order of *Xdf*.
    """
    target = pd.to_numeric(yser, errors='coerce')

    def _matches_target(column) -> bool:
        # True when every comparable value equals the target within tolerance.
        try:
            gap = (pd.to_numeric(column, errors='coerce') - target).abs()
            return bool(np.nanmax(gap.values) <= EQUAL_TOL)
        except Exception:
            return False

    def _pearson(column) -> float:
        # Correlation on median-imputed values; NaN when it cannot be computed.
        try:
            numeric = pd.to_numeric(column, errors='coerce')
            if numeric.notna().sum() > 3 and target.notna().sum() > 3:
                filled_x = numeric.fillna(numeric.median())
                filled_y = target.fillna(target.median())
                return float(np.corrcoef(filled_x, filled_y)[0, 1])
        except Exception:
            pass
        return np.nan

    records = [
        {
            'column': col_name,
            'equal_to_target': _matches_target(Xdf[col_name]),
            'pearson_corr_to_target': _pearson(Xdf[col_name]),
        }
        for col_name in Xdf.columns
    ]

    report = pd.DataFrame(records)
    report['abs_corr'] = report['pearson_corr_to_target'].abs()
    report['suspect'] = report['equal_to_target'] | (report['abs_corr'] >= LEAK_CORR_THRESH)

    ordered = report.sort_values(['suspect', 'abs_corr'], ascending=[False, False])
    ordered.to_csv(os.path.join(OUT_DIR, 'leakage_report_overall.csv'), index=False)

    return report.loc[report['suspect'], 'column'].tolist()

# Leakage suspects computed once on the full feature matrix; each scenario
# later intersects this list with its own columns.
overall_suspects = leakage_scan(X_all, y_all)

# =====================
# 4) PROCESS STAGES & SCENARIOS
# =====================
# Regular expressions (matched case-insensitively with ``search``) that assign
# a column name to a process stage. A column may match several stages.
STAGE_PATTERNS: Dict[str, List[str]] = {
    "blanking":       [r"^Blanking", r"\bBlanking\b"],
    "forklift_b":     [r"Forklift_Blan?king", r"Forklift.*Blank"],
    "press":          [r"^Press[1-4]_"],
    "forklift_p":     [r"Forklift_Press"],
    "warehouse":      [r"^Warehouse[1-4]_"],
    "assembly_cell":  [r"^Cell[1-4]_", r"^c_Cell", r"^c_Cycle", r"^c_.*SKU", r"^SKU[1-4]_"],
    "forklift_a":     [r"Forklift_Assembly"],
    "paint":          [r"^Paint[12]_"],
    "quality":        [r"^Quality_"]
}

def cols_of_stages(cols: List[str], stages: List[str]) -> List[str]:
    """Return the sorted, de-duplicated column names that belong to any of *stages*.

    Stage names without an entry in ``STAGE_PATTERNS`` contribute nothing.
    """
    matched = set()
    for stage in stages:
        for pattern in STAGE_PATTERNS.get(stage, []):
            regex = re.compile(pattern, re.I)
            matched.update(name for name in cols if regex.search(name))
    return sorted(matched)

# Process stages in line order; scenarios are defined by which stages they keep.
ALL_STAGES = [
    "blanking", "forklift_b", "press", "forklift_p", "warehouse",
    "assembly_cell", "forklift_a", "paint", "quality",
]


def _stages_without(*skipped: str) -> List[str]:
    """Return ALL_STAGES, in order, minus the stages named in *skipped*."""
    return [stage for stage in ALL_STAGES if stage not in skipped]


# Each scenario lists the stages whose columns are used ("include") and the
# stages whose columns are removed afterwards ("exclude").
SCENARIOS = [
    {"name": "All",                    "include": ALL_STAGES,                                           "exclude": []},
    {"name": "NoQuality",              "include": _stages_without("quality"),                           "exclude": []},
    {"name": "NoQualityPaint",         "include": _stages_without("quality", "paint"),                  "exclude": []},
    {"name": "NoQualityPaintAssembly", "include": _stages_without("quality", "paint", "assembly_cell"), "exclude": []},
    {"name": "BlankingPressOnly",      "include": ["blanking", "press"],                                "exclude": []},
    {"name": "BlankingOnly",           "include": ["blanking"],                                         "exclude": []},
]

# =====================
# 5) PREPROCESSORS
# =====================
def make_preprocessors(X: pd.DataFrame):
    """Build the two (unfitted) column preprocessors used by the model zoo.

    Numeric columns are median-imputed; the "scaled" variant additionally
    standardises them. Non-numeric columns are imputed with the most frequent
    value and one-hot encoded. Columns not listed are dropped.

    Returns
    -------
    (preprocess_plain, preprocess_scaled, num_features, cat_features)
    """
    num_features = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # OneHotEncoder: fall back through older constructor signatures.
    try:
        cat_ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.01, sparse_output=False)
    except TypeError:
        try:
            cat_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            cat_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    cat_tf = Pipeline([('imp', SimpleImputer(strategy='most_frequent')), ('ohe', cat_ohe)])

    def _assemble(numeric_pipeline):
        # Only register a branch when it has at least one column to work on.
        branches = []
        if num_features:
            branches.append(('num', numeric_pipeline, num_features))
        if cat_features:
            branches.append(('cat', cat_tf, cat_features))
        return ColumnTransformer(branches, remainder='drop')

    median_only = Pipeline([('imp', SimpleImputer(strategy='median'))])
    median_then_scale = Pipeline([('imp', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

    preprocess_plain = _assemble(median_only)
    preprocess_scaled = _assemble(median_then_scale)

    return preprocess_plain, preprocess_scaled, num_features, cat_features

# =====================
# 6) MODEL ZOO (+ Torch MLP)
# =====================
seed = RANDOM_SEED
def get_models():
    """Return a fresh ``{name: unfitted estimator}`` dict for one scenario.

    Linear models, a random forest and a scikit-learn MLP are always present;
    XGBoost / LightGBM / CatBoost are added only when their import succeeded.
    The boosters are configured with a large iteration budget because the
    caller is expected to supply a validation set for early stopping.
    """
    models: Dict[str, object] = {
        'LinearRegression': LinearRegression(),
        'RidgeCV': RidgeCV(alphas=np.logspace(-3,3,20)),
        'ElasticNetCV': ElasticNetCV(l1_ratio=[0.1,0.5,0.9], alphas=np.logspace(-3,1,10), max_iter=5000, cv=3, random_state=seed),
        'RandomForest': RandomForestRegressor(n_estimators=300, random_state=seed, n_jobs=-1)
    }
    if have_xgb:
        # 'hist' runs on every XGBoost build. The previous hard-coded
        # tree_method='gpu_hist' / predictor='gpu_predictor' fails on CPU-only
        # installs and is deprecated/removed in XGBoost 2.x. To train on a GPU
        # with XGBoost >= 2.0, add device='cuda' here.
        models['XGBoost'] = XGBRegressor(
            n_estimators=5000, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8,
            tree_method='hist',
            n_jobs=-1, random_state=seed
        )
    if have_lgbm:
        # LightGBM ignores `subsample` unless `subsample_freq` is > 0, so the
        # frequency is set explicitly to make the 0.8 row sampling effective.
        models['LightGBM'] = LGBMRegressor(n_estimators=5000, learning_rate=0.05,
                                           num_leaves=31, subsample=0.8, subsample_freq=1,
                                           colsample_bytree=0.8,
                                           random_state=seed)
    if have_cat:
        # use_best_model=True requires an eval_set to be passed to fit().
        models['CatBoost'] = CatBoostRegressor(iterations=5000, depth=6, learning_rate=0.05,
                                               loss_function='RMSE', random_seed=seed,
                                               verbose=PROGRESS_PERIOD, od_type='Iter',
                                               od_wait=100, use_best_model=True)
    # Keep a classic ML baseline: scikit-learn MLP
    models['SkMLP(epochs=50)'] = MLPRegressor(hidden_layer_sizes=(128,64), activation='relu', solver='adam',
                                              max_iter=50, early_stopping=True, validation_fraction=0.15,
                                              n_iter_no_change=10, random_state=seed, verbose=False)
    return models

# Models that are sensitive to feature scale and therefore get the
# standardising preprocessor.
NEED_SCALING = {'LinearRegression','RidgeCV','ElasticNetCV','SkMLP(epochs=50)'}

def build_pipe(model_name: str, estimator, preprocess_plain, preprocess_scaled):
    """Wrap *estimator* in a ``prep -> model`` pipeline.

    Returns the pipeline together with the preprocessor that was chosen
    (scaled for names in ``NEED_SCALING``, plain otherwise).
    """
    if model_name in NEED_SCALING:
        chosen_prep = preprocess_scaled
    else:
        chosen_prep = preprocess_plain
    steps = [('prep', chosen_prep), ('model', estimator)]
    return Pipeline(steps), chosen_prep

def evaluate(y_true, y_pred) -> Tuple[float,float,float]:
    """Return ``(MAE, RMSE, R2)`` of *y_pred* against *y_true* as plain floats."""
    raw_scores = (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
    mae, rmse, r2 = (float(score) for score in raw_scores)
    return mae, rmse, r2

def fit_accepts(estimator, param: str) -> bool:
    """Tell whether ``estimator.fit`` declares a parameter named *param*.

    Any problem while inspecting (no ``fit`` attribute, signature not
    introspectable, ...) is reported as ``False``.
    """
    try:
        return param in inspect.signature(estimator.fit).parameters
    except Exception:
        return False

# ---------- Torch MLP ----------
# The class statement evaluates `nn.Module` immediately, so it must only run
# when the optional torch import succeeded; otherwise the whole script would
# die with a NameError even though torch is meant to be optional.
if have_torch:
    class TorchMLP(nn.Module):
        """Fully connected regressor: in_dim -> 256 -> 128 -> 1 with ReLU activations."""

        def __init__(self, in_dim):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(in_dim, 256), nn.ReLU(),
                nn.Linear(256, 128), nn.ReLU(),
                nn.Linear(128, 1)
            )

        def forward(self, x):
            # Drop the trailing singleton dimension so the output is 1-D per batch.
            return self.net(x).squeeze(1)
else:
    class TorchMLP:
        """Placeholder used when PyTorch is not installed; cannot be instantiated."""

        def __init__(self, *args, **kwargs):
            raise ImportError("PyTorch is not installed; TorchMLP is unavailable.")

def prep_numeric_for_torch(X_train, X_valid, X_test, cols):
    """Prepare numeric features for the Torch model.

    Steps: select *cols*, turn +/-Inf into NaN, fill NaN with the *training*
    median of each column, then standardise with a scaler fitted on the
    training split only.

    A column that is entirely NaN in the training split has no median; it is
    filled with 0.0 instead so that no NaN reaches the network (a single NaN
    input makes the loss, and then every weight, NaN).

    Returns
    -------
    (train_df, valid_df, test_df, scaler, cols)
        The three transformed frames keep the row index of their inputs but
        have positional (0..n-1) column labels; *scaler* is the fitted
        ``StandardScaler`` and *cols* the list of selected column names.
    """
    from sklearn.preprocessing import StandardScaler

    cols = list(cols)
    Xt, Xv, Xs = (
        part[cols].replace([np.inf, -np.inf], np.nan)
        for part in (X_train, X_valid, X_test)
    )

    # Training medians only: validation/test statistics must not leak in.
    med = Xt.median(numeric_only=True).fillna(0.0)
    Xt, Xv, Xs = Xt.fillna(med), Xv.fillna(med), Xs.fillna(med)

    scaler = StandardScaler()
    Xt_n = scaler.fit_transform(Xt)
    Xv_n = scaler.transform(Xv)
    Xs_n = scaler.transform(Xs)

    return (pd.DataFrame(Xt_n, index=X_train.index),
            pd.DataFrame(Xv_n, index=X_valid.index),
            pd.DataFrame(Xs_n, index=X_test.index),
            scaler, cols)

def train_torch_mlp(X_train, y_train, X_valid, y_valid, scenario_dir, max_epochs=50, lr=1e-3, batch=2048, seed=RANDOM_SEED):
    """Train ``TorchMLP`` with mini-batch AdamW and early stopping on validation RMSE.

    Parameters
    ----------
    X_train, X_valid : pd.DataFrame
        Numeric, already imputed/scaled features.
    y_train, y_valid : pd.Series
        Targets aligned positionally with the feature frames.
    scenario_dir : str
        Directory that receives ``model_TorchMLP.pt`` (created if missing).
    max_epochs, lr, batch, seed
        Training budget, learning rate, mini-batch size and RNG seed.

    Returns
    -------
    (model, info)
        The best model (on CPU) and ``{"best_valid_RMSE", "saved_path"}``;
        ``(None, {})`` when PyTorch is not available.
    """
    if not have_torch:
        return None, {}
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    Xtr = torch.tensor(X_train.astype(np.float32).values, device=device)
    ytr = torch.tensor(y_train.values.astype(np.float32), device=device)
    Xva = torch.tensor(X_valid.astype(np.float32).values, device=device)
    yva = torch.tensor(y_valid.values.astype(np.float32), device=device)

    model = TorchMLP(Xtr.shape[1]).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = nn.MSELoss()

    best_state = None
    best_rmse = float('inf')
    patience = 8
    wait = 0
    n_rows = Xtr.shape[0]

    for _epoch in range(1, max_epochs + 1):
        model.train()
        # Build the shuffle index on the same device as the data so batches are
        # gathered without a host-to-device copy.
        idx = torch.randperm(n_rows, device=device)
        for start in range(0, n_rows, batch):
            b = idx[start:start + batch]
            xb, yb = Xtr[b], ytr[b]
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        model.eval()
        with torch.no_grad():
            rmse = float(torch.sqrt(loss_fn(model(Xva), yva)).item())
        if rmse < best_rmse - 1e-4:
            best_rmse = rmse
            # Snapshot on CPU so the best weights survive further training.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    # Move to CPU *before* saving: a state_dict holding CUDA tensors cannot be
    # loaded on a CPU-only machine without an explicit map_location.
    model = model.to('cpu')
    os.makedirs(scenario_dir, exist_ok=True)
    torch_path = os.path.join(scenario_dir, "model_TorchMLP.pt")
    torch.save(model.state_dict(), torch_path)
    return model, {"best_valid_RMSE": best_rmse, "saved_path": torch_path}

# =====================
# 7) 시나리오별 실행 루프
# =====================
_BOOSTER_NAMES = {'XGBoost', 'LightGBM', 'CatBoost'}


def _booster_fit_kwargs(mname, mdl, prep, X_train, y_train, X_valid, y_valid):
    """Build the ``Pipeline.fit`` keyword arguments that give a booster its validation set.

    The pipeline only transforms the data passed to ``fit``; the validation
    set handed to the booster therefore has to be transformed separately with
    a clone of the preprocessor fitted on the same training data.
    Early-stopping options are passed through whichever route the installed
    library version supports.
    """
    prep_eval = clone(prep)
    prep_eval.fit(X_train, y_train)
    Xv_t = prep_eval.transform(X_valid)

    fit_kwargs = {}
    # CatBoost takes a single (X, y) tuple, the others a list of tuples.
    fit_kwargs['model__eval_set'] = (Xv_t, y_valid) if mname == 'CatBoost' else [(Xv_t, y_valid)]

    if mname == 'XGBoost':
        if fit_accepts(mdl, 'eval_metric'):
            fit_kwargs['model__eval_metric'] = 'rmse'
        if fit_accepts(mdl, 'early_stopping_rounds'):
            fit_kwargs['model__early_stopping_rounds'] = 100
        elif fit_accepts(mdl, 'callbacks'):
            try:
                import xgboost as xgb
                fit_kwargs['model__callbacks'] = [
                    xgb.callback.EarlyStopping(rounds=100, save_best=True),
                    xgb.callback.EvaluationMonitor(period=PROGRESS_PERIOD)
                ]
            except Exception as e:
                print(f"[WARN] XGBoost callbacks unavailable: {e}")
        else:
            # Newer XGBoost releases removed these arguments from fit(); they
            # are estimator parameters instead. Without this branch the model
            # would silently train all 5000 rounds with no early stopping.
            try:
                mdl.set_params(eval_metric='rmse', early_stopping_rounds=100)
            except Exception as e:
                print(f"[WARN] XGBoost early stopping not configured: {e}")
        if fit_accepts(mdl, 'verbose'):
            # An integer prints the metric every PROGRESS_PERIOD rounds
            # instead of on every single round.
            fit_kwargs['model__verbose'] = PROGRESS_PERIOD

    elif mname == 'LightGBM':
        if fit_accepts(mdl, 'early_stopping_rounds'):
            fit_kwargs['model__early_stopping_rounds'] = 100
        if fit_accepts(mdl, 'eval_metric'):
            fit_kwargs['model__eval_metric'] = 'rmse'
        if fit_accepts(mdl, 'callbacks'):
            try:
                import lightgbm as lgb
                fit_kwargs['model__callbacks'] = [
                    lgb.early_stopping(100, verbose=False),
                    lgb.log_evaluation(PROGRESS_PERIOD)
                ]
            except Exception as e:
                print(f"[WARN] LightGBM callbacks unavailable: {e}")
        if fit_accepts(mdl, 'verbose'):
            fit_kwargs['model__verbose'] = PROGRESS_PERIOD

    return fit_kwargs


def run_scenario(scn: Dict):
    """Train, evaluate and persist every model for one stage scenario.

    Parameters
    ----------
    scn : dict
        ``{"name": str, "include": [stage, ...], "exclude": [stage, ...]}``.

    Workflow
    --------
    1. Select the columns of the included stages (minus excluded stages) from
       the global ``X_all`` and drop rows whose target is missing.
    2. Remove features that do not vary and, when ``AUTO_DROP_LEAKS`` is set,
       the globally detected leakage suspects.
    3. Split 70/15/15 into train/validation/test: chronologically when a
       datetime column was detected, otherwise with grouped shuffling so that
       identical feature rows never straddle two splits.
    4. Fit every model from ``get_models()`` plus the optional Torch MLP; a
       model that fails is reported and skipped.
    5. Write metrics, leaderboard, best-model predictions/plots, the used and
       excluded feature lists and a ``config.json`` into
       ``OUT_DIR/scenarios/<name>/``.

    Returns None; all results are written to disk.
    """
    name = scn["name"]
    include_stages = scn.get("include", [])
    exclude_stages = scn.get("exclude", [])

    scenario_dir = os.path.join(OUT_DIR, "scenarios", name)
    os.makedirs(scenario_dir, exist_ok=True)

    # ----- Feature selection -----
    cols = list(X_all.columns)
    include_cols = cols_of_stages(cols, include_stages)
    if exclude_stages:
        exclude_cols = set(cols_of_stages(cols, exclude_stages))
        include_cols = [c for c in include_cols if c not in exclude_cols]

    Xscenario = X_all[include_cols].copy()
    y = pd.to_numeric(y_all, errors="coerce")

    # Rows without a usable target cannot be trained or scored on.
    mask = ~y.isna()
    Xscenario = Xscenario.loc[mask]
    y = y.loc[mask]

    if len(Xscenario.columns) == 0:
        print(f"[WARN] Scenario '{name}' has 0 features after selection.")
        return

    # Remove constant features. Numeric columns are kept when their standard
    # deviation is > 0; other columns when they have more than one distinct
    # value. (A describe()-based filter only sees numeric columns, which would
    # silently discard every categorical feature and fail outright when the
    # scenario has no numeric column.)
    num_std = Xscenario.select_dtypes(include=[np.number]).std()
    keep = []
    for c in Xscenario.columns:
        if c in num_std.index:
            varies = bool(num_std[c] > 0)  # NaN std compares False -> dropped
        else:
            varies = Xscenario[c].nunique(dropna=True) > 1
        if varies:
            keep.append(c)
    Xscenario = Xscenario[keep]
    kept = set(keep)
    dropped_in_scenario = [c for c in include_cols if c not in kept]

    # Remove leakage suspects
    scenario_suspects = [c for c in overall_suspects if c in Xscenario.columns]
    if AUTO_DROP_LEAKS and scenario_suspects:
        Xscenario = Xscenario.drop(columns=scenario_suspects)

    # The filters above may have removed everything that was selected.
    if Xscenario.shape[1] == 0:
        print(f"[WARN] Scenario '{name}' has 0 usable features after constant/leakage filtering.")
        return

    # ----- Split -----
    if dt_col is not None:
        # Chronological 70/15/15 split: the future never informs the past.
        order = df.loc[Xscenario.index, dt_col].sort_values().index
        Xs, ys = Xscenario.loc[order], y.loc[order]
        n = len(Xs)
        n_tr = int(n*0.70); n_va = int(n*0.15)
        X_train, y_train = Xs.iloc[:n_tr], ys.iloc[:n_tr]
        X_valid, y_valid = Xs.iloc[n_tr:n_tr+n_va], ys.iloc[n_tr:n_tr+n_va]
        X_test,  y_test  = Xs.iloc[n_tr+n_va:], ys.iloc[n_tr+n_va:]
    else:
        # Group rows by a hash of their feature values so exact duplicates
        # always land in the same split.
        X_fill = Xscenario.copy()
        for c in X_fill.columns:
            if X_fill[c].dtype.kind not in 'biufc':
                X_fill[c] = X_fill[c].astype(str)
        row_hash = pd.util.hash_pandas_object(X_fill.fillna('NA'), index=False).astype('int64')
        gss1 = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=RANDOM_SEED)
        idx_trv, idx_te = next(gss1.split(Xscenario, y, groups=row_hash))
        X_trv, y_trv, g_trv = Xscenario.iloc[idx_trv], y.iloc[idx_trv], row_hash.iloc[idx_trv]
        X_test, y_test = Xscenario.iloc[idx_te], y.iloc[idx_te]

        # 0.1765 of the remaining 85% is roughly 15% of the whole data set.
        gss2 = GroupShuffleSplit(n_splits=1, test_size=0.1765, random_state=RANDOM_SEED)
        idx_tr, idx_va = next(gss2.split(X_trv, y_trv, groups=g_trv))
        X_train, y_train = X_trv.iloc[idx_tr], y_trv.iloc[idx_tr]
        X_valid, y_valid = X_trv.iloc[idx_va], y_trv.iloc[idx_va]

    # ----- Preprocessors -----
    preprocess_plain, preprocess_scaled, _, _ = make_preprocessors(Xscenario)

    # ----- Models -----
    models = get_models()

    results = []
    trained_pipes: Dict[str, Pipeline] = {}
    preds_test = {}

    # ===== Train / evaluate loop =====
    for mname, mdl in models.items():
        print(f"\n[Scenario: {name}] ▶ Training {mname} ...")
        pipe, prep = build_pipe(mname, mdl, preprocess_plain, preprocess_scaled)

        try:
            fit_kwargs = {}
            if mname in _BOOSTER_NAMES:
                fit_kwargs = _booster_fit_kwargs(mname, mdl, prep, X_train, y_train, X_valid, y_valid)

            pipe.fit(X_train, y_train, **fit_kwargs)
            yv = pipe.predict(X_valid);  yt = pipe.predict(X_test)
            v_mae, v_rmse, v_r2 = evaluate(y_valid, yv)
            t_mae, t_rmse, t_r2 = evaluate(y_test,  yt)

            results.append({'model': mname,'val_MAE': v_mae,'val_RMSE': v_rmse,'val_R2': v_r2,
                            'test_MAE': t_mae,'test_RMSE': t_rmse,'test_R2': t_r2})
            trained_pipes[mname] = pipe
            preds_test[mname]  = yt

            joblib.dump(pipe, os.path.join(scenario_dir, f"model_{mname}.pkl"))
            print(f"[OK] {mname}: test_R2={t_r2:.4f}")

        except Exception as e:
            print(f"[SKIP] {mname}: {e}")
            results.append({'model': mname, 'error': str(e)})

    # ===== Torch MLP =====
    torch_info = {}
    if have_torch:
        num_cols_only = Xscenario.select_dtypes(include=[np.number]).columns
        if len(num_cols_only) > 0:
            # Same policy as the sklearn models: a failing model is reported
            # and skipped instead of aborting the whole scenario.
            try:
                Xtr_n, Xva_n, Xte_n, scaler, num_list = prep_numeric_for_torch(X_train, X_valid, X_test, num_cols_only)
                torch_model, torch_info = train_torch_mlp(
                    Xtr_n, y_train.reset_index(drop=True),
                    Xva_n, y_valid.reset_index(drop=True),
                    scenario_dir, max_epochs=50, lr=1e-3, batch=2048
                )
                if torch_model is not None:
                    torch_model.eval()
                    with torch.no_grad():
                        yv = torch_model(torch.tensor(Xva_n.astype(np.float32).values)).cpu().numpy()
                        yt = torch_model(torch.tensor(Xte_n.astype(np.float32).values)).cpu().numpy()
                    yv = np.nan_to_num(yv, nan=np.nanmedian(yv))
                    yt = np.nan_to_num(yt, nan=np.nanmedian(yt))
                    v_mae, v_rmse, v_r2 = evaluate(y_valid, yv)
                    t_mae, t_rmse, t_r2 = evaluate(y_test,  yt)
                    results.append({'model':'TorchMLP','val_MAE':v_mae,'val_RMSE':v_rmse,'val_R2':v_r2,
                                    'test_MAE':t_mae,'test_RMSE':t_rmse,'test_R2':t_r2,'note': torch_info})
                    preds_test['TorchMLP']  = yt
                    joblib.dump({'scaler': scaler, 'num_cols': num_list}, os.path.join(scenario_dir, "torch_preproc.pkl"))
            except Exception as e:
                print(f"[SKIP] TorchMLP: {e}")
                results.append({'model': 'TorchMLP', 'error': str(e)})
        else:
            print("[Torch] No numeric columns in scenario; skipping TorchMLP.")

    # ===== Save metrics / leaderboard =====
    metrics = pd.DataFrame(results)
    metric_cols = ['model','val_MAE','val_RMSE','val_R2','test_MAE','test_RMSE','test_R2','error','note']
    metrics = metrics.reindex(columns=[c for c in metric_cols if c in metrics.columns])
    if 'test_R2' in metrics.columns:
        metrics = metrics.sort_values(by='test_R2', ascending=False, na_position='last')
    for c in ['val_MAE','val_RMSE','val_R2','test_MAE','test_RMSE','test_R2']:
        if c in metrics.columns:
            metrics[c] = pd.to_numeric(metrics[c], errors='coerce').round(4)
    metrics_path = os.path.join(scenario_dir, 'metrics.csv')
    metrics.to_csv(metrics_path, index=False)

    # Models that produced a test score, best first (order inherited from the sort above).
    if 'test_R2' in metrics.columns:
        scored = metrics.dropna(subset=['test_R2'])
    else:
        scored = metrics.iloc[0:0]

    # Leaderboard PNG
    if not scored.empty:
        rank = scored.sort_values('test_R2', ascending=True)
        plt.figure(figsize=(7, max(3, 0.6*len(rank))))
        plt.barh(rank['model'], rank['test_R2'])
        plt.xlabel('Test R²'); plt.title(f'Leaderboard — {name}')
        save_fig(os.path.join(scenario_dir, 'leaderboard_testR2.png'))

    # ===== Best-model artefacts (predictions / plots) =====
    # NOTE(review): the "best" model is chosen by its *test* R², which makes the
    # reported test score of the winner optimistic; consider ranking by val_R2.
    if not scored.empty:
        best_name = scored.iloc[0]['model']
        y_pred_test = preds_test[best_name]
        out_df = pd.DataFrame({'y_true': y_test.values, 'y_pred': y_pred_test}, index=X_test.index)
        if dt_col is not None:
            out_df[dt_col] = df.loc[X_test.index, dt_col]
        out_df.to_csv(os.path.join(scenario_dir, f"best_predictions__{best_name}.csv"), index=False)

        # Scatter plot
        plt.figure(figsize=(6,4))
        plt.scatter(y_test, y_pred_test, alpha=0.6)
        plt.xlabel("y_true (test)"); plt.ylabel("y_pred (test)")
        plt.title(f"Scatter | {best_name} | {name} | R2={r2_score(y_test, y_pred_test):.3f}")
        save_fig(os.path.join(scenario_dir, f"scatter_{best_name}.png"))

        # Residuals
        resid = y_test.values - y_pred_test
        plt.figure(figsize=(8,3))
        plt.plot(resid)
        plt.xlabel("test index"); plt.ylabel("residual")
        plt.title(f"Residuals | {best_name} | {name}")
        save_fig(os.path.join(scenario_dir, f"residuals_{best_name}.png"))

        # Time plot (only when a datetime column exists)
        if dt_col is not None:
            tvals = df.loc[X_test.index, dt_col]
            order = np.argsort(pd.to_datetime(tvals).values.astype('datetime64[ns]'))
            plt.figure(figsize=(8,3))
            plt.plot(tvals.values[order], y_test.values[order], label='Actual')
            plt.plot(tvals.values[order], y_pred_test[order], label='Pred')
            plt.legend(); plt.xlabel(str(dt_col)); plt.ylabel(TARGET_COL)
            plt.title(f"Actual vs Pred over time | {best_name} | {name}")
            save_fig(os.path.join(scenario_dir, f"timeplot_{best_name}.png"))

    # ===== Permutation importance per model (optional) =====
    ENABLE_PERM_IMPORTANCE = False   # set to True to compute and save importances

    if ENABLE_PERM_IMPORTANCE:
        for mname, pipe in trained_pipes.items():
            try:
                perm = permutation_importance(pipe, X_valid, y_valid,
                                              n_repeats=1,   # keep the cost minimal
                                              n_jobs=1,      # serial instead of parallel
                                              random_state=RANDOM_SEED)
                # The whole pipeline is permuted, so importances refer to the
                # raw input columns, not to the one-hot encoded ones.
                perm_means = pd.Series(perm.importances_mean, index=X_valid.columns)
                perm_means = perm_means.sort_values(ascending=False)
                perm_means.rename('importance_mean').rename_axis('feature').reset_index().to_csv(
                    os.path.join(scenario_dir, f"perm_importance_{mname}.csv"), index=False)
                print(f"[INFO] {mname} PermImp: top5 -> {perm_means.head().to_dict()}")
            except Exception as e:
                print(f"[FAIL] PermImp({mname}): {e}")

    # ===== Save used / excluded columns and the configuration =====
    used_cols = list(Xscenario.columns)
    used_set = set(used_cols)
    excluded_cols = [c for c in X_all.columns if c not in used_set]
    pd.DataFrame({'feature': used_cols}).to_csv(os.path.join(scenario_dir, 'features_used.csv'), index=False)
    pd.DataFrame({'feature': excluded_cols}).to_csv(os.path.join(scenario_dir, 'features_excluded.csv'), index=False)

    config = {
        "scenario": name,
        "include_stages": include_stages,
        "exclude_stages": exclude_stages,
        "dt_col": dt_col,
        "target": TARGET_COL,
        "random_seed": RANDOM_SEED,
        "rows_total": int(len(X_all)),
        "rows_used": int(len(Xscenario)),
        "suspect_leakage_dropped": scenario_suspects if AUTO_DROP_LEAKS else [],
        "dropped_prefilter": dropped_cols,
        "dropped_in_scenario_constant": dropped_in_scenario
    }
    with open(os.path.join(scenario_dir, 'config.json'), 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

    print(f"[DONE] Scenario '{name}' — outputs at: {scenario_dir}")

# =====================
# 8) MAIN LOOP — run all scenarios
# =====================
if __name__ == "__main__":
    # Run every configured scenario in order, framed by a banner for readability.
    separator = "=" * 60
    for scn in SCENARIOS:
        print(f"\n{separator}")
        print(f"Running scenario: {scn['name']}")
        print(separator)
        run_scenario(scn)
    print("\nAll scenarios done. Check 'outputs/scenarios/<name>/' folders.")

Prefilter: 22 columns dropped. Report → outputs/prefilter_report.csv

Running scenario: All

[Scenario: All] ▶ Training LinearRegression ...
[OK] LinearRegression: test_R2=0.9996

[Scenario: All] ▶ Training RidgeCV ...
[OK] RidgeCV: test_R2=0.9993

[Scenario: All] ▶ Training ElasticNetCV ...
[OK] ElasticNetCV: test_R2=0.9991

[Scenario: All] ▶ Training RandomForest ...
[OK] RandomForest: test_R2=1.0000

[Scenario: All] ▶ Training SkMLP(epochs=50) ...
[OK] SkMLP(epochs=50): test_R2=0.9998
[DONE] Scenario 'All' — outputs at: outputs\scenarios\All

Running scenario: NoQuality

[Scenario: NoQuality] ▶ Training LinearRegression ...
[OK] LinearRegression: test_R2=0.9993

[Scenario: NoQuality] ▶ Training RidgeCV ...
[OK] RidgeCV: test_R2=0.9993

[Scenario: NoQuality] ▶ Training ElasticNetCV ...
[OK] ElasticNetCV: test_R2=0.9993

[Scenario: NoQuality] ▶ Training RandomForest ...
[OK] RandomForest: test_R2=0.9992

[Scenario: NoQuality] ▶ Training SkMLP(epochs=50) ...
[OK] SkMLP(epochs=50): test