In [3]:
# -*- coding: utf-8 -*-
"""
Bottleneck → Throughput 분석 (row-wise max Queue = bottleneck)
- 입력: Final Results Extended.csv (하루 단위)
- 병목 레이블: 각 행에서 Queue 컬럼 중 최댓값의 컬럼명 (bottleneck_col_top1) + 그 값 (bottleneck_val_top1)
- 타깃: c_TotalProducts (없으면 SKU 관련 합산)
- 회귀: RidgeCV, RandomForest 로 성능/중요도 비교
- 분류(옵션): 생산량 상위 10% vs 나머지 (LogisticRegression)
- 산출물: outputs/bottleneck/* 일괄 저장
"""

import os, re, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score, classification_report
import joblib

# --------------------------
# 0) CONFIG
# --------------------------
FILE_PATH  = r"Final Results Extended.csv"   # ← 경로만 바꿔서 사용
TARGET_COL = "c_TotalProducts"               # 기본 타깃
OUT_DIR    = os.path.join("outputs", "bottleneck"); os.makedirs(OUT_DIR, exist_ok=True)

# Queue 컬럼 인식 규칙(필요시 보강: r"_Queue$" 등)
QUEUE_PATTERN = re.compile(r"_Queue", re.I)

# --------------------------
# 1) LOAD
# --------------------------
df = pd.read_csv(FILE_PATH, low_memory=False)

# 타깃 보정
if TARGET_COL not in df.columns:
    sku_cols = [c for c in df.columns if re.search(r"sku", c, re.I)]
    if not sku_cols:
        raise ValueError("타깃 c_TotalProducts가 없고 SKU 관련 컬럼도 없어 타깃을 만들 수 없습니다.")
    df["__target__"] = df[sku_cols].sum(axis=1)
    TARGET = "__target__"
else:
    TARGET = TARGET_COL

y = pd.to_numeric(df[TARGET], errors="coerce")

# --------------------------
# 2) 병목 레이블링 (row-wise idxmax on Queue columns)
# --------------------------
queue_cols = [c for c in df.columns if QUEUE_PATTERN.search(c)]
if not queue_cols:
    raise ValueError("Queue 컬럼을 찾지 못했습니다. QUEUE_PATTERN을 수정하세요.")

# 숫자화 & NaN 무한값 방어
Q = df[queue_cols].apply(pd.to_numeric, errors="coerce").replace([np.inf,-np.inf], np.nan)
# 모두 NaN인 행은 0으로 대체해 idxmax가 동작하도록(대체 룰은 필요시 조정)
Q_filled = Q.fillna(-np.inf)

bottleneck_idx = Q_filled.idxmax(axis=1)
bottleneck_val = Q.max(axis=1, numeric_only=True)

df["bottleneck_col_top1"] = bottleneck_idx
df["bottleneck_val_top1"] = bottleneck_val

# 저장(레이블만)
df[["bottleneck_col_top1","bottleneck_val_top1"]].to_csv(
    os.path.join(OUT_DIR, "bottleneck_labels.csv"), index=False
)

# --------------------------
# 3) 병목별 그룹 통계 (생산량, 결함률 있으면 같이)
# --------------------------
stats = (
    df.groupby("bottleneck_col_top1")[TARGET]
      .agg(["mean","median","std","count"])
      .reset_index()
      .sort_values("mean", ascending=False)
)
overall_mean = float(np.nanmean(y))
stats["lift_vs_overall"] = stats["mean"] - overall_mean
stats.to_csv(os.path.join(OUT_DIR, "bottleneck_impact_summary.csv"), index=False)

# (옵션) 불량 관련 컬럼 자동 탐색
defect_cols = [c for c in df.columns if re.search(r"(defect|scrap|ng|fail|reject)", c, re.I)]
if defect_cols:
    # 수치형만 평균/중앙/건수
    dsum = df.groupby("bottleneck_col_top1")[defect_cols].mean(numeric_only=True)
    dsum.columns = [f"DEFECT_MEAN__{c}" for c in dsum.columns]
    stats2 = df.groupby("bottleneck_col_top1")[defect_cols].median(numeric_only=True)
    stats2.columns = [f"DEFECT_MEDIAN__{c}" for c in stats2.columns]
    defect_summary = pd.concat([stats.set_index("bottleneck_col_top1"), dsum, stats2], axis=1).reset_index()
    defect_summary.to_csv(os.path.join(OUT_DIR, "bottleneck_impact_with_defects.csv"), index=False)

# 시각화: 병목 빈도 & 평균 생산량
plt.figure(figsize=(10,4))
stats_freq = df["bottleneck_col_top1"].value_counts().sort_values(ascending=False)
stats_freq.plot(kind="bar")
plt.title("Bottleneck Frequency by Stage")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "plot_bottleneck_frequency.png"), dpi=150); plt.close()

plt.figure(figsize=(10,4))
plt.bar(stats["bottleneck_col_top1"], stats["mean"])
plt.xticks(rotation=45, ha="right")
plt.ylabel("Mean Throughput")
plt.title("Mean Throughput by Bottleneck")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "plot_mean_throughput_by_bottleneck.png"), dpi=150); plt.close()

# --------------------------
# 4) 피처에 병목 포함 → 회귀 성능 확인
# --------------------------
# X 기본: 원본에서 타깃 제외
X_base = df.drop(columns=[TARGET])

# 병목 레이블 One-Hot
bn_label = df["bottleneck_col_top1"].fillna("None").astype(str).to_frame()
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
bn_ohe = ohe.fit_transform(bn_label)
bn_ohe_cols = [f"BN_{c}" for c in ohe.categories_[0]]

X = pd.concat(
    [X_base.reset_index(drop=True).drop(columns=["bottleneck_col_top1"], errors="ignore"),
     pd.DataFrame(bn_ohe, columns=bn_ohe_cols)],
    axis=1
)

# 타깃 NaN 제거
mask = ~y.isna()
X, y2 = X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True)

# 간단 전처리: 수치/범주 자동 파이프
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

num_tf = Pipeline([("imp", SimpleImputer(strategy="median")),
                   ("scaler", StandardScaler())])
cat_tf = Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                   ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])

transformers = []
if num_cols: transformers.append(("num", num_tf, num_cols))
if cat_cols: transformers.append(("cat", cat_tf, cat_cols))
preprocess = ColumnTransformer(transformers, remainder="drop")

# 모델 1: RidgeCV
ridge = Pipeline([("prep", preprocess),
                  ("model", RidgeCV(alphas=np.logspace(-3,3,20)))])
# 모델 2: RandomForest
rf = Pipeline([("prep", preprocess),
               ("model", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))])

X_tr, X_te, y_tr, y_te = train_test_split(X, y2, test_size=0.2, random_state=42)

def eval_reg(pipe, name):
    pipe.fit(X_tr, y_tr)
    pred = pipe.predict(X_te)
    out = {
        "MAE": float(mean_absolute_error(y_te, pred)),
        "RMSE": float(np.sqrt(mean_squared_error(y_te, pred))),
        "R2": float(r2_score(y_te, pred))
    }
    # 저장
    joblib.dump(pipe, os.path.join(OUT_DIR, f"model_{name}.pkl"))
    pd.DataFrame({"y_true": y_te, "y_pred": pred}).to_csv(
        os.path.join(OUT_DIR, f"predictions_{name}.csv"), index=False
    )
    return out

ridge_m = eval_reg(ridge, "RidgeCV_wBN")
rf_m    = eval_reg(rf,    "RF_wBN")

with open(os.path.join(OUT_DIR, "regression_metrics.json"), "w") as f:
    json.dump({"RidgeCV": ridge_m, "RandomForest": rf_m}, f, indent=2)

print("[REG] RidgeCV:", ridge_m)
print("[REG] RandomForest:", rf_m)

# RF 피처 중요도(전처리 후 feature_names 추출)
def get_feature_names(prep: ColumnTransformer) -> list:
    names = []
    # num
    if "num" in prep.named_transformers_:
        try:
            names += list(prep.named_transformers_["num"].named_steps["imp"].get_feature_names_out(num_cols))
        except Exception:
            names += num_cols
    # cat
    if "cat" in prep.named_transformers_:
        cat_ohe = prep.named_transformers_["cat"].named_steps.get("ohe")
        cat_raw = prep.transformers_[1][2] if len(prep.transformers_)>1 else []
        try:
            names += list(cat_ohe.get_feature_names_out(cat_raw))
        except Exception:
            names += cat_raw
    return names

# 다시 학습해서 중요도 뽑기(동일 파이프라인)
rf.fit(X_tr, y_tr)
rf_feat_names = get_feature_names(rf.named_steps["prep"])
if hasattr(rf.named_steps["model"], "feature_importances_"):
    importances = rf.named_steps["model"].feature_importances_
    imp_df = pd.DataFrame({"feature": rf_feat_names, "importance": importances}).sort_values("importance", ascending=False)
    imp_df.to_csv(os.path.join(OUT_DIR, "feature_importance_RF_with_BN.csv"), index=False)

    # 병목 원핫만 보기
    imp_bn = imp_df[imp_df["feature"].str.startswith("cat__ohe__bottleneck_col_top1_") | imp_df["feature"].str.startswith("BN_")]
    if not imp_bn.empty:
        imp_bn.to_csv(os.path.join(OUT_DIR, "feature_importance_RF__BN_only.csv"), index=False)

        plt.figure(figsize=(8, max(3, 0.4*len(imp_bn.head(20)))))
        top = imp_bn.head(20)
        plt.barh(top["feature"][::-1], top["importance"][::-1])
        plt.title("RF Importance — Bottleneck One-Hot (Top20)")
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR, "plot_RF_importance_BN_top20.png"), dpi=150); plt.close()

# --------------------------
# 5) (옵션) 분류: 상위 10% 생산량 vs 나머지
# --------------------------
q90 = float(np.nanpercentile(y2, 90))
cls_y = (y2 >= q90).astype(int)

logreg = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None))
])
logreg.fit(X_tr, (y_tr >= q90).astype(int))
proba = logreg.predict_proba(X_te)[:,1]
auc = roc_auc_score((y_te >= q90).astype(int), proba)
report = classification_report((y_te >= q90).astype(int), (proba>=0.5).astype(int), output_dict=True)

with open(os.path.join(OUT_DIR, "classification_report_top10pct.json"), "w") as f:
    json.dump({"ROC_AUC": auc, "report": report}, f, indent=2)

print(f"[CLS] Top10% ROC-AUC={auc:.3f}")

# --------------------------
# 저장물 안내
# --------------------------
print("\n[Saved files]")
print(" - bottleneck_labels.csv")
print(" - bottleneck_impact_summary.csv", "(+ bottleneck_impact_with_defects.csv if available)")
print(" - plot_bottleneck_frequency.png")
print(" - plot_mean_throughput_by_bottleneck.png")
print(" - model_RidgeCV_wBN.pkl / predictions_RidgeCV_wBN.csv")
print(" - model_RF_wBN.pkl / predictions_RF_wBN.csv")
print(" - regression_metrics.json")
print(" - feature_importance_RF_with_BN.csv")
print(" - feature_importance_RF__BN_only.csv (if any)")
print(" - plot_RF_importance_BN_top20.png (if any)")
print(" - classification_report_top10pct.json")

[REG] RidgeCV: {'MAE': 20.44161865650393, 'RMSE': 24.24855960188918, 'R2': 0.9995147380981386}
[REG] RandomForest: {'MAE': 21.26736662269958, 'RMSE': 25.295925496573954, 'R2': 0.9994719130311346}
[CLS] Top10% ROC-AUC=1.000

[Saved files]
 - bottleneck_labels.csv
 - bottleneck_impact_summary.csv (+ bottleneck_impact_with_defects.csv if available)
 - plot_bottleneck_frequency.png
 - plot_mean_throughput_by_bottleneck.png
 - model_RidgeCV_wBN.pkl / predictions_RidgeCV_wBN.csv
 - model_RF_wBN.pkl / predictions_RF_wBN.csv
 - regression_metrics.json
 - feature_importance_RF_with_BN.csv
 - feature_importance_RF__BN_only.csv (if any)
 - plot_RF_importance_BN_top20.png (if any)
 - classification_report_top10pct.json


In [5]:
# -*- coding: utf-8 -*-
"""
Bottleneck → Throughput + Lead-time 영향 분석 (스테이지 대표 Queue 기반)
- 입력: Final Results Extended.csv
- 병목 레이블: 행 단위로 스테이지별 대표 Queue(예: max(Blanking SKU1~4, Blanking_Queue)) 중 최댓값의 '스테이지'를 bottleneck으로 지정
- 타깃: c_TotalProducts (없으면 SKU 합산)
- 리드타임: SKU1~4 × {VA,NVA,Transport,Wait,Other}_Time (유연한 정규식으로 탐색)
- 모델: RidgeCV, RandomForest (성능/중요도 비교)
- 산출물: outputs/bottleneck/* 에 CSV/PNG/PKL 저장
"""

import os, re, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score, classification_report
import joblib

# --------------------------
# 0) CONFIG
# --------------------------
FILE_PATH  = r"Final Results Extended.csv"   # ← 경로 조정
TARGET_COL = "c_TotalProducts"
OUT_DIR    = os.path.join("outputs", "bottleneck"); os.makedirs(OUT_DIR, exist_ok=True)

# 병목 선택 기준: 'raw' = 원시 Queue값으로 비교, 'zscore' = 스테이지별 z-score로 비교(스케일 보정)
BN_SELECTION = "raw"   # 'raw' or 'zscore'
# Blanking 대표값 산출 전략: 'prefer_sku' = SKU1~4만 사용(요약컬럼이 있어도 무시), 'max_all' = SKU들과 요약컬럼 모두의 최대
BLANKING_STRATEGY = "prefer_sku"  # 'prefer_sku' or 'max_all'

RANDOM_SEED = 42
TEST_SIZE = 0.20
TOP_K_IMP = 20  # 중요도 그래프 top-K

# --------------------------
# 1) LOAD
# --------------------------
df = pd.read_csv(FILE_PATH, low_memory=False)

# 타깃 설정
if TARGET_COL not in df.columns:
    sku_cols_for_target = [c for c in df.columns if re.search(r"\bsku\d+\b", c, re.I)]
    if not sku_cols_for_target:
        raise ValueError("타깃 c_TotalProducts가 없고 SKU 관련 컬럼도 없어 타깃을 만들 수 없습니다.")
    df["__target__"] = df[sku_cols_for_target].apply(pd.to_numeric, errors="coerce").sum(axis=1)
    TARGET = "__target__"
else:
    TARGET = TARGET_COL

y_full = pd.to_numeric(df[TARGET], errors="coerce")

# --------------------------
# 2) 스테이지별 대표 Queue 구성
#    (현재 컬럼명을 최대한 포괄적으로 매칭)
# --------------------------
def pick_cols(patterns: List[str]) -> List[str]:
    out = []
    for col in df.columns:
        lc = col.lower()
        if "queue" not in lc:  # queue가 아닌 건 제외
            continue
        if any(re.search(p, col, re.I) for p in patterns):
            out.append(col)
    return out

# 후보 패턴(필요시 보강)
STAGE_PATTERNS: Dict[str, List[str]] = {
    # Blanking: SKU별 + 요약(Blanking_Queue) 모두 포괄
    "blanking_sku": [r"^Blanking[_-]SKU[1-4]_Queue$"],
    "blanking_all": [r"^Blanking_Queue$"],
    # Press
    "press":        [r"^Press\d*_Queue$","^Press\d+_Queue$","^Press_?\d*_?Queue$","^Press"],  # 느슨하게
    # Assembly(셀) - 있을 수도/없을 수도
    "assembly":     [r"^Cell[1-4]_Queue$", r"^Assembly", r"^c_Cell", r"_Assembly_Queue$"],
    # Warehouse (Warehouse1_Queue, Warehouse_2_Queue 등 혼재를 커버)
    "warehouse":    [r"^Warehouse\d*_?Queue$", r"^Warehouse_?\d+_?Queue$","^Warehouse"],
    # Paint
    "paint":        [r"^Paint\d*_Queue$", r"^Paint"],
    # Quality
    "quality":      [r"^Quality_?Queue$","^Quality"],
    # Forklift (원하면 별도 스테이지로 보고 싶을 때)
    "forklift":     [r"^Forklift_.*_Queue$","^Forklift"]
}

# 실제 컬럼 맵핑
stage_cols: Dict[str, List[str]] = {
    "blanking": [], "press": [], "assembly": [], "warehouse": [], "paint": [], "quality": [], "forklift": []
}

# blanking은 SKU와 ALL을 구분해서 채운 뒤 전략 적용
blanking_sku_cols = pick_cols(STAGE_PATTERNS["blanking_sku"])
blanking_all_cols = pick_cols(STAGE_PATTERNS["blanking_all"])
if BLANKING_STRATEGY == "prefer_sku" and blanking_sku_cols:
    stage_cols["blanking"] = blanking_sku_cols
else:
    stage_cols["blanking"] = sorted(set(blanking_sku_cols + blanking_all_cols))

# 나머지 스테이지
for k in ["press","assembly","warehouse","paint","quality","forklift"]:
    stage_cols[k] = pick_cols(STAGE_PATTERNS[k])

# 스테이지별 대표값 frame 만들기
stage_rep = pd.DataFrame(index=df.index, columns=list(stage_cols.keys()), dtype=float)
for st, cols in stage_cols.items():
    if cols:
        vals = df[cols].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
        stage_rep[st] = vals.max(axis=1, numeric_only=True)
    else:
        stage_rep[st] = np.nan

# z-score 비교가 필요하면 스테이지별 정규화
if BN_SELECTION.lower() == "zscore":
    zrep = stage_rep.copy()
    for st in zrep.columns:
        mu = zrep[st].mean(skipna=True)
        sd = zrep[st].std(skipna=True)
        zrep[st] = (zrep[st] - mu) / (sd if (sd and sd != 0) else np.nan)
    comp_df = zrep
else:
    comp_df = stage_rep

# 행별 top1 스테이지/값 (전부 NaN 또는 <=0이면 None)
comp_filled = comp_df.fillna(-np.inf)               # 비교용(예: 여러 Queue 중 최대 찾기)
top_stage   = comp_filled.idxmax(axis=1)            # 행별 최대가 난 컬럼명

# lookup 대체: stage_rep에서 원시값(대표 Queue)을 안전하게 뽑기
# (전제: stage_rep의 index/columns 이 comp_df와 동일하거나 superset)
import numpy as np

row_idx = np.arange(len(stage_rep))
# top_stage(컬럼명)를 실제 열 인덱스로 변환(-1은 미존재)
col_idx = stage_rep.columns.get_indexer(top_stage)

vals = stage_rep.to_numpy().astype(float, copy=False)

# col_idx == -1(해당 열이 stage_rep에 없음) 대비
top_val = np.full(len(stage_rep), np.nan, dtype=float)
valid = col_idx >= 0
top_val[valid] = vals[row_idx[valid], col_idx[valid]]

# 안전 처리: 모두 NaN이었던 행 또는 비유효값(<=0, 비유한) → None/NaN
all_nan_mask = stage_rep.isna().all(axis=1)
nonpos_mask  = (~np.isfinite(top_val)) | (top_val <= 0)

bottleneck_stage = top_stage.mask(all_nan_mask | nonpos_mask, other="None")
bottleneck_value = pd.Series(top_val, index=stage_rep.index).mask(all_nan_mask | nonpos_mask, other=np.nan)

# 원본 df에 기록 (index 정렬 주의: 동일 index 기준)
df.loc[bottleneck_stage.index, "bottleneck_stage"] = bottleneck_stage
df.loc[bottleneck_value.index, "bottleneck_value"] = bottleneck_value

# 저장
df[["bottleneck_stage", "bottleneck_value"]].to_csv(
    os.path.join(OUT_DIR, "bottleneck_labels.csv"), index=False
)


# --------------------------
# 3) 병목별 그룹 통계
# --------------------------
stats = (
    df.groupby("bottleneck_stage")[TARGET]
      .agg(["mean","median","std","count"])
      .reset_index()
      .sort_values("mean", ascending=False)
)
overall_mean = float(np.nanmean(y_full))
stats["lift_vs_overall"] = stats["mean"] - overall_mean
stats.to_csv(os.path.join(OUT_DIR, "bottleneck_impact_summary.csv"), index=False)

# (옵션) 불량 지표 평균/중앙 병합
defect_cols = [c for c in df.columns if re.search(r"(defect|scrap|ng|fail|reject)", c, re.I)]
if defect_cols:
    d_mean = df.groupby("bottleneck_stage")[defect_cols].mean(numeric_only=True)
    d_mean.columns = [f"DEFECT_MEAN__{c}" for c in d_mean.columns]
    d_med  = df.groupby("bottleneck_stage")[defect_cols].median(numeric_only=True)
    d_med.columns  = [f"DEFECT_MEDIAN__{c}" for c in d_med.columns]
    summary_def = pd.concat([stats.set_index("bottleneck_stage"), d_mean, d_med], axis=1).reset_index()
    summary_def.to_csv(os.path.join(OUT_DIR, "bottleneck_impact_with_defects.csv"), index=False)

# 시각화
plt.figure(figsize=(10,4))
(df["bottleneck_stage"].value_counts().sort_values(ascending=False)).plot(kind="bar")
plt.title("Bottleneck Frequency (by Stage)")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "plot_bottleneck_frequency.png"), dpi=150); plt.close()

plt.figure(figsize=(10,4))
plt.bar(stats["bottleneck_stage"], stats["mean"])
plt.xticks(rotation=45, ha="right")
plt.ylabel("Mean Throughput")
plt.title("Mean Throughput by Bottleneck Stage")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "plot_mean_throughput_by_bottleneck.png"), dpi=150); plt.close()

# --------------------------
# 4) SKU별 리드타임 컬럼 수집
#    (SKU1~4 × {VA,NVA,Transport,Wait,Other}_Time)
# --------------------------
SKUS = [1,2,3,4]
TIME_KINDS = ["VA","NVA","Transport","Wait","Other"]

def find_leadtime_cols(df_cols) -> Dict[Tuple[int,str], List[str]]:
    """(sku, kind) -> 매칭된 리드타임 컬럼 목록"""
    mapping = {}
    for sku in SKUS:
        for kind in TIME_KINDS:
            # 예: "SKU1_VA_Time", "SKU1-VA Time", "sku1 va time" 등 다양한 표기 허용
            pat = re.compile(rf"\bSKU[_\- ]?{sku}\b.*\b{kind}\b.*\bTime\b", re.I)
            cols = [c for c in df_cols if pat.search(c)]
            mapping[(sku, kind)] = cols
    return mapping

lead_cols_map = find_leadtime_cols(df.columns)
lead_cols_flat = sorted({c for cols in lead_cols_map.values() for c in cols})
lead_df = df[lead_cols_flat].apply(pd.to_numeric, errors="coerce") if lead_cols_flat else pd.DataFrame(index=df.index)

# 보조 파생: SKU별 총 Time, 비율
ratio_cols = []
if not lead_df.empty:
    for sku in SKUS:
        per_sku_cols = {kind: lead_cols_map[(sku, kind)] for kind in TIME_KINDS}
        # kind별 합(여러 컬럼이 잡힌 경우 합산)
        sums = {}
        for kind, cols in per_sku_cols.items():
            if cols:
                sums[kind] = df[cols].apply(pd.to_numeric, errors="coerce").sum(axis=1)
            else:
                sums[kind] = pd.Series(np.nan, index=df.index)
        total = sum(sums.values())
        df[f"SKU{sku}_Total_Time"] = total
        # 비율 컬럼 추가
        for kind in TIME_KINDS:
            df[f"SKU{sku}_{kind}_Ratio"] = sums[kind] / total.replace({0: np.nan})
            ratio_cols.append(f"SKU{sku}_{kind}_Ratio")

# --------------------------
# 5) 모델 입력 구축
#    - 병목 스테이지 원핫 (BN_)
#    - 리드타임 원본 + 파생(총/비율) 포함
# --------------------------
# 병목 스테이지 원핫
bn_label = df["bottleneck_stage"].fillna("None").astype(str).to_frame()
try:
    ohe_bn = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe_bn = OneHotEncoder(handle_unknown="ignore", sparse=False)
bn_ohe_mat = ohe_bn.fit_transform(bn_label)
bn_ohe_cols = [f"BN_{c}" for c in ohe_bn.categories_[0]]
X_bn = pd.DataFrame(bn_ohe_mat, columns=bn_ohe_cols, index=df.index)

# 리드타임 피처(원본 + 파생)
X_lt_parts = []
if not lead_df.empty:
    X_lt_parts.append(lead_df)              # 원본 리드타임
if ratio_cols:
    X_lt_parts.append(df[ratio_cols])       # 비율
total_cols = [c for c in df.columns if re.fullmatch(r"SKU[1-4]_Total_Time", c)]
if total_cols:
    X_lt_parts.append(df[total_cols])       # 총합
X_lt = pd.concat(X_lt_parts, axis=1) if X_lt_parts else pd.DataFrame(index=df.index)

# 최종 설계행렬 2종: (A) 병목만, (B) 병목+리드타임
mask = ~y_full.isna()
y = y_full.loc[mask].reset_index(drop=True)
X_A = X_bn.loc[mask].reset_index(drop=True)
X_B = pd.concat([X_bn, X_lt], axis=1).loc[mask].reset_index(drop=True)

# 전처리 파이프 (수치만 존재하면 간단히 스케일만)
def make_preprocess(X: pd.DataFrame):
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]
    num_tf = Pipeline([("imp", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
    if cat_cols:
        try:
            cat_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            cat_ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        cat_tf = Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", cat_ohe)])
        prep = ColumnTransformer([("num", num_tf, num_cols), ("cat", cat_tf, cat_cols)], remainder="drop")
    else:
        prep = ColumnTransformer([("num", num_tf, num_cols)], remainder="drop")
    return prep

pre_A = make_preprocess(X_A)
pre_B = make_preprocess(X_B)

# --------------------------
# 6) 회귀: A(병목만) vs B(병목+리드타임)
# --------------------------
def eval_regression(X: pd.DataFrame, y: pd.Series, tag: str):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)
    pre = make_preprocess(X)

    ridge = Pipeline([("prep", pre), ("model", RidgeCV(alphas=np.logspace(-3,3,20)))])
    rf    = Pipeline([("prep", pre), ("model", RandomForestRegressor(n_estimators=400, random_state=RANDOM_SEED, n_jobs=-1))])

    results = {}
    for name, pipe in [("RidgeCV", ridge), ("RandomForest", rf)]:
        pipe.fit(X_tr, y_tr)
        pred = pipe.predict(X_te)
        out = {
            "MAE": float(mean_absolute_error(y_te, pred)),
            "RMSE": float(np.sqrt(mean_squared_error(y_te, pred))),
            "R2": float(r2_score(y_te, pred))
        }
        results[name] = out
        # 저장
        joblib.dump(pipe, os.path.join(OUT_DIR, f"model_{name}_{tag}.pkl"))
        pd.DataFrame({"y_true": y_te, "y_pred": pred}).to_csv(
            os.path.join(OUT_DIR, f"predictions_{name}_{tag}.csv"), index=False
        )
    # 메트릭 저장
    with open(os.path.join(OUT_DIR, f"regression_metrics_{tag}.json"), "w") as f:
        json.dump(results, f, indent=2)
    return results, (X_tr, X_te, y_tr, y_te), pre

res_A, split_A, preA = eval_regression(X_A, y, "BNonly")
res_B, split_B, preB = eval_regression(X_B, y, "BNplusLT")

print("[REG] A: BN only =>", res_A)
print("[REG] B: BN + LeadTime =>", res_B)

# --------------------------
# 7) 중요도/영향도: 리드타임 중심
#    - RandomForest permutation importance 대체: 트리 feature_importances_
#    - Ridge 계수(표준화 후) 절대값
# --------------------------
def export_feature_lists(pre: ColumnTransformer, X_cols: List[str]) -> List[str]:
    """ColumnTransformer를 통과한 최종 feature 이름을 최대한 추정"""
    names = []
    for name_, trans, cols in pre.transformers_:
        if name_ == "remainder" and trans == "drop":
            continue
        if hasattr(trans, "named_steps"):
            # 수치 파이프
            if "imp" in trans.named_steps and hasattr(trans.named_steps["imp"], "get_feature_names_out"):
                try:
                    names += list(trans.named_steps["imp"].get_feature_names_out(cols))
                    continue
                except Exception:
                    pass
            # 범주 + OHE
            if "ohe" in trans.named_steps and hasattr(trans.named_steps["ohe"], "get_feature_names_out"):
                try:
                    names += list(trans.named_steps["ohe"].get_feature_names_out(cols))
                    continue
                except Exception:
                    pass
        # fallback
        if isinstance(cols, list):
            names += list(cols)
        else:
            names += [cols]
    return names

def rf_importance(pipe: Pipeline, out_csv: str, top_png: str, top_k:int=TOP_K_IMP):
    rf = pipe.named_steps.get("model")
    pre = pipe.named_steps.get("prep")
    if not hasattr(rf, "feature_importances_"):
        return None
    feat_names = export_feature_lists(pre, [])
    imp = pd.Series(rf.feature_importances_, index=feat_names).sort_values(ascending=False)
    imp.to_csv(out_csv, header=["importance"])
    # 그래프
    top = imp.head(top_k)
    plt.figure(figsize=(8, max(3, 0.4*len(top))))
    plt.barh(top.index[::-1], top.values[::-1])
    plt.title("RandomForest Feature Importances (Top {})".format(top_k))
    plt.tight_layout()
    plt.savefig(top_png, dpi=150); plt.close()
    return imp

def ridge_coeff(pipe: Pipeline, out_csv: str):
    # 표준화 이후 계수 → 절대값이 클수록 영향 큼
    model = pipe.named_steps.get("model")
    pre   = pipe.named_steps.get("prep")
    if not hasattr(model, "coef_"):
        return None
    feat_names = export_feature_lists(pre, [])
    coefs = pd.Series(np.ravel(model.coef_), index=feat_names)
    coefs.to_csv(out_csv, header=["coef"])
    return coefs

# BN+LT 모델 중요도 내보내기
ridgeB = joblib.load(os.path.join(OUT_DIR, "model_RidgeCV_BNplusLT.pkl"))
rfB    = joblib.load(os.path.join(OUT_DIR, "model_RandomForest_BNplusLT.pkl"))

imp_rf_B = rf_importance(
    rfB,
    os.path.join(OUT_DIR, "feature_importance_RF_BNplusLT.csv"),
    os.path.join(OUT_DIR, "plot_feature_importance_RF_BNplusLT_top{}.png".format(TOP_K_IMP)),
    top_k=TOP_K_IMP
)
coef_ridge_B = ridge_coeff(
    ridgeB,
    os.path.join(OUT_DIR, "ridge_coeff_BNplusLT.csv")
)

# 리드타임 관련 항목만 필터링해서 별도 저장/힛맵
def is_lead_feature(name: str) -> bool:
    return bool(re.search(r"(SKU[1-4].*(VA|NVA|Transport|Wait|Other).*Time)|(_Ratio$)|(_Total_Time$)", name, re.I))

if imp_rf_B is not None:
    imp_lt = imp_rf_B[imp_rf_B.index.map(is_lead_feature)]
    imp_lt.to_csv(os.path.join(OUT_DIR, "feature_importance_RF__LeadTime_only.csv"), header=["importance"])

    # SKU × Kind 피벗(합산 중요도)
    rows = []
    for feat, val in imp_lt.items():
        m = re.search(r"(SKU[1-4]).*(VA|NVA|Transport|Wait|Other)", feat, re.I)
        if m:
            sku = m.group(1).upper()
            kind = m.group(2).capitalize()
            rows.append((sku, kind, float(val)))
    if rows:
        lt_imp_df = pd.DataFrame(rows, columns=["SKU","Kind","importance"])
        pivot = lt_imp_df.pivot_table(index="SKU", columns="Kind", values="importance", aggfunc="sum").fillna(0.0)
        pivot.to_csv(os.path.join(OUT_DIR, "leadtime_importance_pivot_RF.csv"))

        # 히트맵(단색 막대 대체)
        plt.figure(figsize=(7,4))
        # 간단 heatmap 구현 (matplotlib)
        im = plt.imshow(pivot.values, aspect="auto")
        plt.xticks(ticks=np.arange(len(pivot.columns)), labels=pivot.columns, rotation=45, ha="right")
        plt.yticks(ticks=np.arange(len(pivot.index)), labels=pivot.index)
        plt.colorbar(im, fraction=0.046, pad=0.04)
        plt.title("Lead-time Importance (RF, summed)")
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR, "plot_leadtime_importance_RF_heatmap.png"), dpi=150); plt.close()

# 상관 분석(보조 인사이트)
if not X_lt.empty:
    corr_rows = []
    for c in X_lt.columns:
        x = pd.to_numeric(X_lt[c], errors="coerce")
        m = (~x.isna()) & (~y_full.isna())
        if m.sum() > 3:
            r = np.corrcoef(x[m], y_full[m])[0,1]
        else:
            r = np.nan
        corr_rows.append({"feature": c, "pearson_r": r})
    pd.DataFrame(corr_rows).sort_values("pearson_r", ascending=False)\
        .to_csv(os.path.join(OUT_DIR, "leadtime_vs_throughput_corr.csv"), index=False)

# --------------------------
# 8) (옵션) 분류: 상위 10% 생산량 여부
# --------------------------
q90 = float(np.nanpercentile(y, 90))
y_bin = (y >= q90).astype(int)
X_trB, X_teB, y_trB, y_teB = train_test_split(X_B, y_bin, test_size=TEST_SIZE, random_state=RANDOM_SEED)

pre_cls = make_preprocess(X_B)
logreg = Pipeline([("prep", pre_cls), ("clf", LogisticRegression(max_iter=300, class_weight="balanced"))])
logreg.fit(X_trB, y_trB)
proba = logreg.predict_proba(X_teB)[:,1]
auc = roc_auc_score(y_teB, proba)
report = classification_report(y_teB, (proba>=0.5).astype(int), output_dict=True)
with open(os.path.join(OUT_DIR, "classification_report_top10pct_BNplusLT.json"), "w") as f:
    json.dump({"ROC_AUC": auc, "report": report}, f, indent=2)
print(f"[CLS] BN+LT Top10% ROC-AUC={auc:.3f}")

# --------------------------
# 9) 저장물 안내
# --------------------------
print("\n[Saved files @ {}]".format(OUT_DIR))
print(" - bottleneck_labels.csv")
print(" - bottleneck_impact_summary.csv", "(+ bottleneck_impact_with_defects.csv if available)")
print(" - plot_bottleneck_frequency.png")
print(" - plot_mean_throughput_by_bottleneck.png")
print(" - regression_metrics_BNonly.json (Ridge/RF)")
print(" - regression_metrics_BNplusLT.json (Ridge/RF)")
print(" - model_RidgeCV_BNonly.pkl / predictions_RidgeCV_BNonly.csv")
print(" - model_RandomForest_BNonly.pkl / predictions_RandomForest_BNonly.csv")
print(" - model_RidgeCV_BNplusLT.pkl / predictions_RidgeCV_BNplusLT.csv")
print(" - model_RandomForest_BNplusLT.pkl / predictions_RandomForest_BNplusLT.csv")
print(" - feature_importance_RF_BNplusLT.csv")
print(" - plot_feature_importance_RF_BNplusLT_top{}.png".format(TOP_K_IMP))
print(" - ridge_coeff_BNplusLT.csv")
print(" - feature_importance_RF__LeadTime_only.csv (if any)")
print(" - leadtime_importance_pivot_RF.csv / plot_leadtime_importance_RF_heatmap.png (if any)")
print(" - leadtime_vs_throughput_corr.csv (if any)")
print(" - classification_report_top10pct_BNplusLT.json")

[REG] A: BN only => {'RidgeCV': {'MAE': 880.413304862572, 'RMSE': 1091.3276455144644, 'R2': 0.01708804409174336}, 'RandomForest': {'MAE': 880.4143148294148, 'RMSE': 1091.3277300986306, 'R2': 0.017087891729083537}}
[REG] B: BN + LeadTime => {'RidgeCV': {'MAE': 880.413304862572, 'RMSE': 1091.3276455144644, 'R2': 0.01708804409174336}, 'RandomForest': {'MAE': 880.4143148294147, 'RMSE': 1091.3277300986306, 'R2': 0.017087891729083537}}
[CLS] BN+LT Top10% ROC-AUC=0.553

[Saved files @ outputs\bottleneck]
 - bottleneck_labels.csv
 - bottleneck_impact_summary.csv (+ bottleneck_impact_with_defects.csv if available)
 - plot_bottleneck_frequency.png
 - plot_mean_throughput_by_bottleneck.png
 - regression_metrics_BNonly.json (Ridge/RF)
 - regression_metrics_BNplusLT.json (Ridge/RF)
 - model_RidgeCV_BNonly.pkl / predictions_RidgeCV_BNonly.csv
 - model_RandomForest_BNonly.pkl / predictions_RandomForest_BNonly.csv
 - model_RidgeCV_BNplusLT.pkl / predictions_RidgeCV_BNplusLT.csv
 - model_RandomForest_B