In [1]:
# ============================================
# 0) 드롭인 패치 셀 (경량 모드)
# ============================================
# 본 셀은 1번 셀 아래에 두고 실행. 이후 코드는 그대로 둬도 경량 모드로 동작.
import os, re, gc, json, math, warnings
import numpy as np
import pandas as pd

# ---- 글로벌 스위치 ----
SPEED_MODE         = True
ENABLE_PLOTS       = False
ROW_SAMPLE_MAX     = 250_000
COL_KEEP_PAT       = r"(?:_Queue|_Util|^c_Cell|^c_TotalProducts$|^SKU\d+_(?:VA|Wait|Transport|LeadTime|Total))"
TOP_SKU_K          = 3
MAX_BNECK_ONEHOT   = 20
ELAS_FEATURES_TOPK = 8
ELAS_SAMPLE_N      = 5000
USE_PDP            = False
USE_FAST_FD        = True
SKIP_PERM_IMPORT   = True
RIDGE_TEST_SIZE    = 0.25
RF_ESTIMATORS      = 60
RF_MAX_DEPTH       = 12
RF_NJOBS           = -1
USE_HGBR_FIRST     = True
SKIP_DEFECT_BLOCK  = True
INCLUDE_MUSD_IN_LT = False
INCLUDE_MUSD_IN_PR = False
WHATIF_SCENARIOS   = {"Blanking_Queue": -50.0, "Warehouse1_Queue": -50.0, "Quality_Util": +5.0}

def _speed_log(*a):
    if SPEED_MODE: print(*a)

def savefig(path: str):
    # 경량 저장: show 없이 저장만, 닫고 GC
    if not ENABLE_PLOTS:
        return
    import matplotlib.pyplot as plt
    try:
        plt.tight_layout()
    except Exception:
        pass
    plt.savefig(path, bbox_inches="tight", dpi=140)
    plt.close()
    gc.collect()

def to_numeric_df(d: pd.DataFrame) -> pd.DataFrame:
    # float32 일괄 변환 + NA 채움(경량)
    for c in d.columns:
        d.loc[:, c] = pd.to_numeric(d[c], errors="coerce", downcast="float")
    d = d.fillna(0.0)
    d = d.infer_objects(copy=False)
    return d

def _apply_df_speed_filters(df: pd.DataFrame) -> pd.DataFrame:
    if not SPEED_MODE: return df
    # 열 필터
    if COL_KEEP_PAT:
        keep_re = re.compile(COL_KEEP_PAT, re.IGNORECASE)
        keep_cols = [c for c in df.columns if keep_re.search(c)]
        for must in ["Time_Now"]:
            if must in df.columns and must not in keep_cols:
                keep_cols.append(must)
        df = df[keep_cols]
        _speed_log(f"[SPEED] kept {len(keep_cols)} columns by pattern.")
    # 행 샘플링
    if ROW_SAMPLE_MAX and len(df) > ROW_SAMPLE_MAX:
        df = df.sample(ROW_SAMPLE_MAX, random_state=42).sort_index()
        _speed_log(f"[SPEED] row-sampled to {ROW_SAMPLE_MAX} rows.")
    # dtype
    df = to_numeric_df(df)
    gc.collect()
    return df

# ---- 모델 공통 ----
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def metrics_dict(y_true, y_pred) -> dict:
    mae  = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2   = float(r2_score(y_true, y_pred))
    return {"MAE": round(mae,3), "RMSE": round(rmse,3), "R2": round(r2,4)}

def train_ridge(X, y, alpha=1.0, random_state=42, test_size=None):
    from sklearn.linear_model import Ridge
    if test_size is None:
        test_size = RIDGE_TEST_SIZE if SPEED_MODE else 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    reg = Ridge(alpha=alpha, random_state=random_state)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    m = metrics_dict(y_test, y_pred)
    coef = pd.Series(reg.coef_, index=X.columns).sort_values(ascending=False)
    std_x = X_test.std(axis=0).replace(0, np.nan)
    std_y = np.std(y_test) if np.std(y_test) > 0 else np.nan
    beta_std = (coef * (std_x / std_y)).sort_values(ascending=False)
    return reg, (X_train, X_test, y_train, y_test), m, coef, beta_std

def train_fast_tree(X, y, random_state=42):
    """HistGradientBoostingRegressor 우선, 실패 시 경량 RF 폴백."""
    try:
        if USE_HGBR_FIRST:
            from sklearn.experimental import enable_hist_gradient_boosting  # noqa
            from sklearn.ensemble import HistGradientBoostingRegressor
            model = HistGradientBoostingRegressor(
                max_depth=8, learning_rate=0.1, max_iter=150, random_state=random_state
            )
            model.fit(X, y)
            score = model.score(X, y)
            _speed_log("[SPEED] Using HistGradientBoostingRegressor.")
            return model, None, score, None, None
    except Exception as e:
        _speed_log("[SPEED] HGBR unavailable → fallback to RF.", e)

    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(
        n_estimators=RF_ESTIMATORS, max_depth=RF_MAX_DEPTH,
        n_jobs=RF_NJOBS, random_state=random_state
    )
    rf.fit(X, y)
    fi = pd.Series(rf.feature_importances_, index=X.columns)
    pi = None
    if not SKIP_PERM_IMPORT:
        from sklearn.inspection import permutation_importance
        pi_ = permutation_importance(rf, X, y, n_repeats=3, random_state=random_state, n_jobs=RF_NJOBS)
        pi = pd.Series(pi_.importances_mean, index=X.columns)
    score = rf.score(X, y)
    return rf, None, score, fi, pi

# 기존 train_rf 덮어쓰기
def train_rf(X, y, n_repeats=0, random_state=42):
    return train_fast_tree(X, y, random_state=random_state)

# ---- 피처/병목 One-Hot 제한 ----
#time_like_regex = re.compile(r"(?:_Time|_sec|_LeadTime|_Total)", re.IGNORECASE)
time_like_regex = re.compile(r'(?:_Time|_sec|_LeadTime|_Total|_VA_(?:dev|z))', re.IGNORECASE)

def build_features_for_leadtime(df_, exclude_cols, add_bneck_onehot=True, top_bneck_list=None, max_bneck=None):
    if max_bneck is None:
        max_bneck = MAX_BNECK_ONEHOT
    cols = [c for c in df_.columns if c not in exclude_cols and c != "Time_Now" and not time_like_regex.search(c)]
    X = to_numeric_df(df_[cols]) if cols else pd.DataFrame(index=df_.index)
    if add_bneck_onehot and "_bneck" in df_.columns:
        b_series = df_["_bneck"].astype(str)
        if top_bneck_list is not None:
            b_series = b_series.where(b_series.isin(top_bneck_list), "Other")
        topN = b_series.value_counts().nlargest(max_bneck).index
        b_series = b_series.where(b_series.isin(topN), "Other")
        D = pd.get_dummies(b_series, prefix="BNECK", dtype="float32")
        X = pd.concat([X, D], axis=1)
    return X

def build_features_for_production(df_, exclude_cols):
    cols = [c for c in df_.columns if c not in exclude_cols and c != "Time_Now" and not c.startswith("c_Cell") and c != "c_TotalProducts"]
    return to_numeric_df(df_[cols]) if cols else pd.DataFrame(index=df_.index)

# ---- 탄력도(FD) 파라미터 ----
FEATURES_TOP_K = ELAS_FEATURES_TOPK
ONLY_QUEUE_UTIL = True
SAMPLE_N = ELAS_SAMPLE_N
USE_FDP_FAST = USE_FAST_FD

def limit_top_k_skus(sku_ids, sku_targets, k=TOP_SKU_K):
    if not SPEED_MODE or not k or k >= len(sku_ids):
        return sku_ids
    sums = []
    for sid in sku_ids:
        pr = sku_targets[sid]["production"]
        sums.append((sid, float(np.nansum(pr.values)) if hasattr(pr, "values") else float(np.nansum(pr))))
    sums = sorted(sums, key=lambda x: x[1], reverse=True)
    top = [sid for sid, _ in sums[:k]]
    _speed_log(f"[SPEED] limit SKUs → top {k}: {top}")
    return top

def get_scenarios(X_cols):
    sc = {}
    for k, v in WHATIF_SCENARIOS.items():
        if k in X_cols:
            sc[k] = v
    return sc

def should_skip_defect_block(): return SPEED_MODE and SKIP_DEFECT_BLOCK
def musd_flags():
    return (INCLUDE_MUSD_IN_LT if not SPEED_MODE else False), (INCLUDE_MUSD_IN_PR if not SPEED_MODE else False)

In [2]:
# ============================================
# 1) 설정 · 로드 · 공통 유틸
# ============================================
# -*- coding: utf-8 -*-
import os, re, json, math, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Dict, List, Tuple, Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge

CSV_PATH = "./Final Results Extended.csv"   # 필요시 변경
OUT_DIR  = "./outputs"
os.makedirs(OUT_DIR, exist_ok=True)

pd.set_option("display.max_columns", 160)
pd.set_option("display.width", 160)

# ---- 데이터 로드 ----
df = pd.read_csv(CSV_PATH, low_memory=False)
df = _apply_df_speed_filters(df)  # ⬅️ 경량 필터 적용
print("=== Dataset shape ===", df.shape)
display(df.head(3))

def topk(s: pd.Series, k=20):
    return s.head(k) if len(s) > k else s

def ensure_dir(path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

[SPEED] kept 66 columns by pattern.
=== Dataset shape === (132676, 66)


Unnamed: 0,Blanking_Util,Blanking_SKU1_Queue,Blanking_SKU2_Queue,Blanking_SKU3_Queue,Blanking_SKU4_Queue,Press1_Util,Press2_Util,Press3_Util,Press4_Util,Press1_Queue,Press2_Queue,Press3_Queue,Press4_Queue,Cell1_Util,Cell2_Util,Cell3_Util,Cell4_Util,Cell1_Queue,Cell2_Queue,Cell3_Queue,Cell4_Queue,Warehouse1_Queue,Warehouse_2_Queue,Warehouse_3_Queue,Warehouse_4_Queue,c_Cell1_SKU1,c_Cell1__SKU2,c_Cell1__SKU3,c_Cell1__SKU4,c_Cell2__SKU1,c_Cell2__SKU2,c_Cell2__SKU3,c_Cell2__SKU4,c_Cell3__SKU1,c_Cell3__SKU2,c_Cell3__SKU3,c_Cell3__SKU4,c_Cell4__SKU1,c_Cell4__SKU2,c_Cell4__SKU3,c_Cell4__SKU4,Paint1_Util,Paint2_Util,Quality_Util,Paint1_Queue,Paint2_Queue,Quality_Queue,Forklift_Util,Forklift_Blanking_Queue,Forklift_Press_Queue,Forklift_Assembly_Queue,c_TotalProducts,SKU1_VA_Time,SKU1_Transport_Time,SKU1_Wait_Time,SKU2_VA_Time,SKU2_Transport_Time,SKU2_Wait_Time,SKU3_VA_Time,SKU3_Transport_Time,SKU3_Wait_Time,SKU4_VA_Time,SKU4_Transport_Time,SKU4_Wait_Time,Blanking_Queue,Time_Now
0,0.846367,0.045715,0.056373,0.055737,0.035849,0.410297,0.434561,0.481388,0.399992,56.540291,65.273788,52.970764,47.964825,0.84887,0.673814,0.705012,0.819405,0.0,0.0,0.0,0.0,98.155571,19.580276,77.580864,75.124985,13967.0,10867.0,0.0,4382.0,0.0,2851.0,0.0,4334.0,0.0,903.0,4707.0,0.0,0.0,0.0,11743.0,4934.0,0.472565,0.504126,0.432738,0.0,0.0,47.361618,0.606002,157.256744,142.123962,137.454849,54377.0,1.525651,0.53681,0.587305,1.522853,0.532394,0.441645,1.525071,0.537617,0.45365,1.523338,0.536243,0.473453,58.36145,24
1,0.851097,0.051937,0.052934,0.038512,0.042248,0.455471,0.454445,0.387975,0.442986,74.77282,52.530064,49.303143,59.119987,0.87675,0.660415,0.629526,0.803448,0.0,0.0,0.0,0.0,162.63237,18.164795,64.827576,69.374908,15071.0,11180.0,0.0,3615.0,0.0,2932.0,0.0,4125.0,0.0,1346.0,3852.0,0.0,0.0,0.0,9556.0,7484.0,0.482683,0.501912,0.436583,0.0,0.0,48.266911,0.599976,166.488724,142.113663,140.757584,54857.0,1.525665,0.534195,0.684177,1.522836,0.535767,0.401993,1.524986,0.536764,0.473677,1.523344,0.534992,0.46438,62.830601,24
2,0.846115,0.05221,0.047499,0.043181,0.040979,0.496717,0.450816,0.417308,0.352829,67.283035,72.90107,39.198517,33.994083,0.895845,0.660288,0.654895,0.766823,0.0,0.0,0.0,0.0,335.40155,20.369415,46.12011,71.720192,16603.0,11042.0,0.0,2260.0,0.0,2904.0,0.0,4149.0,0.0,1198.0,4139.0,0.0,0.0,0.0,10282.0,5575.0,0.476897,0.492906,0.429997,0.0,0.0,48.906418,0.576669,159.188507,148.427689,142.653366,54052.0,1.525644,0.535403,0.898331,1.522821,0.535631,0.427815,1.525064,0.535925,0.42409,1.523403,0.535077,0.47533,59.365868,24


In [3]:
# ============================================
# 2) 컬럼 인식 · SKU 타깃 · 통합 리드타임(메모리 안전)
# ============================================
queue_cols = [c for c in df.columns if c.endswith("_Queue")]
util_cols  = [c for c in df.columns if c.endswith("_Util")]
cycle_cols = [c for c in df.columns if c.startswith("c_Cycle")]
cell_cols  = [c for c in df.columns if c.startswith("c_Cell")]

print("\n=== Column groups ===")
print(f"Queue cols  ({len(queue_cols)}): {queue_cols[:10]}{' ...' if len(queue_cols)>10 else ''}")
print(f"Util cols   ({len(util_cols)}):  {util_cols[:10]}{' ...' if len(util_cols)>10 else ''}")
print(f"Cycle cols  ({len(cycle_cols)}): {cycle_cols[:10]}")
print(f"Cell cols   ({len(cell_cols)}):  {cell_cols[:10]}{' ...' if len(cell_cols)>10 else ''}")

# === SKU 자동 탐지 ===
sku_ids = sorted({re.findall(r"SKU(\d+)_", c)[0] for c in df.columns if re.findall(r"SKU(\d+)_", c)})
print("\n=== Detected SKUs ===", sku_ids)

# 리드타임 컬럼 후보
LT_PATTERNS = dict(
    VA=["_VA_Time_sec", "_VA_Time", "_VA"],
    WAIT=["_Wait_Time_sec", "_Wait_Time", "_Wait"],
    LIFT=["_Transport_Time_sec", "_Transport_Time", "_Transport", "_Lift"],
    TOTAL=["_Total_sec", "_LeadTime_sec", "_LeadTime"]
)
def find_first_existing(base: str, suffixes: List[str]) -> Optional[str]:
    for s in suffixes:
        col = base + s
        if col in df.columns:
            return col
    return None

# === SKU별 리드타임/생산량 타깃 ===
sku_targets = {}
for sid in sku_ids:
    base = f"SKU{sid}"
    col_VA   = find_first_existing(base, LT_PATTERNS["VA"])
    col_WAIT = find_first_existing(base, LT_PATTERNS["WAIT"])
    col_LIFT = find_first_existing(base, LT_PATTERNS["LIFT"])
    col_TOT  = find_first_existing(base, LT_PATTERNS["TOTAL"])

    # 리드타임 타깃
    if col_TOT:
        lt_series = pd.to_numeric(df[col_TOT], errors="coerce").fillna(0.0).astype("float32")
        lt_name, lt_method = col_TOT, "TOTAL_col"
    else:
        VA   = pd.to_numeric(df[col_VA],   errors="coerce").fillna(0.0).astype("float32") if col_VA   else 0.0
        WAIT = pd.to_numeric(df[col_WAIT], errors="coerce").fillna(0.0).astype("float32") if col_WAIT else 0.0
        LIFT = pd.to_numeric(df[col_LIFT], errors="coerce").fillna(0.0).astype("float32") if col_LIFT else 0.0
        lt_series = pd.Series(VA, index=df.index) + pd.Series(WAIT, index=df.index) + pd.Series(LIFT, index=df.index)
        lt_series = lt_series.astype("float32")
        lt_name, lt_method = f"{base}_LeadTime_composed", "VA+WAIT+LIFT"

    # 생산량 타깃: 해당 SKU의 셀 출력 합
    cell_cols_sku = [c for c in cell_cols if re.search(fr"__SKU{sid}\b", c)]
    if cell_cols_sku:
        prod_series = to_numeric_df(df[cell_cols_sku]).sum(axis=1).astype("float32")
        prod_name   = f"Total_SKU{sid}"
    else:
        prod_series = pd.Series(0.0, index=df.index, dtype="float32")
        prod_name   = f"Total_SKU{sid}_zeros"

    sku_targets[sid] = dict(
        lead_time=lt_series, lead_time_name=lt_name, lead_time_method=lt_method,
        production=prod_series, production_name=prod_name, cell_cols=cell_cols_sku
    )

print("\n=== SKU targets overview (first 2) ===")
for sid in sku_ids[:2]:
    print(f"SKU{sid} LT: {sku_targets[sid]['lead_time_name']} ({sku_targets[sid]['lead_time_method']}), "
          f"PROD: {sku_targets[sid]['production_name']} | #cell_cols={len(sku_targets[sid]['cell_cols'])}")

# === 통합 타깃 ===
# 생산량(통합)
if "c_TotalProducts" in df.columns:
    total_prod = pd.to_numeric(df["c_TotalProducts"], errors="coerce").fillna(0.0).astype("float32")
else:
    total_prod = to_numeric_df(df[cell_cols]).sum(axis=1).astype("float32")

# 리드타임(생산량 가중평균, 메모리 안전)
N = len(df)
numerator = np.zeros(N, dtype=np.float32)  # Σ(lt_k * prod_k)
weights   = np.zeros(N, dtype=np.float32)  # Σ(prod_k)
sum_lt    = np.zeros(N, dtype=np.float32)  # fallback 평균 분자
k_count   = 0

for sid in sku_ids:
    lt = sku_targets[sid]["lead_time"].values
    pr = sku_targets[sid]["production"].values
    numerator += lt * pr
    weights   += pr
    sum_lt    += lt
    k_count   += 1

fallback_mean = (sum_lt / max(k_count, 1)).astype("float32")
with np.errstate(divide='ignore', invalid='ignore'):
    weighted_lt = np.where(weights > 0.0, numerator / weights, fallback_mean)
agg_lead_time = pd.Series(weighted_lt, index=df.index, name="Agg_LeadTime_weighted").astype("float32")

del numerator, weights, sum_lt; gc.collect()


=== Column groups ===
Queue cols  (23): ['Blanking_SKU1_Queue', 'Blanking_SKU2_Queue', 'Blanking_SKU3_Queue', 'Blanking_SKU4_Queue', 'Press1_Queue', 'Press2_Queue', 'Press3_Queue', 'Press4_Queue', 'Cell1_Queue', 'Cell2_Queue'] ...
Util cols   (13):  ['Blanking_Util', 'Press1_Util', 'Press2_Util', 'Press3_Util', 'Press4_Util', 'Cell1_Util', 'Cell2_Util', 'Cell3_Util', 'Cell4_Util', 'Paint1_Util'] ...
Cycle cols  (0): []
Cell cols   (16):  ['c_Cell1_SKU1', 'c_Cell1__SKU2', 'c_Cell1__SKU3', 'c_Cell1__SKU4', 'c_Cell2__SKU1', 'c_Cell2__SKU2', 'c_Cell2__SKU3', 'c_Cell2__SKU4', 'c_Cell3__SKU1', 'c_Cell3__SKU2'] ...

=== Detected SKUs === ['1', '2', '3', '4']

=== SKU targets overview (first 2) ===
SKU1 LT: SKU1_LeadTime_composed (VA+WAIT+LIFT), PROD: Total_SKU1 | #cell_cols=3
SKU2 LT: SKU2_LeadTime_composed (VA+WAIT+LIFT), PROD: Total_SKU2 | #cell_cols=4


0

In [4]:
# ============================================
# 3) 병목 라벨링 · 요약표(전체/통합) · SKU×병목 연계표
# ============================================
Q = to_numeric_df(df[[c for c in df.columns if c.endswith("_Queue")]])
if Q.shape[1] == 0:
    raise ValueError("Queue 계열 컬럼이 없습니다.")
df["_bneck"] = Q.idxmax(axis=1)  # 각 행 최대 Queue의 컬럼명

def compute_bneck_stats(target: pd.Series, name: str, out_csv: str):
    tmp = pd.DataFrame({"bneck": df["_bneck"].astype("category"), "val": target.astype("float32")})
    stats = tmp.groupby("bneck", observed=False).agg(
        top1_count=("bneck", "count"),
        mean_target=("val", "mean"),
        std_target=("val", "std")
    ).sort_values("top1_count", ascending=False).reset_index()
    stats["target_name"] = name
    stats.to_csv(ensure_dir(os.path.join(OUT_DIR, out_csv)), index=False)
    return stats

bneck_total_prod = compute_bneck_stats(total_prod, "TotalProduction", "bottleneck_totalProduction.csv")
bneck_agg_lt     = compute_bneck_stats(agg_lead_time, "AggLeadTime", "bottleneck_aggLeadTime.csv")

print("\n=== 생산량 기준 병목 빈도 ===")
display(bneck_total_prod.head(10))

print("\n=== 리드타임 기준 병목 빈도 ===")
display(bneck_agg_lt.head(10))

TOP_BNECKS = list(bneck_total_prod["bneck"].head(8).values)  # 빈도 상위 N
for sid in sku_ids:
    lt = sku_targets[sid]["lead_time"].astype("float32")
    tmp = pd.DataFrame({"bneck": df["_bneck"], "lt": lt})
    tmp["bneck_top"] = np.where(tmp["bneck"].isin(TOP_BNECKS), tmp["bneck"], "Other")
    grp = tmp.groupby("bneck_top")["lt"].agg(["count","mean","std"]).sort_values("count", ascending=False)
    grp.to_csv(ensure_dir(os.path.join(OUT_DIR, f"sku_bneck_effect_SKU{sid}.csv")))


=== 생산량 기준 병목 빈도 ===


Unnamed: 0,bneck,top1_count,mean_target,std_target,target_name
0,Warehouse1_Queue,77631,54858.582031,1095.462769,TotalProduction
1,Forklift_Blanking_Queue,53681,54578.960938,1087.814575,TotalProduction
2,Warehouse_3_Queue,1307,55135.089844,1061.024292,TotalProduction
3,Warehouse_4_Queue,36,56690.417969,646.384399,TotalProduction
4,Forklift_Press_Queue,9,54510.332031,932.098145,TotalProduction
5,Press4_Queue,6,55440.832031,665.971619,TotalProduction
6,Press2_Queue,5,55408.398438,477.534607,TotalProduction
7,Blanking_SKU3_Queue,1,0.0,,TotalProduction



=== 리드타임 기준 병목 빈도 ===


Unnamed: 0,bneck,top1_count,mean_target,std_target,target_name
0,Warehouse1_Queue,77631,2.513333,0.023924,AggLeadTime
1,Forklift_Blanking_Queue,53681,2.505023,0.017277,AggLeadTime
2,Warehouse_3_Queue,1307,2.577764,0.035883,AggLeadTime
3,Warehouse_4_Queue,36,2.56867,0.025048,AggLeadTime
4,Forklift_Press_Queue,9,2.501076,0.016981,AggLeadTime
5,Press4_Queue,6,2.546885,0.043715,AggLeadTime
6,Press2_Queue,5,2.540109,0.018334,AggLeadTime
7,Blanking_SKU3_Queue,1,0.0,,AggLeadTime


In [5]:
pd.set_option('future.no_silent_downcasting', True)

In [6]:
# ============================================
# 4) 피처 구성(누수 방지 + 병목 One-Hot) · 모델 학습(RF & Ridge)
# ============================================
import matplotlib.pyplot as plt

# (중요) SKU 루프 제한: 생산량 상위 K개만
sku_ids = limit_top_k_skus(sku_ids, sku_targets, k=TOP_SKU_K)

# === 통합(AGG) 모델 ===
# LeadTime(AGG)
X_lt_agg = build_features_for_leadtime(df, exclude_cols=[], add_bneck_onehot=True, top_bneck_list=TOP_BNECKS)
y_lt_agg = agg_lead_time.values
rf_lt_agg, splits_lt_agg, m_rf_lt_agg, fi_lt_agg, pi_lt_agg = train_rf(X_lt_agg, y_lt_agg)
rg_lt_agg, splits_rg_lt_agg, m_rg_lt_agg, coef_lt_agg, beta_std_lt_agg = train_ridge(X_lt_agg, y_lt_agg)

# 👍 탄력도 5번 셀에서 쓰기 위한 '학습 시 피처 리스트' 저장 (삭제 전에!)
FEATS_AG_LT = X_lt_agg.columns.tolist()

if fi_lt_agg is not None:
    fi_lt_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "fi_leadtime_AGG.csv")))
if pi_lt_agg is not None:
    pi_lt_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "pi_leadtime_AGG.csv")))
coef_lt_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "ridge_coef_leadtime_AGG.csv")))
beta_std_lt_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "ridge_betaStd_leadtime_AGG.csv")))
if ENABLE_PLOTS and fi_lt_agg is not None:
    plt.figure(figsize=(8,6)); topk(fi_lt_agg,20).iloc[::-1].plot(kind="barh")
    plt.title("RF FI (LeadTime) - AGG"); savefig(os.path.join(OUT_DIR,"fi_leadtime_AGG_top20.png"))

del X_lt_agg; gc.collect()

# Production(AGG)
X_pr_agg = build_features_for_production(df, exclude_cols=[])
y_pr_agg = total_prod.values
rf_pr_agg, splits_pr_agg, m_rf_pr_agg, fi_pr_agg, pi_pr_agg = train_rf(X_pr_agg, y_pr_agg)
rg_pr_agg, splits_rg_pr_agg, m_rg_pr_agg, coef_pr_agg, beta_std_pr_agg = train_ridge(X_pr_agg, y_pr_agg)

# 👍 탄력도 5번 셀에서 쓰기 위한 '학습 시 피처 리스트' 저장 (삭제 전에!)
FEATS_AG_PR = X_pr_agg.columns.tolist()

if fi_pr_agg is not None:
    fi_pr_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "fi_production_AGG.csv")))
if pi_pr_agg is not None:
    pi_pr_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "pi_production_AGG.csv")))
coef_pr_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "ridge_coef_production_AGG.csv")))
beta_std_pr_agg.to_csv(ensure_dir(os.path.join(OUT_DIR, "ridge_betaStd_production_AGG.csv")))
if ENABLE_PLOTS and fi_pr_agg is not None:
    plt.figure(figsize=(8,6)); topk(fi_pr_agg,20).iloc[::-1].plot(kind="barh")
    plt.title("RF FI (Production) - AGG"); savefig(os.path.join(OUT_DIR, "fi_production_AGG_top20.png"))

del X_pr_agg; gc.collect()

print("\n=== AGG Metrics ===")
print({"RF_LT": m_rf_lt_agg, "RG_LT": m_rg_lt_agg, "RF_PR": m_rf_pr_agg, "RG_PR": m_rg_pr_agg})

# === SKU별 모델 ===
per_sku_results = {}
for sid in sku_ids:
    # LeadTime
    y_lt = sku_targets[sid]["lead_time"].values
    exclude_lt = set(sku_targets[sid]["cell_cols"])
    exclude_lt |= {c for c in df.columns if c.startswith(f"SKU{sid}_") and time_like_regex.search(c)}
    X_lt = build_features_for_leadtime(df, list(exclude_lt), add_bneck_onehot=True, top_bneck_list=TOP_BNECKS)

    lead_pack = None
    if X_lt.shape[1] > 0:
        rf_lt, spl_lt, m_rf_lt, fi_lt, pi_lt = train_rf(X_lt, y_lt)
        rg_lt, spl_rg_lt, m_rg_lt, coef_lt, beta_std_lt = train_ridge(X_lt, y_lt)

        if fi_lt is not None:
            fi_lt.to_csv(ensure_dir(os.path.join(OUT_DIR, f"fi_leadtime_SKU{sid}.csv")))
        if pi_lt is not None:
            pi_lt.to_csv(ensure_dir(os.path.join(OUT_DIR, f"pi_leadtime_SKU{sid}.csv")))
        coef_lt.to_csv(ensure_dir(os.path.join(OUT_DIR, f"ridge_coef_leadtime_SKU{sid}.csv")))
        beta_std_lt.to_csv(ensure_dir(os.path.join(OUT_DIR, f"ridge_betaStd_leadtime_SKU{sid}.csv")))

        if ENABLE_PLOTS and fi_lt is not None:
            plt.figure(figsize=(8,6)); topk(fi_lt,20).iloc[::-1].plot(kind="barh")
            plt.title(f"RF FI (LeadTime) - SKU{sid}"); savefig(os.path.join(OUT_DIR, f"fi_leadtime_SKU{sid}_top20.png"))

        lead_pack = {
            "rf": (rf_lt, spl_lt, m_rf_lt),
            "rg": (rg_lt, spl_rg_lt, m_rg_lt),
            "feats": X_lt.columns.tolist()
        }

        del X_lt; gc.collect()
    else:
        print(f"[WARN] SKU{sid}: LeadTime features empty → skip")

    # Production
    y_pr = sku_targets[sid]["production"].values
    exclude_pr = set(sku_targets[sid]["cell_cols"])
    X_pr = build_features_for_production(df, list(exclude_pr))

    prod_pack = None
    if X_pr.shape[1] > 0:
        rf_pr, spl_pr, m_rf_pr, fi_pr, pi_pr = train_rf(X_pr, y_pr)
        rg_pr, spl_rg_pr, m_rg_pr, coef_pr, beta_std_pr = train_ridge(X_pr, y_pr)

        if fi_pr is not None:
            fi_pr.to_csv(ensure_dir(os.path.join(OUT_DIR, f"fi_production_SKU{sid}.csv")))
        if pi_pr is not None:
            pi_pr.to_csv(ensure_dir(os.path.join(OUT_DIR, f"pi_production_SKU{sid}.csv")))
        coef_pr.to_csv(ensure_dir(os.path.join(OUT_DIR, f"ridge_coef_production_SKU{sid}.csv")))
        beta_std_pr.to_csv(ensure_dir(os.path.join(OUT_DIR, f"ridge_betaStd_production_SKU{sid}.csv")))

        if ENABLE_PLOTS and fi_pr is not None:
            plt.figure(figsize=(8,6)); topk(fi_pr,20).iloc[::-1].plot(kind="barh")
            plt.title(f"RF FI (Production) - SKU{sid}"); savefig(os.path.join(OUT_DIR, f"fi_production_SKU{sid}_top20.png"))

        prod_pack = {
            "rf": (rf_pr, spl_pr, m_rf_pr),
            "rg": (rg_pr, spl_rg_pr, m_rg_pr),
            "feats": X_pr.columns.tolist()
        }

        del X_pr; gc.collect()

    per_sku_results[sid] = {"lead_time": lead_pack, "production": prod_pack}

[SPEED] limit SKUs → top 3: ['2', '3', '4']




[SPEED] Using HistGradientBoostingRegressor.
[SPEED] Using HistGradientBoostingRegressor.

=== AGG Metrics ===
{'RF_LT': 0.8874460359017269, 'RG_LT': {'MAE': 0.005, 'RMSE': 0.016, 'R2': 0.5805}, 'RF_PR': 0.9880771185836398, 'RG_PR': {'MAE': 120.036, 'RMSE': 167.721, 'R2': 0.9785}}
[SPEED] Using HistGradientBoostingRegressor.
[SPEED] Using HistGradientBoostingRegressor.
[SPEED] Using HistGradientBoostingRegressor.
[SPEED] Using HistGradientBoostingRegressor.
[SPEED] Using HistGradientBoostingRegressor.
[SPEED] Using HistGradientBoostingRegressor.


In [7]:
# ============================================
# 5) 간이 탄력도(민감도): splits 미사용, 전체 X 정렬 후 Top-N만 변동
# ============================================
from numpy.random import default_rng
RANDOM_STATE = 42
_rng = default_rng(RANDOM_STATE)

def pick_topN_features_from_X(X: pd.DataFrame, topk: int = ELAS_FEATURES_TOPK,
                              only_queue_util: bool = ONLY_QUEUE_UTIL) -> List[str]:
    cols = list(X.columns)
    if only_queue_util:
        base = [c for c in cols if c.endswith("_Queue") or c.endswith("_Util")]
        if base: return base[:topk]
    # fallback: 해석 편의상 BNECK 원-핫은 제외
    return [c for c in cols if not c.startswith("BNECK_")][:topk]

def align_X_for_model(X_built: pd.DataFrame, trained_feats: List[str]) -> pd.DataFrame:
    """학습 시 피처 이름/순서와 정확히 일치하도록 reindex."""
    return X_built.reindex(columns=trained_feats, fill_value=0.0)

def _suggest_h(x: np.ndarray) -> float:
    if x.size == 0: return 1.0
    q25, q75 = np.percentile(x, [25, 75]); iqr = q75 - q25
    if iqr > 0: return float(iqr * 0.05)
    std = np.std(x); return float(std * 0.1 if std > 0 else 1.0)

def fd_slope_mean(model, X_full: pd.DataFrame, feature: str, h: Optional[float]=None,
                  sample_n: int=ELAS_SAMPLE_N) -> Optional[float]:
    """X_full은 학습 피처 전부 포함(순서 동일). feature만 ±h로 변경."""
    if feature not in X_full.columns: return None
    n = len(X_full);  Xs = X_full
    if n == 0: return None
    if sample_n and sample_n < n:
        idx = _rng.choice(n, size=sample_n, replace=False)
        Xs = X_full.iloc[idx].copy()
    else:
        Xs = X_full.copy()
    x = Xs[feature].to_numpy()
    h = _suggest_h(x) if h is None else h
    if h == 0: return 0.0
    X_minus = Xs.copy(); X_plus = Xs.copy()
    X_minus[feature] = x - h
    X_plus[feature]  = x + h
    y_m = model.predict(X_minus).mean()
    y_p = model.predict(X_plus).mean()
    return float((y_p - y_m) / (2.0 * h))

def run_elasticity_block(tag: str, model, X_full: pd.DataFrame, features: List[str],
                         sample_n: int=ELAS_SAMPLE_N) -> pd.DataFrame:
    rows = []
    for f in features:
        s = fd_slope_mean(model, X_full, f, h=None, sample_n=sample_n)
        rows.append({"feature": f, "slope": s})
    out = pd.DataFrame(rows).dropna().sort_values("slope", ascending=False)
    out.to_csv(os.path.join(OUT_DIR, f"elasticity_{tag}.csv"), index=False)
    return out

elasticity_outputs = {}

# ---------- AGG (통합) ----------
# 학습 당시 피처 목록으로 정렬
X_base_lt_built = build_features_for_leadtime(df, exclude_cols=[], add_bneck_onehot=True, top_bneck_list=TOP_BNECKS)
X_full_lt = align_X_for_model(X_base_lt_built, FEATS_AG_LT)
feats_lt = pick_topN_features_from_X(X_full_lt, ELAS_FEATURES_TOPK, ONLY_QUEUE_UTIL)

X_base_pr_built = build_features_for_production(df, exclude_cols=[])
X_full_pr = align_X_for_model(X_base_pr_built, FEATS_AG_PR)
feats_pr = pick_topN_features_from_X(X_full_pr, ELAS_FEATURES_TOPK, ONLY_QUEUE_UTIL)

if feats_lt:
    el_lt = run_elasticity_block("leadtime_AGG_fast_lite", rf_lt_agg, X_full_lt, feats_lt, ELAS_SAMPLE_N)
    elasticity_outputs["leadtime_AGG"] = el_lt
    print("\n=== AGG 리드타임 탄력도 (상위 N개 피처) ===")
    display(el_lt.head(15))
else:
    print("[Elasticity] AGG 리드타임: 후보 피처 없음 → 건너뜀")

if feats_pr:
    el_pr = run_elasticity_block("production_AGG_fast_lite", rf_pr_agg, X_full_pr, feats_pr, ELAS_SAMPLE_N)
    elasticity_outputs["production_AGG"] = el_pr
    print("\n=== AGG 생산량 탄력도 (상위 N개 피처) ===")
    display(el_pr.head(15))
else:
    print("[Elasticity] AGG 생산량: 후보 피처 없음 → 건너뜀")

# ---------- SKU별 ----------
per_sku_elasticity = {}
for sid, packs in per_sku_results.items():
    per_sku_elasticity[sid] = {}

    # LeadTime
    if packs.get("lead_time") is not None:
        exclude_lt = set(sku_targets[sid]["cell_cols"])
        exclude_lt |= {c for c in df.columns if c.startswith(f"SKU{sid}_") and time_like_regex.search(c)}
        X_lt_sku_built = build_features_for_leadtime(df, list(exclude_lt), add_bneck_onehot=True, top_bneck_list=TOP_BNECKS)

        trained_feats_lt = packs["lead_time"]["feats"]
        X_lt_full = align_X_for_model(X_lt_sku_built, trained_feats_lt)
        cand_lt = pick_topN_features_from_X(X_lt_full, ELAS_FEATURES_TOPK, ONLY_QUEUE_UTIL)
        # 학습 당시 있었던 열만 남김
        cand_lt = [c for c in cand_lt if c in trained_feats_lt]
        if cand_lt:
            rf = packs["lead_time"]["rf"][0]
            out = run_elasticity_block(f"leadtime_SKU{sid}_fast_lite", rf, X_lt_full, cand_lt, ELAS_SAMPLE_N)
            per_sku_elasticity[sid]["lead_time"] = out

    # Production
    if packs.get("production") is not None:
        exclude_pr = set(sku_targets[sid]["cell_cols"])
        X_pr_sku_built = build_features_for_production(df, list(exclude_pr))

        trained_feats_pr = packs["production"]["feats"]
        X_pr_full = align_X_for_model(X_pr_sku_built, trained_feats_pr)
        cand_pr = pick_topN_features_from_X(X_pr_full, ELAS_FEATURES_TOPK, ONLY_QUEUE_UTIL)
        cand_pr = [c for c in cand_pr if c in trained_feats_pr]
        if cand_pr:
            rf = packs["production"]["rf"][0]
            out = run_elasticity_block(f"production_SKU{sid}_fast_lite", rf, X_pr_full, cand_pr, ELAS_SAMPLE_N)
            per_sku_elasticity[sid]["production"] = out

print("Elasticity (lite) CSVs saved to:", OUT_DIR)


=== AGG 리드타임 탄력도 (상위 N개 피처) ===


Unnamed: 0,feature,slope
1,Blanking_SKU1_Queue,0.008873
5,Press1_Util,0.004446
4,Blanking_SKU4_Queue,0.000852
3,Blanking_SKU3_Queue,-0.001244
2,Blanking_SKU2_Queue,-0.002123
6,Press2_Util,-0.040837
7,Press3_Util,-0.040982
0,Blanking_Util,-0.672543



=== AGG 생산량 탄력도 (상위 N개 피처) ===


Unnamed: 0,feature,slope
0,Blanking_Util,225.401591
5,Press1_Util,3.245672
2,Blanking_SKU2_Queue,1.616548
3,Blanking_SKU3_Queue,1.16634
1,Blanking_SKU1_Queue,0.0
4,Blanking_SKU4_Queue,0.0
6,Press2_Util,0.0
7,Press3_Util,0.0


Elasticity (lite) CSVs saved to: ./outputs


In [8]:
# ============================================
# 6) What‑if 시뮬레이터 (AGG & SKU별)
# ============================================
from typing import Dict, Tuple, List

def simulate_delta(model, X_base: pd.DataFrame, deltas: Dict[str, float]) -> Tuple[float, float, float]:
    X_sim = X_base.copy()
    for k, v in deltas.items():
        if k in X_sim.columns:
            X_sim[k] = X_sim[k] + v
    base_mean = float(model.predict(X_base).mean())
    new_mean  = float(model.predict(X_sim).mean())
    return base_mean, new_mean, new_mean - base_mean

# 간이 탄력도 셀에서 썼던 정렬 유틸이 없을 수도 있으니 가드
def align_X_for_model(X_built: pd.DataFrame, trained_feats: List[str]) -> pd.DataFrame:
    return X_built.reindex(columns=trained_feats, fill_value=0.0)

whatif = {"AGG": {"lead_time": {}, "production": {}}, "SKU": {}}

# ---------- AGG(통합)용 베이스 X 재구성 & 정렬 ----------
# LeadTime
X_lt_agg_built = build_features_for_leadtime(df, exclude_cols=[], add_bneck_onehot=True, top_bneck_list=TOP_BNECKS)
X_full_lt_agg  = align_X_for_model(X_lt_agg_built, FEATS_AG_LT)  # 학습 시 피처 순서/이름에 맞춤
sc_lt = get_scenarios(X_full_lt_agg.columns)
for name, v in sc_lt.items():
    base_m, new_m, d_m = simulate_delta(rf_lt_agg, X_full_lt_agg, {name: v})
    whatif["AGG"]["lead_time"][f"{name}_{v:+g}"] = {"base_mean": base_m, "new_mean": new_m, "delta_mean": d_m}

# Production
X_pr_agg_built = build_features_for_production(df, exclude_cols=[])
X_full_pr_agg  = align_X_for_model(X_pr_agg_built, FEATS_AG_PR)
sc_pr = get_scenarios(X_full_pr_agg.columns)
for name, v in sc_pr.items():
    base_m, new_m, d_m = simulate_delta(rf_pr_agg, X_full_pr_agg, {name: v})
    whatif["AGG"]["production"][f"{name}_{v:+g}"] = {"base_mean": base_m, "new_mean": new_m, "delta_mean": d_m}

# ---------- SKU별 ----------
for sid, packs in per_sku_results.items():
    whatif["SKU"][sid] = {}

    # LeadTime
    if packs["lead_time"] is not None:
        # 해당 SKU 리드타임 피처 재구성
        exclude_lt = set(sku_targets[sid]["cell_cols"])
        exclude_lt |= {c for c in df.columns if c.startswith(f"SKU{sid}_") and time_like_regex.search(c)}
        X_lt_sku_built = build_features_for_leadtime(df, list(exclude_lt), add_bneck_onehot=True, top_bneck_list=TOP_BNECKS)

        trained_feats_lt = packs["lead_time"]["feats"]  # 학습 시 사용한 전체 피처 목록
        X_lt_full = align_X_for_model(X_lt_sku_built, trained_feats_lt)

        lt_map = {}
        for name, v in WHATIF_SCENARIOS.items():
            if name in X_lt_full.columns:  # 존재하는 피처만 시나리오 적용
                base_m, new_m, d_m = simulate_delta(packs["lead_time"]["rf"][0], X_lt_full, {name: v})
                lt_map[f"{name}_{v:+g}"] = {"base_mean": base_m, "new_mean": new_m, "delta_mean": d_m}
        whatif["SKU"][sid]["lead_time"] = lt_map

    # Production
    if packs["production"] is not None:
        exclude_pr = set(sku_targets[sid]["cell_cols"])
        X_pr_sku_built = build_features_for_production(df, list(exclude_pr))

        trained_feats_pr = packs["production"]["feats"]
        X_pr_full = align_X_for_model(X_pr_sku_built, trained_feats_pr)

        pr_map = {}
        for name, v in WHATIF_SCENARIOS.items():
            if name in X_pr_full.columns:
                base_m, new_m, d_m = simulate_delta(packs["production"]["rf"][0], X_pr_full, {name: v})
                pr_map[f"{name}_{v:+g}"] = {"base_mean": base_m, "new_mean": new_m, "delta_mean": d_m}
        whatif["SKU"][sid]["production"] = pr_map

# 저장 및 출력
with open(ensure_dir(os.path.join(OUT_DIR, "whatif_results.json")), "w", encoding="utf-8") as f:
    json.dump(whatif, f, ensure_ascii=False, indent=2)

# --- 설명 출력 ---
print("\n[What-if 시뮬레이션 설명]")
print("특정 변수(Queue, Util 등)를 인위적으로 ± 변화시켰을 때,")
print("리드타임과 생산량 평균이 얼마나 달라지는지를 계산한 결과입니다.")
print("→ base_mean: 원래 평균값, new_mean: 변화 후 평균값, delta_mean: 차이")

print("\n=== What-if (AGG) ===")
print(json.dumps(whatif["AGG"], ensure_ascii=False, indent=2))


[What-if 시뮬레이션 설명]
특정 변수(Queue, Util 등)를 인위적으로 ± 변화시켰을 때,
리드타임과 생산량 평균이 얼마나 달라지는지를 계산한 결과입니다.
→ base_mean: 원래 평균값, new_mean: 변화 후 평균값, delta_mean: 차이

=== What-if (AGG) ===
{
  "lead_time": {
    "Blanking_Queue_-50": {
      "base_mean": 2.510594421587846,
      "new_mean": 1.8318075097749484,
      "delta_mean": -0.6787869118128977
    },
    "Warehouse1_Queue_-50": {
      "base_mean": 2.510594421587846,
      "new_mean": 2.5106675099329148,
      "delta_mean": 7.308834506858375e-05
    },
    "Quality_Util_+5": {
      "base_mean": 2.510594421587846,
      "new_mean": 2.510832725805052,
      "delta_mean": 0.00023830421720560935
    }
  },
  "production": {
    "Blanking_Queue_-50": {
      "base_mean": 54748.176045612076,
      "new_mean": 54746.31936813327,
      "delta_mean": -1.856677478805068
    },
    "Warehouse1_Queue_-50": {
      "base_mean": 54748.176045612076,
      "new_mean": 54748.18151975673,
      "delta_mean": 0.005474144651088864
    },
    "Quality_Util_+5": {


In [9]:
# ============================================
# 7a) 품질 단계 탈락(불량) 프록시 생성
#  - Paint1~2 Util, Quality Util/Queue, 최종 생산량 증가분(Δ)로 간이 지표 구성
#  - 프록시: 탈락이 커 보일수록 값 증가 (0~1 정규화)
# ============================================
import numpy as np
import pandas as pd

def _safe_col(df, name, default=0.0):
    return pd.to_numeric(df[name], errors="coerce").fillna(default).astype("float32") if name in df.columns else pd.Series(default, index=df.index, dtype="float32")

# 1) 필요한 원천 시계열
u_p1   = _safe_col(df, "Paint1_Util")
u_p2   = _safe_col(df, "Paint2_Util")
u_q    = _safe_col(df, "Quality_Util")
q_q    = _safe_col(df, "Quality_Queue")
tot    = _safe_col(df, "c_TotalProducts")

# 2) 최종 산출 증가분(Δ) = 생산량의 1스텝 차분 (음수는 0으로 클립)
#    큰 데이터엔 약간의 노이즈가 있어 이동평균으로 부드럽게
delta_final_raw = tot.diff().fillna(0.0).clip(lower=0).astype("float32")
delta_final     = delta_final_raw.rolling(window=5, min_periods=1).mean().astype("float32")

# 3) 상·하류 "활동 지수" 구성 (스케일 프리)
#    - 상류 활동: Paint 가동 (병렬 합) & Quality 가동
#    - 하류 산출: 최종 산출 증가분
upstream_idx = (u_p1 + u_p2 + u_q).astype("float32")                   # 상류 가동 강도
quality_load = (q_q).astype("float32")                                 # 품질 대기열(적체)
downstream_y = (delta_final).astype("float32")                          # 실제 산출

# 4) z-정규화 (분산 0 방지)
def _z(x):
    mu, sd = float(np.nanmean(x)), float(np.nanstd(x))
    if not np.isfinite(sd) or sd == 0:
        return pd.Series(0.0, index=df.index, dtype="float32")
    return ((x - mu) / sd).astype("float32")

zu   = _z(upstream_idx)
zq   = _z(quality_load)
zy   = _z(downstream_y)

# 5) 간이 탈락 프록시: 상류활동↑ + 품질적체↑ 대비 산출↓
#    → 값이 클수록 "탈락/재작업/통과율 저하" 의심
raw_defect_proxy = (zu + zq - zy).astype("float32")

# 6) 0~1 범위로 클리핑 & 정규화 (이동창으로 완만화)
win = 25
proxy_smooth = raw_defect_proxy.rolling(window=win, min_periods=1).mean()
mn, mx = float(proxy_smooth.quantile(0.01)), float(proxy_smooth.quantile(0.99))
den = (mx - mn) if (mx - mn) > 1e-6 else 1.0
quality_reject_proxy = ((proxy_smooth - mn) / den).clip(0, 1).astype("float32")

# 7) 타깃 시리즈로 df에 부착
df["_quality_reject_proxy"] = quality_reject_proxy
print("[DEFECT PROXY] '_quality_reject_proxy' 생성 완료. 예시 통계:",
      {"min": float(quality_reject_proxy.min()),
       "median": float(quality_reject_proxy.median()),
       "max": float(quality_reject_proxy.max())})

[DEFECT PROXY] '_quality_reject_proxy' 생성 완료. 예시 통계: {'min': 0.0, 'median': 0.5141528248786926, 'max': 1.0}


In [10]:
# 병목 ↔ 불량 proxy 관계 집계
if "_bneck" in df.columns and "_quality_reject_proxy" in df.columns:
    tmp = pd.DataFrame({
        "bneck": df["_bneck"],
        "defect_proxy": df["_quality_reject_proxy"].astype("float32")
    })
    stats = tmp.groupby("bneck").agg(
        count=("defect_proxy", "count"),
        mean_defect=("defect_proxy", "mean"),
        std_defect=("defect_proxy", "std")
    ).sort_values("mean_defect", ascending=False).reset_index()

    print("\n=== 병목 ↔ 불량 Proxy 관계 ===")
    display(stats.head(15))  # 상위 15개만 보기
else:
    print("bneck 또는 defect proxy 컬럼이 없습니다.")


=== 병목 ↔ 불량 Proxy 관계 ===


Unnamed: 0,bneck,count,mean_defect,std_defect
0,Warehouse_4_Queue,36,0.553382,0.187337
1,Press2_Queue,5,0.528022,0.216524
2,Warehouse1_Queue,77631,0.521246,0.210814
3,Warehouse_3_Queue,1307,0.50442,0.216034
4,Forklift_Blanking_Queue,53681,0.497501,0.211187
5,Forklift_Press_Queue,9,0.479962,0.117338
6,Press4_Queue,6,0.447602,0.117323
7,Blanking_SKU3_Queue,1,0.0,


In [11]:
# ============================================
# 7) 결함/불량 타깃 자동 탐지 (프록시 기반) · 동일 파이프라인
#  - '_quality_reject_proxy'를 타깃으로 모델 학습/해석
# ============================================

# 타깃 이름
tgt_name = "_quality_reject_proxy"

# 피처 구성 시 타깃을 명시적으로 제외 (병목 one-hot 포함)
X_df = build_features_for_leadtime(
    df,
    exclude_cols=[tgt_name],            # ★ 누수 차단 포인트
    add_bneck_onehot=True,
    top_bneck_list=TOP_BNECKS
)

y_df = df[tgt_name].astype("float32").values

# 모델 재학습
rf, spl, m_rf, fi, pi = train_rf(X_df, y_df)
rg, spl2, m_rg, coef, beta_std = train_ridge(X_df, y_df)

print("[Proxy Defect - no leakage] RF score on train:", m_rf)
print("[Proxy Defect - no leakage] Ridge metrics:", m_rg)

print("\n=== Defect Proxy - Ridge 표준화 계수 상위 (누수 제거) ===")
top_beta = beta_std.sort_values(key=lambda s: s.abs(), ascending=False).head(15)
display(top_beta)

if fi is not None:
    print("\n=== Defect Proxy - 트리 중요도 상위 (누수 제거) ===")
    display(fi.sort_values(ascending=False).head(15))
else:
    print("No proxy target found. 먼저 7a 셀을 실행하여 '_quality_reject_proxy'를 만드세요.")

[SPEED] Using HistGradientBoostingRegressor.
[Proxy Defect - no leakage] RF score on train: 0.02340521780170579
[Proxy Defect - no leakage] Ridge metrics: {'MAE': 0.169, 'RMSE': 0.21, 'R2': 0.0129}

=== Defect Proxy - Ridge 표준화 계수 상위 (누수 제거) ===


c_Cell4__SKU3                    0.146085
c_Cell3__SKU3                   -0.096936
c_Cell1__SKU2                    0.073247
Quality_Queue                    0.072795
c_Cell1_SKU1                     0.071760
c_Cell1__SKU4                    0.060462
c_Cell4__SKU4                    0.046548
Press2_Util                     -0.029907
c_Cell2__SKU2                    0.027182
BNECK_Forklift_Blanking_Queue   -0.021411
Press2_Queue                     0.020604
BNECK_Warehouse1_Queue          -0.020366
Warehouse_2_Queue               -0.019236
Press3_Util                      0.018836
c_Cell2__SKU4                    0.012542
dtype: float64

No proxy target found. 먼저 7a 셀을 실행하여 '_quality_reject_proxy'를 만드세요.


In [12]:
# === 병목 ↔ 불량 Proxy: 신뢰도 필터 + 리프트 계산 + 타이틀/저장 ===
min_count = 200  # 표본수 하한 (원하면 500/1000 등으로 조절)
tgt_name = "_quality_reject_proxy"

if "_bneck" in df.columns and tgt_name in df.columns:
    overall_mean = float(df[tgt_name].mean())
    tmp = pd.DataFrame({"bneck": df["_bneck"], "defect_proxy": df[tgt_name].astype("float32")})
    stats = (
        tmp.groupby("bneck", observed=False)
           .agg(count=("defect_proxy","count"),
                mean_defect=("defect_proxy","mean"),
                std_defect=("defect_proxy","std"))
           .reset_index()
    )
    stats["lift_vs_overall"] = stats["mean_defect"] - overall_mean
    stats = stats.sort_values(["count","mean_defect"], ascending=[False,False])

    # 신뢰 구간: 표본수 기준 필터
    stats_strong = stats[stats["count"] >= min_count].copy()
    stats_weak   = stats[stats["count"] <  min_count].copy()

    # 저장
    out_main = ensure_dir(os.path.join(OUT_DIR, "bneck_vs_defectProxy_main.csv"))
    out_weak = ensure_dir(os.path.join(OUT_DIR, "bneck_vs_defectProxy_lowCount.csv"))
    stats_strong.to_csv(out_main, index=False)
    stats_weak.to_csv(out_weak, index=False)

    # 출력 (제목 포함)
    print(f"\n=== 병목 ↔ 불량 Proxy (표본수 ≥ {min_count}) — 전체 평균={overall_mean:.3f} ===")
    display(stats_strong.sort_values("lift_vs_overall", ascending=False).head(15))

    if len(stats_weak):
        print(f"\n=== 병목 ↔ 불량 Proxy (표본수 < {min_count}) — 참고용 ===")
        display(stats_weak.sort_values("lift_vs_overall", ascending=False).head(15))

else:
    print("bneck 또는 defect proxy 컬럼이 없습니다. 먼저 7a 프록시 생성과 병목 라벨링을 확인하세요.")


=== 병목 ↔ 불량 Proxy (표본수 ≥ 200) — 전체 평균=0.511 ===


Unnamed: 0,bneck,count,mean_defect,std_defect,lift_vs_overall
5,Warehouse1_Queue,77631,0.521246,0.210814,0.009774
6,Warehouse_3_Queue,1307,0.50442,0.216034,-0.007052
1,Forklift_Blanking_Queue,53681,0.497501,0.211187,-0.013971



=== 병목 ↔ 불량 Proxy (표본수 < 200) — 참고용 ===


Unnamed: 0,bneck,count,mean_defect,std_defect,lift_vs_overall
7,Warehouse_4_Queue,36,0.553382,0.187337,0.04191
3,Press2_Queue,5,0.528022,0.216524,0.01655
2,Forklift_Press_Queue,9,0.479962,0.117338,-0.03151
4,Press4_Queue,6,0.447602,0.117323,-0.06387
0,Blanking_SKU3_Queue,1,0.0,,-0.511472


In [13]:
import numpy as np

# 기본
tgt_name = "_quality_reject_proxy"
min_count = 200

overall_mean = float(df[tgt_name].mean())

tmp = pd.DataFrame({"bneck": df["_bneck"], "defect_proxy": df[tgt_name].astype("float32")})
stats = (
    tmp.groupby("bneck", observed=False)
       .agg(count=("defect_proxy","count"),
            mean_defect=("defect_proxy","mean"),
            std_defect=("defect_proxy","std"))
       .reset_index()
)

# 통계량
stats["se"] = stats["std_defect"]/np.sqrt(stats["count"].clip(lower=1))
stats["ci_low"]  = stats["mean_defect"] - 1.96*stats["se"]
stats["ci_high"] = stats["mean_defect"] + 1.96*stats["se"]
stats["lift_vs_overall"] = stats["mean_defect"] - overall_mean
stats["lift_low"]  = stats["ci_low"]  - overall_mean
stats["lift_high"] = stats["ci_high"] - overall_mean
stats["rel_lift_%"] = 100.0 * stats["lift_vs_overall"] / overall_mean
stats["z"] = (stats["mean_defect"] - overall_mean) / stats["se"].replace(0, np.nan)
stats["sig_95"] = np.where(stats["z"].abs() >= 1.96, "★", "")

# 표본수 필터
stats_strong = stats[stats["count"] >= min_count].copy()
stats_weak   = stats[stats["count"] <  min_count].copy()

# 정렬 및 저장
stats_strong = stats_strong.sort_values(["rel_lift_%","count"], ascending=[False,False])
stats_weak   = stats_weak.sort_values(["rel_lift_%","count"], ascending=[False,False])

stats_strong.to_csv(ensure_dir(os.path.join(OUT_DIR, "bneck_vs_defectProxy_main_CI.csv")), index=False)
stats_weak.to_csv(ensure_dir(os.path.join(OUT_DIR, "bneck_vs_defectProxy_lowCount_CI.csv")), index=False)

# 제목 + 출력
print(f"\n=== 병목 ↔ 불량 Proxy (표본≥{min_count}) — 전체 평균={overall_mean:.3f} ===")
cols_show = ["bneck","count","mean_defect","std_defect","lift_vs_overall","rel_lift_%","ci_low","ci_high","sig_95"]
display(stats_strong[cols_show].head(15))

if len(stats_weak):
    print(f"\n=== 병목 ↔ 불량 Proxy (표본<{min_count}) — 참고용 ===")
    display(stats_weak[cols_show].head(15))

# 한 줄 요약
if len(stats_strong):
    top = stats_strong.iloc[0]
    print(f"\n[요약] 표본≥{min_count} 중 lift 최대: {top['bneck']} "
          f"(lift={top['lift_vs_overall']:.3f}, rel={top['rel_lift_%']:.1f}%, 95% CI [{top['lift_low']:.3f}, {top['lift_high']:.3f}] {top['sig_95']})")


=== 병목 ↔ 불량 Proxy (표본≥200) — 전체 평균=0.511 ===


Unnamed: 0,bneck,count,mean_defect,std_defect,lift_vs_overall,rel_lift_%,ci_low,ci_high,sig_95
5,Warehouse1_Queue,77631,0.521246,0.210814,0.009774,1.910962,0.519763,0.522729,★
6,Warehouse_3_Queue,1307,0.50442,0.216034,-0.007052,-1.378685,0.492708,0.516132,
1,Forklift_Blanking_Queue,53681,0.497501,0.211187,-0.013971,-2.731488,0.495714,0.499288,★



=== 병목 ↔ 불량 Proxy (표본<200) — 참고용 ===


Unnamed: 0,bneck,count,mean_defect,std_defect,lift_vs_overall,rel_lift_%,ci_low,ci_high,sig_95
7,Warehouse_4_Queue,36,0.553382,0.187337,0.04191,8.194092,0.492186,0.614579,
3,Press2_Queue,5,0.528022,0.216524,0.01655,3.235831,0.338231,0.717814,
2,Forklift_Press_Queue,9,0.479962,0.117338,-0.03151,-6.160611,0.403301,0.556623,
4,Press4_Queue,6,0.447602,0.117323,-0.06387,-12.487465,0.353724,0.54148,
0,Blanking_SKU3_Queue,1,0.0,,-0.511472,-100.0,,,



[요약] 표본≥200 중 lift 최대: Warehouse1_Queue (lift=0.010, rel=1.9%, 95% CI [0.008, 0.011] ★)


In [14]:
import re

# 행별 "지배 SKU" 간이 추정: 각 행에서 SKU별 생산(모든 셀 합) 중 최댓값의 SKU
cell_sku_cols = [c for c in df.columns if re.match(r"^c_Cell\d+__?SKU\d+$", c)]
sku_ids = sorted(set(re.findall(r"SKU(\d+)", " ".join(cell_sku_cols))), key=lambda x:int(x))

sku_by_row = pd.DataFrame({
    s: df.filter(regex=fr"^c_Cell\d+__?SKU{s}$").sum(axis=1).astype("float32")
    for s in sku_ids
})
df["_sku_dom"] = sku_by_row.idxmax(axis=1).str.extract(r"(\d+)")
tmp2 = pd.DataFrame({"bneck": df["_bneck"], "sku": df["_sku_dom"], "defect_proxy": df[tgt_name]})
mat = tmp2.pivot_table(index="bneck", columns="sku", values="defect_proxy", aggfunc="mean")
print("\n=== 병목 × SKU 평균 불량 Proxy ===")
display(mat)


=== 병목 × SKU 평균 불량 Proxy ===


sku,1,2,3,4
bneck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blanking_SKU3_Queue,0.0,,,
Forklift_Blanking_Queue,0.497078,0.506164,0.494624,0.49436
Forklift_Press_Queue,0.407522,0.275847,0.516404,0.527135
Press2_Queue,,0.528022,,
Press4_Queue,,,,0.447602
Warehouse1_Queue,0.524061,0.525224,0.517986,0.512809
Warehouse_3_Queue,,0.543214,0.502751,0.520273
Warehouse_4_Queue,,0.483111,0.498602,0.588946


In [15]:
import numpy as np
import pandas as pd

tgt = "_quality_reject_proxy"
bnk = "_bneck"
focus = "Warehouse1_Queue"
min_count = 200  # 표본 하한

# SKU 도메인 라벨(앞서 만든 _sku_dom이 없으면 간이 지배 SKU 재계산)
if "_sku_dom" not in df.columns:
    import re
    cell_sku_cols = [c for c in df.columns if re.match(r"^c_Cell\d+__?SKU\d+$", c)]
    sku_ids = sorted(set(re.findall(r"SKU(\d+)", " ".join(cell_sku_cols))), key=lambda x:int(x))
    sku_by_row = pd.DataFrame({s: df.filter(regex=fr"^c_Cell\d+__?SKU{s}$").sum(axis=1).astype("float32") for s in sku_ids})
    df["_sku_dom"] = sku_by_row.idxmax(axis=1).str.extract(r"(\d+)")

res = []
for sku in sorted(df["_sku_dom"].dropna().unique(), key=lambda x:int(x)):
    sub = df[df["_sku_dom"] == sku]
    a = sub[sub[bnk] == focus][tgt].dropna()
    b = sub[sub[bnk] != focus][tgt].dropna()
    if len(a) >= min_count and len(b) >= min_count:
        mean_a, mean_b = float(a.mean()), float(b.mean())
        lift = mean_a - mean_b
        rel = 100.0 * lift / (float(sub[tgt].mean()) + 1e-9)
        se = np.sqrt(a.var(ddof=1)/len(a) + b.var(ddof=1)/len(b))
        z = (lift / se) if se > 0 else np.nan
        res.append({"SKU": sku, "n_focus": len(a), "n_other": len(b),
                    "mean_focus": mean_a, "mean_other": mean_b,
                    "lift": lift, "rel_lift_%": rel, "z_approx": z})

sku_cmp = pd.DataFrame(res).sort_values(["rel_lift_%","n_focus"], ascending=[False,False])
print(f"\n=== {focus} 병목 vs 비-병목 (SKU별, n≥{min_count}) ===")
display(sku_cmp)


=== Warehouse1_Queue 병목 vs 비-병목 (SKU별, n≥200) ===


Unnamed: 0,SKU,n_focus,n_other,mean_focus,mean_other,lift,rel_lift_%,z_approx
0,1,27738,1633,0.524061,0.496719,0.027342,5.232476,5.138274
2,3,14526,19623,0.517987,0.49514,0.022847,4.525352,9.890108
1,2,21558,13539,0.525224,0.506246,0.018978,3.664424,8.181109
3,4,13809,20250,0.512809,0.494511,0.018297,3.645392,7.884097


In [16]:
# -6~+6 스텝에서 상관계수 계산 (데이터 간격에 맞춰 해석)
series_x = df["Warehouse1_Queue"].astype("float32")
series_y = df["_quality_reject_proxy"].astype("float32")

def corr_lag(x, y, lag):
    if lag > 0:
        return x.shift(lag).corr(y)
    elif lag < 0:
        return x.corr(y.shift(-lag))
    else:
        return x.corr(y)

lags = range(-6, 7)
corrs = {lag: corr_lag(series_x, series_y, lag) for lag in lags}
print("\n=== Warehouse1_Queue ↔ Defect Proxy 상관 (래그) ===")
for lag in sorted(corrs):
    print(f"Lag {lag:+d}: {corrs[lag]:.4f}")


=== Warehouse1_Queue ↔ Defect Proxy 상관 (래그) ===
Lag -6: 0.0042
Lag -5: 0.0041
Lag -4: 0.0051
Lag -3: 0.0056
Lag -2: 0.0051
Lag -1: 0.0053
Lag +0: 0.0620
Lag +1: 0.0625
Lag +2: 0.0620
Lag +3: 0.0614
Lag +4: 0.0605
Lag +5: 0.0636
Lag +6: 0.0631


In [17]:
x = df["Warehouse1_Queue"].astype("float32")
y = df["_quality_reject_proxy"].astype("float32")
bins = pd.qcut(x, 10, duplicates="drop")  # 분위수 10구간
trend = pd.DataFrame({"bin": bins, "proxy": y}).groupby("bin").agg(
    n=("proxy","count"), proxy_mean=("proxy","mean")).reset_index()
print("\n=== 창고1 Queue 분위수별 불량 Proxy 평균 ===")
display(trend)


=== 창고1 Queue 분위수별 불량 Proxy 평균 ===


  trend = pd.DataFrame({"bin": bins, "proxy": y}).groupby("bin").agg(


Unnamed: 0,bin,n,proxy_mean
0,"(-0.001, 108.222]",13268,0.488169
1,"(108.222, 124.823]",13268,0.497153
2,"(124.823, 138.934]",13267,0.50014
3,"(138.934, 153.135]",13268,0.501666
4,"(153.135, 168.504]",13267,0.507277
5,"(168.504, 186.166]",13268,0.514875
6,"(186.166, 208.301]",13267,0.518185
7,"(208.301, 240.038]",13268,0.522366
8,"(240.038, 296.61]",13267,0.527139
9,"(296.61, 1828.135]",13268,0.537749


In [18]:
thr = df["Warehouse1_Queue"].quantile(0.80)
df["_w1q_risk"] = (df["Warehouse1_Queue"] >= thr).astype(int)

summary = df.groupby("_w1q_risk")["_quality_reject_proxy"].agg(["count","mean","std"])
print(f"[임계치] Warehouse1_Queue ≥ {thr:.1f} 를 고위험으로 간주")
display(summary)

[임계치] Warehouse1_Queue ≥ 240.0 를 고위험으로 간주


Unnamed: 0_level_0,count,mean,std
_w1q_risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,106140,0.506231,0.211275
1,26536,0.532436,0.21023


In [19]:
q_bins = pd.qcut(df["Quality_Util"], q=3, labels=["low","mid","high"])
def layer_lift(layer):
    sub = df[q_bins == layer]
    a = sub[sub["_bneck"]=="Warehouse1_Queue"]["_quality_reject_proxy"]
    b = sub[sub["_bneck"]!="Warehouse1_Queue"]["_quality_reject_proxy"]
    return pd.Series({
        "n_a": len(a), "n_b": len(b),
        "mean_a": a.mean(), "mean_b": b.mean(),
        "lift": a.mean()-b.mean()
    })
print("\n[층화] Quality_Util 레벨별 창고1 병목 lift")
display(pd.concat([layer_lift("low"), layer_lift("mid"), layer_lift("high")], axis=1).T)


[층화] Quality_Util 레벨별 창고1 병목 lift


Unnamed: 0,n_a,n_b,mean_a,mean_b,lift
0,22901.0,21326.0,0.500582,0.480096,0.020487
1,26020.0,18205.0,0.520903,0.500898,0.020005
2,28710.0,15514.0,0.538039,0.518102,0.019937


In [20]:
import numpy as np
rows=[]
for sku in sorted(df["_sku_dom"].dropna().unique(), key=lambda x:int(x)):
    sub = df[df["_sku_dom"]==sku]
    a = sub[sub["_bneck"]=="Warehouse1_Queue"]["_quality_reject_proxy"].dropna()
    b = sub[sub["_bneck"]!="Warehouse1_Queue"]["_quality_reject_proxy"].dropna()
    if len(a)>1 and len(b)>1:
        lift = a.mean()-b.mean()
        se = np.sqrt(a.var(ddof=1)/len(a) + b.var(ddof=1)/len(b))
        ci = (lift-1.96*se, lift+1.96*se)
        rows.append({"SKU":sku,"n_a":len(a),"n_b":len(b),"lift":lift,"ci_low":ci[0],"ci_high":ci[1]})
display(pd.DataFrame(rows).sort_values("lift", ascending=False))

Unnamed: 0,SKU,n_a,n_b,lift,ci_low,ci_high
0,1,27738,1633,0.027342,0.016912,0.037771
2,3,14526,19623,0.022847,0.018319,0.027374
1,2,21558,13539,0.018978,0.014431,0.023525
3,4,13809,20250,0.018297,0.013749,0.022846


In [21]:
# ============================================
# 8) (옵션) mu/sd 기반 편차 파생피처 (기본 OFF)
# ============================================
# 필요 시 패턴만 남겨둠. SPEED_MODE=True면 기본적으로 사용 안 함.
MUSD: Dict[str, Dict[str, Dict[str, float]]] = {
    "SKU1": {
        "mu": {"BL":900.0, "PR":5.0, "AS":25.0, "PA":5400.0, "QL":55.0},
        "sd": {"BL":30.0,  "PR":0.1, "AS":0.1,  "PA":0.0,    "QL":2.04},
    }
}
REPLICATE_TO_OTHERS = True

def add_musd_features(df_: pd.DataFrame, sku_ids_: List[str]) -> List[str]:
    created = []
    for sid in sku_ids_:
        key = f"SKU{sid}"
        if key not in MUSD and REPLICATE_TO_OTHERS and "SKU1" in MUSD:
            MUSD[key] = MUSD["SKU1"]
        if key not in MUSD:
            continue
        mu = MUSD[key]["mu"]; sd = MUSD[key]["sd"]
        mu_sum = mu["BL"] + mu["PR"] + mu["AS"] + mu["PA"] + mu["QL"]
        S_var  = sd["BL"]**2 + sd["PR"]**2 + sd["AS"]**2 + sd["QL"]**2
        S_std  = math.sqrt(S_var) if S_var > 0 else 1.0
        base = f"SKU{sid}"
        va_col = next((base+s for s in LT_PATTERNS["VA"] if (base+s) in df_.columns), None)
        VA = pd.to_numeric(df_[va_col], errors="coerce").fillna(0.0) if va_col else pd.Series(0.0, index=df_.index)
        dev_col = f"{base}_VA_dev"; z_col = f"{base}_VA_z"
        df_[dev_col] = (VA - mu_sum).astype("float32")
        df_[z_col]   = ((VA - mu_sum)/S_std).astype("float32")
        created += [dev_col, z_col]
    return created

# 사용 예)
# if not SPEED_MODE and INCLUDE_MUSD_IN_PR:
#     musd_cols = add_musd_features(df, sku_ids)
#     print("MUSD features created:", musd_cols)

In [22]:
# 필요할 때만 수동 호출
musd_cols = add_musd_features(df, sku_ids)
print("MUSD features created:", musd_cols)

MUSD features created: ['SKU1_VA_dev', 'SKU1_VA_z', 'SKU2_VA_dev', 'SKU2_VA_z', 'SKU3_VA_dev', 'SKU3_VA_z', 'SKU4_VA_dev', 'SKU4_VA_z']


In [23]:
# SUMMARY 직전에 추가
def pick_existing(*names):
    for n in names:
        if os.path.exists(os.path.join(OUT_DIR, n)):
            return n
    return names[0]  # 없으면 원래 이름 유지

summary["outputs"]["agg_fi_pi_lt"] = [
    "fi_leadtime_AGG.csv",
    "pi_leadtime_AGG.csv",
    "fi_leadtime_AGG_top20.png",
    "ridge_coef_leadtime_AGG.csv",
    "ridge_betaStd_leadtime_AGG.csv",
    pick_existing("elasticity_leadtime_AGG_fast.csv",
                  "elasticity_leadtime_AGG_fast_lite.csv")
]
summary["outputs"]["agg_fi_pi_pr"] = [
    "fi_production_AGG.csv",
    "pi_production_AGG.csv",
    "fi_production_AGG_top20.png",
    "ridge_coef_production_AGG.csv",
    "ridge_betaStd_production_AGG.csv",
    pick_existing("elasticity_production_AGG_fast.csv",
                  "elasticity_production_AGG_fast_lite.csv")
]

KeyError: 'outputs'

In [None]:
# ============================================
# 9) 요약 리포트 저장
# ============================================
summary = {
    "cwd": os.getcwd(),
    "dataset_shape": tuple(df.shape),
    "detected_skus": sku_ids,
    "top_bottlenecks_used": TOP_BNECKS,
    "metrics": {
        "AGG": {
            "RF_LT": m_rf_lt_agg, "RG_LT": m_rg_lt_agg,
            "RF_PR": m_rf_pr_agg, "RG_PR": m_rg_pr_agg,
        }
    },
    "outputs": {
        "bottleneck_total": os.path.join(OUT_DIR, "bottleneck_totalProduction.csv"),
        "bottleneck_aggLT": os.path.join(OUT_DIR, "bottleneck_aggLeadTime.csv"),
        "sku_bneck_effect": "sku_bneck_effect_SKU*.csv",
        "agg_fi_pi_lt": [
            "fi_leadtime_AGG.csv","pi_leadtime_AGG.csv","fi_leadtime_AGG_top20.png",
            "ridge_coef_leadtime_AGG.csv","ridge_betaStd_leadtime_AGG.csv",
            "elasticity_leadtime_AGG_fast.csv"
        ],
        "agg_fi_pi_pr": [
            "fi_production_AGG.csv","pi_production_AGG.csv","fi_production_AGG_top20.png",
            "ridge_coef_production_AGG.csv","ridge_betaStd_production_AGG.csv",
            "elasticity_production_AGG_fast.csv"
        ],
        "per_sku_elasticity": "elasticity_(leadtime|production)_SKU*_fast.csv",
        "whatif": os.path.join(OUT_DIR, "whatif_results.json")
    },
    "notes": [
        "생산량 모델에서 c_TotalProducts, c_Cell* 피처 제외 → 타깃 누수 방지.",
        "리드타임 모델에서 *_Time/_sec/_LeadTime/_Total 피처 제외 → 누수 방지.",
        "병목 라벨(_bneck) 원-핫 추가로 병목 상태의 평균적 영향 반영.",
        "탄력도: FD(샘플링)으로 빠르게 ∂ŷ/∂x 추정.",
        "SPEED_MODE=True일 때 HGBR 우선, fi/pi가 None일 수 있음(가드 적용).",
        f"SKU 상위 {TOP_SKU_K}개만 학습해 전체 시간 단축."
    ]
}
with open(ensure_dir(os.path.join(OUT_DIR, "SUMMARY.json")), "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("\n=== SUMMARY ===")
print(json.dumps(summary, ensure_ascii=False, indent=2))