In [4]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# Project root. Defaults to the original checkout location; set the
# SKN23_PROJECT_ROOT environment variable to run this on another machine
# without editing the file. An empty value falls back to the default.
PROJECT_ROOT = Path(
    os.environ.get("SKN23_PROJECT_ROOT") or "/Users/jy/project_2nd/SKN23-2nd-3Team"
)

# Processed feature table, read with pd.read_parquet further down.
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "features_ml_clean.parquet"
# Base directories; the model/scaler file paths are built from these as
# <root>/<MODEL_NAME>/<VERSION>/<file>.pkl.
MODELS_ML_ROOT = PROJECT_ROOT / "models" / "ml"
PREP_ROOT = PROJECT_ROOT / "models" / "preprocessing"

# Column order used when building the single-row model input. It is also
# compared against scaler.feature_names_in_ when a scaler is loaded, so the
# order here is significant.
FEATURE_ORDER = [
    "n_events_30d",
    "active_days_30d",
    "n_purchase_30d",
    "purchase_ratio",
    "days_since_last_event",
    "days_since_last_purchase",
    "brand_concentration_ratio",
    "brand_switch_count_30d",
    "total_spend_30d",
    "activity_ratio_15d",
    "price_volatility",
    "n_events_7d",
    "visit_regularity",
    "activity_trend",
]

# --------------------------
# 1) parquet stats 로드 (분포 체크용)
# --------------------------
def load_feature_stats(df: pd.DataFrame, cols: list[str]) -> dict:
    """Compute per-column summary statistics for input range checks.

    Missing values are dropped from each column before any statistic is
    computed.

    Args:
        df: Frame that contains every column named in ``cols``.
        cols: Names of the columns to summarise.

    Returns:
        A dict mapping each column name to a dict with the float entries
        ``mean``, ``std``, ``median``, ``min``, ``max``, ``p01``, ``p05``,
        ``p95`` and ``p99``. ``std`` is replaced by 1.0 whenever it is zero
        or undefined so that it is always safe to divide by. A column with
        no non-null values gets fixed placeholder statistics
        (mean 0, std 1, range 0..1).

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    stats = {}
    for col in cols:
        values = df[col].dropna()
        if values.empty:
            # Nothing to measure: fall back to neutral placeholder statistics.
            stats[col] = dict(
                mean=0.0, std=1.0, median=0.0, min=0.0, max=1.0,
                p01=0.0, p05=0.0, p95=1.0, p99=1.0,
            )
            continue

        std = float(values.std())
        # Series.std() defaults to ddof=1, so a single observation yields NaN.
        # NaN compares unequal to 0, so it must be checked explicitly or it
        # would leak through as the divisor.
        if pd.isna(std) or std == 0:
            std = 1.0

        stats[col] = dict(
            mean=float(values.mean()),
            std=std,
            median=float(values.median()),
            min=float(values.min()),
            max=float(values.max()),
            p01=float(values.quantile(0.01)),
            p05=float(values.quantile(0.05)),
            p95=float(values.quantile(0.95)),
            p99=float(values.quantile(0.99)),
        )
    return stats

# Read only the model's feature columns; STATS feeds the input range report.
df_feat = pd.read_parquet(DATA_PATH, columns=FEATURE_ORDER)
STATS = load_feature_stats(df_feat, FEATURE_ORDER)

# --------------------------
# 2) Load model / scaler
# --------------------------
MODEL_NAME = "hgb"
VERSION = "baseline"

model_path = MODELS_ML_ROOT / MODEL_NAME / VERSION / "model.pkl"
scaler_path = PREP_ROOT / MODEL_NAME / VERSION / "scaler.pkl"

# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would defer the failure to joblib.load.
if not model_path.exists():
    raise FileNotFoundError(f"model not found: {model_path}")
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that come from a trusted source.
model = joblib.load(model_path)

# The scaler is optional: None means inputs are passed on without scaling.
scaler = joblib.load(scaler_path) if scaler_path.exists() else None

print("model:", type(model))
print("scaler:", type(scaler) if scaler is not None else None)
print("scaler_path exists?:", scaler_path.exists())

# --------------------------
# 3) 체크 유틸
# --------------------------
def build_x_raw_df(user_inputs: dict, feature_order: list[str]) -> pd.DataFrame:
    """Build a one-row frame of float inputs with columns in ``feature_order``.

    Args:
        user_inputs: Mapping from feature name to a value accepted by
            ``float()``. Extra keys are ignored.
        feature_order: Feature names, in the column order to produce.

    Returns:
        A single-row DataFrame whose columns are ``feature_order``.

    Raises:
        KeyError: If any name in ``feature_order`` is absent from
            ``user_inputs``; the message lists every missing name.
    """
    absent = []
    for name in feature_order:
        if name not in user_inputs:
            absent.append(name)
    if absent:
        raise KeyError(f"missing inputs for: {absent}")

    row = [float(user_inputs[name]) for name in feature_order]
    return pd.DataFrame([row], columns=feature_order)

def make_input_report(user_inputs: dict, stats: dict, feature_order: list[str]) -> pd.DataFrame:
    """Tabulate each input value against the reference percentiles.

    Args:
        user_inputs: Mapping from feature name to a value accepted by
            ``float()``.
        stats: Mapping from feature name to a dict providing ``p01``,
            ``p99``, ``p05``, ``p95`` and ``median``.
        feature_order: Feature names to report, one row each, in order.

    Returns:
        A DataFrame with one row per feature: the value, the reference
        percentiles and median, and two flags telling whether the value lies
        strictly outside the [p01, p99] and [p05, p95] ranges.
    """

    def _describe(name: str) -> dict:
        value = float(user_inputs[name])
        ref = stats[name]
        low_1, high_99 = ref["p01"], ref["p99"]
        low_5, high_95 = ref["p05"], ref["p95"]
        middle = ref["median"]
        # Written as two strict comparisons on purpose: a NaN value compares
        # false both ways and therefore is not flagged.
        beyond_1_99 = (value < low_1) or (value > high_99)
        beyond_5_95 = (value < low_5) or (value > high_95)
        return {
            "feature": name,
            "value": value,
            "p01": low_1,
            "p99": high_99,
            "p05": low_5,
            "p95": high_95,
            "median": middle,
            "flag_out_p01_p99": beyond_1_99,
            "flag_out_p05_p95": beyond_5_95,
        }

    return pd.DataFrame([_describe(name) for name in feature_order])

def zscore_by_stats(x_raw_df: pd.DataFrame, stats: dict) -> pd.DataFrame:
    """Standardise the first row of ``x_raw_df`` using precomputed mean/std.

    Intended only for the case where no scaler is available but the model
    still needs standardised input; passing raw values is the recommended
    default.

    Args:
        x_raw_df: Frame whose first row holds the raw feature values.
        stats: Mapping from column name to a dict with ``mean`` and ``std``.

    Returns:
        A one-row DataFrame with the same columns, each value transformed as
        ``(value - mean) / std``. A ``std`` of zero is treated as 1.0.
    """

    def _standardise(name) -> float:
        center = float(stats[name]["mean"])
        spread = float(stats[name]["std"])
        if spread == 0:
            spread = 1.0
        return (float(x_raw_df.iloc[0][name]) - center) / spread

    scaled_row = [_standardise(name) for name in x_raw_df.columns]
    return pd.DataFrame([scaled_row], columns=x_raw_df.columns)

def predict_with_debug(model, scaler, user_inputs: dict, when_no_scaler="raw"):
    """Predict a probability for one input row while showing debug tables.

    Displays the input range report, the raw input row and the row actually
    passed to the model, then prints the predicted probability together with
    the preprocessing path that was used.

    Args:
        model: Fitted classifier exposing ``predict_proba`` (an sklearn-style
            classifier is assumed).
        scaler: Fitted transformer exposing ``transform``, or None when the
            model was trained without one.
        user_inputs: Mapping with a value for every name in FEATURE_ORDER.
        when_no_scaler: Only consulted when ``scaler`` is None.
            - "raw": predict on the raw values (recommended default for
              HGB / tree-based models).
            - "zscore": standardise with STATS before predicting (only if
              the model was trained that way).

    Returns:
        Tuple ``(prob, rep, x_raw_df, x_df)``: the probability taken from
        column 1 of ``predict_proba``, the range report, the raw one-row
        frame, and the one-row frame fed to the model.

    Raises:
        KeyError: If ``user_inputs`` lacks a feature in FEATURE_ORDER.
        ValueError: If ``scaler`` is None and ``when_no_scaler`` is not
            "raw" or "zscore"; if the scaler's ``feature_names_in_`` differs
            from FEATURE_ORDER; or if the scaled features contain NaN/inf.
    """
    # Reject unknown modes up front. Previously any other string silently
    # behaved like "raw" while the printed "used:" label echoed the bogus
    # value, which hides exactly the kind of mismatch this helper should expose.
    if scaler is None and when_no_scaler not in ("raw", "zscore"):
        raise ValueError(
            f"when_no_scaler must be 'raw' or 'zscore', got {when_no_scaler!r}"
        )

    x_raw_df = build_x_raw_df(user_inputs, FEATURE_ORDER)

    rep = make_input_report(user_inputs, STATS, FEATURE_ORDER)
    display(rep)

    if scaler is not None:
        # With scikit-learn 1.0+ the fitted column names can be checked
        # through feature_names_in_ when the attribute is present.
        if hasattr(scaler, "feature_names_in_"):
            ok = list(scaler.feature_names_in_) == FEATURE_ORDER
            print("Matches FEATURE_ORDER exactly?:", ok)
            if not ok:
                raise ValueError("FEATURE_ORDER mismatch with scaler.feature_names_in_")

        x_scaled = scaler.transform(x_raw_df)
        if not np.isfinite(x_scaled).all():
            raise ValueError("scaled features contain NaN/inf")
        x_df = pd.DataFrame(x_scaled, columns=FEATURE_ORDER)
    elif when_no_scaler == "zscore":
        x_df = zscore_by_stats(x_raw_df, STATS)
    else:
        # Copy so the returned model input is independent of the raw frame.
        x_df = x_raw_df.copy()

    display(x_raw_df)
    display(x_df)

    # Prediction (assumes an sklearn classifier): column 1 of predict_proba.
    prob = float(model.predict_proba(x_df)[:, 1][0])
    print("pred prob:", prob, f"({prob*100:.2f}%)",
          "| used:", ("scaler" if scaler is not None else when_no_scaler))
    return prob, rep, x_raw_df, x_df

# --------------------------
# 4) Test with exactly the values entered in Streamlit
# --------------------------
user_inputs = {
    # Activity volume
    "n_events_30d": 2,                 # activity frequency (UI=2)
    "active_days_30d": 1,              # actual active days (UI=1)
    "n_purchase_30d": 0,               # purchase count, 30 days (UI=0)
    "purchase_ratio": 0.0,             # purchase conversion rate (UI=0.00)
    # Recency
    "days_since_last_event": 7,        # days since last activity (UI=7)
    "days_since_last_purchase": 31,    # days since last purchase (UI=31)
    # Brand behaviour
    "brand_concentration_ratio": 1.0,  # brand concentration (UI=1.00)
    "brand_switch_count_30d": 0,       # brand switches (UI=0)
    # Spend and recent share of activity
    "total_spend_30d": 0,              # total purchase amount (UI=0)
    "activity_ratio_15d": 0.5,         # activity ratio, last 15 days (UI=0.50)
    # Price and short-term activity
    "price_volatility": 0.0,           # price sensitivity (UI=0.00)
    "n_events_7d": 0,                  # events in the last 7 days (UI=0)
    # Regularity and trend
    "visit_regularity": 0.0,           # visit regularity (UI=0.00)
    "activity_trend": 0.0,             # activity trend (UI=0.00)
}

# For HGB, raw input is usually the right choice.
prob, rep, x_raw_df, x_df = predict_with_debug(
    model,
    scaler,
    user_inputs,
    when_no_scaler="raw",
)

# --------------------------
# 5) (Optional) Draw one real row from the parquet and check reproduction
# --------------------------
# Fixed seed so the same row is drawn on every run.
sample_raw = (
    df_feat.sample(n=1, random_state=42)[FEATURE_ORDER]
    .iloc[0]
    .to_dict()
)
print("\n[Sample from parquet] raw values")
display(pd.DataFrame([sample_raw]))

prob2, rep2, _, _ = predict_with_debug(
    model,
    scaler,
    sample_raw,
    when_no_scaler="raw",
)

model: <class 'sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier'>
scaler: None
scaler_path exists?: False


Unnamed: 0,feature,value,p01,p99,p05,p95,median,flag_out_p01_p99,flag_out_p05_p95
0,n_events_30d,2.0,0.0,35.0,1.0,15.0,2.0,False,False
1,active_days_30d,1.0,0.0,6.0,1.0,3.0,1.0,False,False
2,n_purchase_30d,0.0,0.0,13.0,0.0,4.0,0.0,False,False
3,purchase_ratio,0.0,0.0,0.212121,0.0,0.105263,0.0,False,False
4,days_since_last_event,7.0,0.182569,28.742535,0.380057,25.42733,7.218993,False,False
5,days_since_last_purchase,31.0,0.618282,31.0,4.51541,31.0,31.0,False,False
6,brand_concentration_ratio,1.0,0.0,1.0,0.5,1.0,1.0,False,False
7,brand_switch_count_30d,0.0,0.0,4.0,0.0,1.0,0.0,False,False
8,total_spend_30d,0.0,0.0,903.6425,0.0,288.3,0.0,False,False
9,activity_ratio_15d,0.5,0.0,1.0,0.0,1.0,0.5,False,False


Unnamed: 0,n_events_30d,active_days_30d,n_purchase_30d,purchase_ratio,days_since_last_event,days_since_last_purchase,brand_concentration_ratio,brand_switch_count_30d,total_spend_30d,activity_ratio_15d,price_volatility,n_events_7d,visit_regularity,activity_trend
0,2.0,1.0,0.0,0.0,7.0,31.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0


Unnamed: 0,n_events_30d,active_days_30d,n_purchase_30d,purchase_ratio,days_since_last_event,days_since_last_purchase,brand_concentration_ratio,brand_switch_count_30d,total_spend_30d,activity_ratio_15d,price_volatility,n_events_7d,visit_regularity,activity_trend
0,2.0,1.0,0.0,0.0,7.0,31.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0


pred prob: 0.8280121954049378 (82.80%) | used: raw

[Sample from parquet] raw values


Unnamed: 0,n_events_30d,active_days_30d,n_purchase_30d,purchase_ratio,days_since_last_event,days_since_last_purchase,brand_concentration_ratio,brand_switch_count_30d,total_spend_30d,activity_ratio_15d,price_volatility,n_events_7d,visit_regularity,activity_trend
0,1.0,1.0,0.0,0.0,19.649005,31.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0


Unnamed: 0,feature,value,p01,p99,p05,p95,median,flag_out_p01_p99,flag_out_p05_p95
0,n_events_30d,1.0,0.0,35.0,1.0,15.0,2.0,False,False
1,active_days_30d,1.0,0.0,6.0,1.0,3.0,1.0,False,False
2,n_purchase_30d,0.0,0.0,13.0,0.0,4.0,0.0,False,False
3,purchase_ratio,0.0,0.0,0.212121,0.0,0.105263,0.0,False,False
4,days_since_last_event,19.649005,0.182569,28.742535,0.380057,25.42733,7.218993,False,False
5,days_since_last_purchase,31.0,0.618282,31.0,4.51541,31.0,31.0,False,False
6,brand_concentration_ratio,1.0,0.0,1.0,0.5,1.0,1.0,False,False
7,brand_switch_count_30d,0.0,0.0,4.0,0.0,1.0,0.0,False,False
8,total_spend_30d,0.0,0.0,903.6425,0.0,288.3,0.0,False,False
9,activity_ratio_15d,0.0,0.0,1.0,0.0,1.0,0.5,False,False


Unnamed: 0,n_events_30d,active_days_30d,n_purchase_30d,purchase_ratio,days_since_last_event,days_since_last_purchase,brand_concentration_ratio,brand_switch_count_30d,total_spend_30d,activity_ratio_15d,price_volatility,n_events_7d,visit_regularity,activity_trend
0,1.0,1.0,0.0,0.0,19.649005,31.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0


Unnamed: 0,n_events_30d,active_days_30d,n_purchase_30d,purchase_ratio,days_since_last_event,days_since_last_purchase,brand_concentration_ratio,brand_switch_count_30d,total_spend_30d,activity_ratio_15d,price_volatility,n_events_7d,visit_regularity,activity_trend
0,1.0,1.0,0.0,0.0,19.649005,31.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0


pred prob: 0.9295517799631802 (92.96%) | used: raw
