# LightGBM을 사용한 ML(폐업률 평균값 기준으로 이진분류)

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    precision_recall_curve
)
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

In [58]:
# 데이터 로드
df = pd.read_csv('../../eda/data/merged_data.csv')

In [59]:
df.head()

Unnamed: 0,기준_년분기_코드,자치구_코드_명,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_률,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,...,남성연령대_30_직장_인구_수,남성연령대_40_직장_인구_수,남성연령대_50_직장_인구_수,남성연령대_60_이상_직장_인구_수,여성연령대_10_직장_인구_수,여성연령대_20_직장_인구_수,여성연령대_30_직장_인구_수,여성연령대_40_직장_인구_수,여성연령대_50_직장_인구_수,여성연령대_60_이상_직장_인구_수
0,20191,강남구,PC방,122.0,132.0,9.8,13.0,6.8,9.0,10.0,...,178522.0,164466.0,100728.0,44021.0,1679.0,99408.0,108205.0,74466.0,54523.0,22838.0
1,20191,강남구,가구,435.0,436.0,2.8,12.0,0.7,3.0,1.0,...,178522.0,164466.0,100728.0,44021.0,1679.0,99408.0,108205.0,74466.0,54523.0,22838.0
2,20191,강남구,가방,279.0,280.0,4.3,12.0,3.6,10.0,1.0,...,178522.0,164466.0,100728.0,44021.0,1679.0,99408.0,108205.0,74466.0,54523.0,22838.0
3,20191,강남구,가전제품,163.0,163.0,1.8,3.0,1.2,2.0,0.0,...,178522.0,164466.0,100728.0,44021.0,1679.0,99408.0,108205.0,74466.0,54523.0,22838.0
4,20191,강남구,가전제품수리,92.0,92.0,3.3,3.0,1.1,1.0,0.0,...,178522.0,164466.0,100728.0,44021.0,1679.0,99408.0,108205.0,74466.0,54523.0,22838.0


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39975 entries, 0 to 39974
Columns: 137 entries, 기준_년분기_코드 to 여성연령대_60_이상_직장_인구_수
dtypes: float64(132), int64(2), object(3)
memory usage: 41.8+ MB


In [61]:
df.describe()

Unnamed: 0,기준_년분기_코드,점포_수,유사_업종_점포_수,개업_률,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,당월_매출_금액,당월_매출_건수,...,남성연령대_30_직장_인구_수,남성연령대_40_직장_인구_수,남성연령대_50_직장_인구_수,남성연령대_60_이상_직장_인구_수,여성연령대_10_직장_인구_수,여성연령대_20_직장_인구_수,여성연령대_30_직장_인구_수,여성연령대_40_직장_인구_수,여성연령대_50_직장_인구_수,여성연령대_60_이상_직장_인구_수
count,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,...,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0,39975.0
mean,20220.096635,320.453959,353.995822,2.971897,11.176735,2.589829,9.794196,33.541864,15535880000.0,574952.1,...,27142.451857,26395.84733,19736.054484,9543.76035,223.790644,15057.239225,17365.139937,13473.701138,10014.440951,4391.736785
std,18.764856,624.817403,648.030094,2.385172,21.887316,2.001541,19.292389,71.25685,46790560000.0,1482439.0,...,39941.977062,36960.317249,24219.922727,9600.950081,399.06533,22371.870231,24534.250632,16564.421275,11715.352615,4832.596737
min,20191.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,7135.0,2.0,...,1169.0,1978.0,2704.0,2245.0,0.0,635.0,1180.0,1674.0,1663.0,257.0
25%,20203.0,94.0,105.0,1.3,2.0,1.2,2.0,0.0,1197476000.0,14630.0,...,5590.0,5799.0,5397.0,3928.0,0.0,3138.0,3795.0,4042.0,3856.0,1695.0
50%,20221.0,161.0,188.0,2.6,5.0,2.3,4.0,4.0,4114511000.0,71595.0,...,9123.0,10094.0,10122.0,5924.0,39.0,5068.0,6673.0,6807.0,5695.0,2678.0
75%,20234.0,313.0,357.0,4.2,12.0,3.6,10.0,27.0,12519530000.0,364112.5,...,24849.0,24451.0,18373.0,10207.0,209.0,21296.0,21114.0,14062.0,8866.0,4775.0
max,20252.0,17827.0,17845.0,43.6,773.0,33.3,554.0,688.0,1399137000000.0,27169730.0,...,204697.0,188865.0,116356.0,51339.0,1679.0,114671.0,124729.0,86333.0,63848.0,26816.0


In [62]:
# =========================================================
# LightGBM + SMOTE + Stratified K-Fold(5) + GridSearchCV
# Split: train 0.7 / val 0.1 / test 0.2 (all stratified)
# 평가: ACC/PREC/RECALL/F1/ROC-AUC/PR-AUC (VAL에서 F1 기준 임계값 -> TEST 적용)
# =========================================================

RANDOM_STATE = 42

# --- 타깃 생성: 상위 25% → 1 ---
assert '폐업_률' in df.columns, "'폐업_률' 컬럼이 필요합니다."
df = df.dropna(subset=['폐업_률']).copy()

cut_75 = df['폐업_률'].quantile(0.75)   # 75% 분위수
df['폐업률_등급'] = (df['폐업_률'] > cut_75).astype(int)

print(f"임계값(75% 분위수) = {cut_75:.4f}")
print("라벨 분포:", df['폐업률_등급'].value_counts().to_dict())

# 불필요 컬럼 제거
drop_cols = ['폐업_점포_수', '폐업_영업_개월_평균', '서울시_폐업_영업_개월_평균', '기준_년분기_코드']
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

임계값(75% 분위수) = 3.6000
라벨 분포: {0: 30153, 1: 9822}


In [63]:
# ---------- 1) 특징/타깃 분리 ----------
target = '폐업률_등급'
X = df.drop(columns=['폐업_률', target], errors='ignore')
y = df[target].astype(int)

In [64]:
# ---------- 2) 열 타입에 따라 전처리 파이프라인 구성 ----------
cat_cols = [c for c in X.columns if X[c].dtype.name in ['object', 'category']]
num_cols = [c for c in X.columns if c not in cat_cols]

# 범주형: OrdinalEncoder(unknown=-1), 수치형: 그대로
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_cols),
        ('num', 'passthrough', num_cols)
    ],
    remainder='drop'
)

In [65]:
# ---------- 3) 데이터 분할: train 0.7 / val 0.1 / test 0.2 ----------
# 1) 먼저 test 0.2 분리
X_trval, X_test, y_trval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
# 2) 남은 0.8 중에서 train 0.7 & val 0.1 비율로 분리 (→ 0.8 * 0.125 = 0.1)
X_train, X_val, y_train, y_val = train_test_split(
    X_trval, y_trval,
    test_size=0.125,  # 0.8 * 0.125 = 0.1
    stratify=y_trval, random_state=RANDOM_STATE
)

print("분포 확인:",
      "\n  Train:", y_train.value_counts().to_dict(),
      "\n  Val  :", y_val.value_counts().to_dict(),
      "\n  Test :", y_test.value_counts().to_dict()
)

분포 확인: 
  Train: {0: 21106, 1: 6876} 
  Val  : {0: 3016, 1: 982} 
  Test : {0: 6031, 1: 1964}


In [66]:
# ---------- 4) 파이프라인: Preprocess -> SMOTE -> LGBM ----------
# SMOTE는 파이프라인 내부에 있어 "훈련 폴드"에서만 적용됩니다.
pipeline = Pipeline(steps=[
    ('prep', preprocess),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('clf', lgb.LGBMClassifier(
        objective='binary',
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

In [67]:
# === (교체) 5) Stratified K-Fold(5) + Optuna (n_trials=50) ===
# 전처리: from imblearn.pipeline import Pipeline 로 구성된 pipeline이어야 합니다.
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.base import clone
import numpy as np

# ── 안정성 가드 ─────────────────────────────────────────────────────────
# 필수 객체가 존재하는지 확인
for name in ["pipeline", "X_train", "y_train", "X_val", "y_val", "RANDOM_STATE"]:
    assert name in globals(), f"{name} 이(가) 정의되어 있어야 합니다."

# pipeline 구성 점검
assert hasattr(pipeline, "named_steps"), "pipeline은 imblearn.pipeline.Pipeline 이어야 합니다."
assert "prep" in pipeline.named_steps, "pipeline.named_steps에 'prep' 스텝이 필요합니다."
assert "clf" in pipeline.named_steps, "pipeline.named_steps에 'clf' 스텝이 필요합니다."

# Stratified K-Fold(5)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# ── Optuna 목적함수: PR-AUC(average_precision) 최대화 ────────────────
def objective(trial):
    params = {
        "clf__n_estimators":    trial.suggest_int(   "clf__n_estimators", 800, 5000),
        "clf__learning_rate":   trial.suggest_float( "clf__learning_rate", 0.01, 0.1, log=True),
        "clf__num_leaves":      trial.suggest_int(   "clf__num_leaves", 31, 255),
        "clf__max_depth":       trial.suggest_int(   "clf__max_depth", -1, 12),
        "clf__min_child_samples": trial.suggest_int( "clf__min_child_samples", 20, 200),
        "clf__subsample":       trial.suggest_float( "clf__subsample", 0.6, 1.0),
        "clf__subsample_freq":  trial.suggest_int(   "clf__subsample_freq", 0, 1),
        "clf__colsample_bytree":trial.suggest_float( "clf__colsample_bytree", 0.6, 1.0),
        "clf__reg_lambda":      trial.suggest_float( "clf__reg_lambda", 0.0, 2.0),
        "clf__reg_alpha":       trial.suggest_float( "clf__reg_alpha", 0.0, 1.0),
        "clf__max_bin":         trial.suggest_int(   "clf__max_bin", 255, 511),
    }

    # 파이프라인( prep → SMOTE → LGBM )에 파라미터 주입
    pipe = pipeline.set_params(**params)

    # 교차검증으로 평균 PR-AUC 계산
    scores = cross_val_score(
        pipe, X_train, y_train,
        scoring="average_precision",
        cv=cv,
        n_jobs=-1
    )
    return float(np.mean(scores))

# ── Optuna 최적화 실행 (n_trials=50) ────────────────────────────────
sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=50)

print("\n=== Optuna 결과 ===")
print("Best score (CV-PR-AUC):", round(study.best_value, 4))
print("Best params:", study.best_params)

# ── 최적 파이프라인 생성 ────────────────────────────────────────────
best_pipe = pipeline.set_params(**study.best_params)

# ── 전처리기 선-fit 후, VAL 미리 변환 (NotFittedError 예방) ─────────────
prep = clone(best_pipe.named_steps["prep"]).fit(X_train, y_train)
X_val_t = prep.transform(X_val)

# ── 조기 종료용 리핏: 트리 수 크게 설정 후 VAL로 early_stopping ───────
best_pipe.set_params(clf__n_estimators=5000)

best_pipe.fit(
    X_train, y_train,
    # eval_set은 파이프라인을 거치지 않고 최종 모델로 직접 전달됨 → 전처리된 X_val_t 사용
    clf__eval_set=[(X_val_t, y_val)],
    clf__eval_metric='auc',
    clf__callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
)

best_clf = best_pipe.named_steps["clf"]
print("best_iteration_ (with early stopping on VAL):", getattr(best_clf, "best_iteration_", None))

from imblearn.pipeline import Pipeline as ImbPipeline
assert isinstance(pipeline, ImbPipeline), "pipeline은 imblearn.pipeline.Pipeline 여야 합니다."
y_train = y_train.astype(int); y_val = y_val.astype(int)  # LGBM 이진 라벨 안전 캐스팅

[I 2025-10-14 01:39:44,905] A new study created in memory with name: no-name-ae888aa5-ad34-4209-a0cc-f637be8020b8
[I 2025-10-14 01:41:10,186] Trial 0 finished with value: 0.6311779677297049 and parameters: {'clf__n_estimators': 2373, 'clf__learning_rate': 0.08927180304353628, 'clf__num_leaves': 195, 'clf__max_depth': 7, 'clf__min_child_samples': 48, 'clf__subsample': 0.662397808134481, 'clf__subsample_freq': 0, 'clf__colsample_bytree': 0.9464704583099741, 'clf__reg_lambda': 1.2022300234864176, 'clf__reg_alpha': 0.7080725777960455, 'clf__max_bin': 260}. Best is trial 0 with value: 0.6311779677297049.
[I 2025-10-14 01:41:36,475] Trial 1 finished with value: 0.6055930906225429 and parameters: {'clf__n_estimators': 4874, 'clf__learning_rate': 0.06798962421591129, 'clf__num_leaves': 78, 'clf__max_depth': 1, 'clf__min_child_samples': 53, 'clf__subsample': 0.7216968971838151, 'clf__subsample_freq': 1, 'clf__colsample_bytree': 0.7727780074568463, 'clf__reg_lambda': 0.5824582803960838, 'clf__re


=== Optuna 결과 ===
Best score (CV-PR-AUC): 0.6569
Best params: {'clf__n_estimators': 842, 'clf__learning_rate': 0.024132320155915276, 'clf__num_leaves': 55, 'clf__max_depth': 10, 'clf__min_child_samples': 124, 'clf__subsample': 0.9041517965320417, 'clf__subsample_freq': 0, 'clf__colsample_bytree': 0.9700505870963767, 'clf__reg_lambda': 0.9560541609924877, 'clf__reg_alpha': 0.5666380894826497, 'clf__max_bin': 427}
[LightGBM] [Info] Number of positive: 21106, number of negative: 21106
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55931
[LightGBM] [Info] Number of data points in the train set: 42212, number of used features: 132
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
best_iteration_ (with early stopping on VAL): 698


In [68]:
# ---------- 6) (선택) 조기 종료를 활용한 Refit: train→val ----------
# GridSearchCV는 폴드 내부에서 학습하므로 early_stopping을 쓰기 까다롭습니다.
# 최적 파이프라인으로 "train(0.7)"에 다시 적합하면서, val(0.1)을 eval_set으로 전달하여 조기 종료를 적용합니다.
# - 파이프라인의 마지막 스텝 이름이 'clf'이므로, fit params는 'clf__' prefix로 전달합니다.
best_pipe.set_params(clf__n_estimators=5000)  # 충분히 크게 두고 조기 종료에 맡기기
best_pipe.fit(
    X_train, y_train,
    clf__eval_set=[(best_pipe.named_steps['prep'].transform(X_val), y_val)],
    clf__eval_metric='auc',
    clf__callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
)

best_clf = best_pipe.named_steps['clf']
print("best_iteration_ (with early stopping on VAL):", getattr(best_clf, 'best_iteration_', None))

[LightGBM] [Info] Number of positive: 21106, number of negative: 21106
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55931
[LightGBM] [Info] Number of data points in the train set: 42212, number of used features: 132
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
best_iteration_ (with early stopping on VAL): 698


In [69]:
# ---------- 7) 임계값 튜닝: VAL에서 F1 최대점 ----------
from sklearn.metrics import precision_recall_curve

val_proba = best_pipe.predict_proba(X_val)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val, val_proba)
f1s = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
best_idx = int(np.nanargmax(f1s))
best_thr = float(thresholds[best_idx])
print(f"Best threshold from VAL(F1 max): {best_thr:.4f}")

def eval_all(y_true, proba, thr):
    pred = (proba >= thr).astype(int)
    return {
        'ACC': round(accuracy_score(y_true, pred), 4),
        'PREC': round(precision_score(y_true, pred, zero_division=0), 4),
        'RECALL': round(recall_score(y_true, pred, zero_division=0), 4),
        'F1': round(f1_score(y_true, pred, zero_division=0), 4),
        'ROC_AUC': round(roc_auc_score(y_true, proba), 4),
        'PR_AUC': round(average_precision_score(y_true, proba), 4),
        'CM': confusion_matrix(y_true, pred)
    }

Best threshold from VAL(F1 max): 0.3950


In [70]:
# ---------- 8) VAL / TEST 평가 ----------
val_metrics = eval_all(y_val, val_proba, best_thr)
test_proba = best_pipe.predict_proba(X_test)[:, 1]
test_metrics = eval_all(y_test, test_proba, best_thr)

print("\n=== VAL Metrics (thr from VAL) ===")
for k, v in val_metrics.items():
    print(f"{k}: {v}")

print("\n=== TEST Metrics (thr from VAL) ===")
for k, v in test_metrics.items():
    print(f"{k}: {v}")


=== VAL Metrics (thr from VAL) ===
ACC: 0.7939
PREC: 0.5712
RECALL: 0.6456
F1: 0.6061
ROC_AUC: 0.8313
PR_AUC: 0.6397
CM: [[2540  476]
 [ 348  634]]

=== TEST Metrics (thr from VAL) ===
ACC: 0.792
PREC: 0.5669
RECALL: 0.6497
F1: 0.6055
ROC_AUC: 0.8362
PR_AUC: 0.6473
CM: [[5056  975]
 [ 688 1276]]


In [72]:
# ---------- 9) 중요도(상위 15) ----------
# ColumnTransformer + OrdinalEncoder를 썼으므로, 최종 피처 순서를 복원합니다.
# (주의) 인코딩 후의 컬럼명은 cat/num 합쳐진 순서이므로 간단히 이름을 재구성합니다.
enc_cat_names = [f"[CAT]{c}" for c in cat_cols]
feat_names = enc_cat_names + num_cols
importances = pd.Series(best_clf.feature_importances_, index=feat_names).sort_values(ascending=False)
print("\nTop 15 Feature Importances:")
print(importances.head(15))


Top 15 Feature Importances:
[CAT]서비스_업종_코드_명    2932
서울시_운영_영업_개월_평균     1839
개업_점포_수             1727
프랜차이즈_점포_수          1372
개업_률                1291
점포_수                1001
유사_업종_점포_수           876
시간대_06_11_매출_금액      732
전체임대료                709
시간대_건수~06_매출_건수      637
연령대_60_이상_매출_건수      605
유흥_지출_총금액            564
연령대_60_이상_매출_금액      551
시간대_00_06_매출_금액      550
시간대_건수~24_매출_건수      524
dtype: int32
