In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Load the dataset.
df = pd.read_csv("Final_data.csv", encoding='utf-8')

# Closure-rate cut-off at or above which a row is labelled positive (adjustable).
CLOSURE_RATE_THRESHOLD = 0.03
# Year held out for evaluation; all earlier years are used for training.
TEST_YEAR = 2024

# Rows with a missing closure rate have no usable label: `NaN >= threshold`
# evaluates to False, so they would otherwise be silently counted as class 0.
n_missing_target = int(df['폐업_률'].isna().sum())
if n_missing_target:
    print(f"⚠️ 폐업_률 결측 {n_missing_target}행 제외")
    df = df.dropna(subset=['폐업_률']).copy()

# Extract the year: the first four characters of the year-quarter code.
df['년도'] = df['기준_년분기_코드'].astype(str).str[:4].astype(int)

# Binarize the target.
df['폐업_여부'] = (df['폐업_률'] >= CLOSURE_RATE_THRESHOLD).astype(int)

# Check the class distribution.
print("✅ [타겟 클래스 비율]")
print(df['폐업_여부'].value_counts(normalize=True))

# Time-based train/test split.
train_df = df[df['년도'] < TEST_YEAR].copy()
test_df = df[df['년도'] == TEST_YEAR].copy()

# Fail before any model is trained rather than deep inside fit/predict.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"학습/테스트 분리 결과가 비어 있습니다 (train={len(train_df)}행, test={len(test_df)}행)"
    )

# Only the raw rate and the derived label are removed; every other column
# (including '년도' and '기준_년분기_코드') stays in the feature matrix.
# NOTE(review): confirm no remaining column is derived from the closure count,
# which would leak the target.
drop_cols = ['폐업_률', '폐업_여부']
X_train = train_df.drop(columns=drop_cols)
y_train = train_df['폐업_여부']
X_test = test_df.drop(columns=drop_cols)
y_test = test_df['폐업_여부']

# 3. Encode string columns
# Each object-dtype column gets its own LabelEncoder fitted on the training
# rows only; the fitted encoders are kept in `label_encoders` keyed by column.
label_encoders = {}
for col in X_train.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # LabelEncoder.transform raises ValueError on labels it was not fitted on,
    # which happens as soon as the test year contains a new category. Map test
    # values through the fitted classes instead (a label's code is its position
    # in `classes_`, the same code transform() would give) and send unseen
    # categories to -1.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_codes = X_test[col].astype(str).map(code_of)
    n_unseen = int(test_codes.isna().sum())
    if n_unseen:
        print(f"⚠️ {col}: 학습 데이터에 없는 범주 {n_unseen}건 → -1로 인코딩")
    X_test[col] = test_codes.fillna(-1).astype(int)

    label_encoders[col] = le

# 4. Define the models
# Registry of display name -> unfitted classifier. Insertion order is the
# order in which the models are trained and evaluated.
# NOTE(review): only Random Forest and Decision Tree are given
# class_weight="balanced"; the three boosters are not re-weighted, so the
# comparison is not like-for-like on an imbalanced target — confirm intended.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm against the installed version.
_SEED = 42
models = {}
models["Random Forest"] = RandomForestClassifier(
    class_weight="balanced",
    random_state=_SEED,
)
models["XGBoost"] = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=_SEED,
)
models["LightGBM"] = LGBMClassifier(random_state=_SEED)
models["CatBoost"] = CatBoostClassifier(random_state=_SEED, verbose=0)
models["Decision Tree"] = DecisionTreeClassifier(
    class_weight="balanced",
    random_state=_SEED,
)

# 5. Collect per-model performance
# Each model is fitted on the training split and scored on the test split.
# One row per model is appended to `results` as
# [name, accuracy, ROC-AUC, recall, precision, F1].
results = []
for name, model in models.items():
    print(f"▶ {name} 학습 중...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC is computed from column 1 of predict_proba; models without
    # predict_proba get NaN instead.
    if hasattr(model, "predict_proba"):
        positive_proba = model.predict_proba(X_test)[:, 1]
        roc = roc_auc_score(y_test, positive_proba)
    else:
        roc = np.nan

    # The remaining metrics all compare the hard predictions with the truth.
    acc, recall, precision, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, recall_score, precision_score, f1_score)
    )

    results.append([name, acc, roc, recall, precision, f1])

# 6. Save and print the results
# Tabulate the collected rows, rank by F1 (best first), write to CSV, and
# echo the table.
# NOTE(review): the recall/precision headers read "이탈고객" (churn customer)
# although the target here is business closure — looks like a leftover from
# another project; confirm before renaming, since the CSV header would change.
result_columns = ["모델", "정확도", "ROC-AUC", "이탈고객 Recall", "이탈고객 Precision", "F1-score"]
df_results = (
    pd.DataFrame(results, columns=result_columns)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)
df_results.to_csv("model_comparison_results.csv", index=False)
print("✅ 성능 비교 결과 저장 완료 → model_comparison_results.csv")
print(df_results)
