In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
# Python이 이미 한 번 임포트한 모듈을 캐시해두기 때문에 리로드 필요
import importlib
import machine_learning_02
importlib.reload(machine_learning_02)
from machine_learning_02 import data_load, data_split

#### 1차적으로 best5 모델로 생각되는 모델 돌리기

In [3]:
class Models:
    """Fit several classifiers on one train/test split and tabulate their metrics.

    Each model in ``self.models`` is fitted on ``(X_train, y_train)`` and scored
    on ``(X_test, y_test)``. Per-model metrics are stored in ``self.results``
    and can be retrieved as a ranked table with :meth:`get_results`.

    Attributes:
        models: Mapping of display name -> unfitted estimator.
        results: Mapping of display name -> dict of metric name -> value,
            filled by :meth:`fit_and_evaluate`.
    """

    # Metric names in the order they appear as columns of the results table.
    METRIC_COLUMNS = ["Precision", "Recall", "F1-score", "Accuracy", "ROC AUC"]

    def __init__(self, X_train, X_test, y_train, y_test, models=None):
        """Store the data split and the estimators to compare.

        Args:
            X_train: Training features, passed as-is to each model's ``fit``.
            X_test: Test features, passed as-is to ``predict``/``predict_proba``.
            y_train: Training labels.
            y_test: Test labels.
            models: Optional mapping of name -> estimator exposing ``fit``,
                ``predict`` and ``predict_proba``. When omitted, the default
                five-model set below is used.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        if models is None:
            models = {
                "Logistic Regression": LogisticRegression(random_state=42),
                "Random Forest": RandomForestClassifier(random_state=42),
                # probability=True is required for SVC to expose predict_proba.
                "SVM": SVC(probability=True, random_state=42),
                "XGBoost": XGBClassifier(random_state=42),
                "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
            }
        # Copy so later changes to the caller's mapping do not affect this run.
        self.models = dict(models)
        self.results = {}

    def fit_and_evaluate(self):
        """Fit every model and record its test-set metrics in ``self.results``.

        Precision, recall and F1 are read from the ``"1"`` entry of the
        classification report, and the ROC AUC uses the second column of
        ``predict_proba``. NOTE(review): this assumes binary labels whose
        positive class is rendered as ``"1"`` in the report (e.g. integer
        0/1 labels); other label encodings would raise ``KeyError`` here.
        """
        for name, model in self.models.items():
            model.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)
            y_prob = model.predict_proba(self.X_test)[:, 1]

            report = classification_report(self.y_test, y_pred, output_dict=True)
            auc_score = roc_auc_score(self.y_test, y_prob)

            positive = report["1"]
            self.results[name] = {
                "Precision": positive["precision"],
                "Recall": positive["recall"],
                "F1-score": positive["f1-score"],
                "Accuracy": report["accuracy"],
                "ROC AUC": auc_score
            }

    def get_results(self):
        """Return the metrics as a DataFrame, one row per model, best ROC AUC first.

        Returns:
            pandas.DataFrame indexed by model name with the columns listed in
            ``METRIC_COLUMNS``. If nothing has been evaluated yet, an empty
            frame with those columns is returned instead of failing on the
            sort key.
        """
        if not self.results:
            # An empty dict yields a frame without a "ROC AUC" column, so
            # sort_values would raise KeyError; return an empty table instead.
            return pd.DataFrame(columns=self.METRIC_COLUMNS)
        return pd.DataFrame(self.results).T.sort_values("ROC AUC", ascending=False)

# Load the data (data_load comes from the local machine_learning_02 module)
data_df = data_load()

# Prepare the train/test split
# Alternative call using scaler_nm="standard", left disabled:
# X_train, X_test, y_train, y_test = data_split(data_df, scaler_nm="standard")
X_train, X_test, y_train, y_test = data_split(data_df, scaler_nm='quantile')

# Train and evaluate the models
model_runner = Models(X_train, X_test, y_train, y_test)
model_runner.fit_and_evaluate()

# Print the results (one row per model, sorted by ROC AUC, highest first)
results_df = model_runner.get_results()
print(results_df)

                     Precision    Recall  F1-score  Accuracy   ROC AUC
CatBoost              0.861979  0.822566  0.841811  0.828692  0.906262
XGBoost               0.862845  0.819757  0.840749  0.827914  0.904124
Random Forest         0.850066  0.800718  0.824654  0.811311  0.890397
SVM                   0.818511  0.739700  0.777113  0.764874  0.842538
Logistic Regression   0.769135  0.774657  0.771886  0.746282  0.825207


#### 결과: quantile 스케일링을 적용하고 remaining_contract 칼럼을 제거하지 않은 상태에서는 CatBoost와 XGBoost의 평가지표가 가장 높음
#### 그러나 과적합을 방지하기 위해 remaining_contract 칼럼을 제거한 뒤 다시 학습시켜 볼 필요가 있어 보임