### 라이브러리 import

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

from joblib import dump

random_state=51

### 데이터 로드

In [12]:
# 데이터 로드
bank_df = pd.read_csv('../../data/BankChurners.csv')
bank_df

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0.000093,0.999910
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,0.000057,0.999940
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.000,0.000021,0.999980
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.760,0.000134,0.999870
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.500,0.000,0.000022,0.999980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,...,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462,0.000191,0.999810
10123,710638233,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,...,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511,0.995270,0.004729
10124,716506083,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,...,5409.0,0,5409.0,0.819,10291,60,0.818,0.000,0.997880,0.002118
10125,717406983,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,...,5281.0,0,5281.0,0.535,8395,62,0.722,0.000,0.996710,0.003294


### 전처리

In [13]:
# 필요없는 칼럼 제거
drop_columns = ['CLIENTNUM',
                'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
                                
                # 'Total_Trans_Amt',
                'Total_Trans_Ct',
                'Total_Relationship_Count',
                'Total_Revolving_Bal',                
                'Total_Amt_Chng_Q4_Q1',
                'Total_Ct_Chng_Q4_Q1'
]
bank_df = bank_df.drop(columns=drop_columns)

# bank_df = bank_df.loc[:, ~bank_df.columns.str.startswith("Total_")]

# 이탈여부 값 변환
bank_df['Attrition_Flag'] = bank_df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})

### 원핫 인코딩

In [14]:
# 범주형 칼럼
cate_columns = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

# 원핫 인코딩
encoder = OneHotEncoder()
encoded_cate = encoder.fit_transform(bank_df[cate_columns]).toarray()
encoded_cate_df = pd.DataFrame(data=encoded_cate, columns=encoder.get_feature_names_out(cate_columns))

# 원래 데이터에서 범주형 칼럼 제거
bank_df = bank_df.drop(columns=cate_columns)

# 인코딩된 데이터와 결합
bank_df = pd.concat([bank_df, encoded_cate_df], axis=1)

# display(bank_df)

### 학습 & 평가 데이터 분리

In [None]:
X = bank_df.drop(columns=['Attrition_Flag'])
y = bank_df['Attrition_Flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

    #####################

    # 스케일링
    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 모델 선정
    models = {
        # "Logistic Regression": LogisticRegression(random_state=random_state),
        # "Random Forest": RandomForestClassifier(random_state=random_state),
        "XGBoost": XGBClassifier(random_state=random_state),
        # "LightGBM" : LGBMClassifier(random_state=random_state)
    }

    # 모델 학습 및 평가
    for name, model in models.items():
        model.fit(X_train, y_train)    
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # ROC-AUC 계산을 위한 확률값
        
        # 평가 지표 출력
        print(f"{name} ========== Default")
        # print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
        # print(f"Precision : {precision_score(y_test, y_pred):.4f}")
        # print(f"Recall : {recall_score(y_test, y_pred):.4f}")
        # print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")
        # print(f"ROC-AUC : {roc_auc_score(y_test, y_pred_proba):.4f}")
        print(f"\n>>>> Classification Report\n{classification_report(y_test, y_pred)}")

        # 특성 중요도 확인
        # if (name == 'Logistic Regression'):
        #     coef_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_[0]})
        #     print("\n>>>> Feature Coefficients\n", coef_importance.sort_values(by='Coefficient', ascending=False))
        # else:
        # feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
        # print("\n>>>> Feature Importance\n", feature_importance.sort_values(by='Importance', ascending=False))

        print("\n" + "="*30 + "\n")

    # 오버샘플링 적용 (평균기준)
    # 복제, 생성, 
    smote = RandomOverSampler(random_state=random_state)
    X_train_resample, y_train_resample = smote.fit_resample(X_train, y_train)






    # 모델 선정
    models = {
        # "Logistic Regression": LogisticRegression(random_state=random_state),
        # "Random Forest": RandomForestClassifier(random_state=random_state),
        "XGBoost": XGBClassifier(random_state=random_state),
        # "LightGBM" : LGBMClassifier(random_state=random_state)
    }

    # 파라미터 설정
    param_grids = {
        # "Logistic Regression": {
        #     'C': [0.01, 0.1, 1, 10],              # 규제 강도
        #     'penalty': ['l1', 'l2'],               # 규제 유형
        #     'solver': ['liblinear']                # l1과 l2 모두 지원하는 solver
        # },
        "Random Forest": {
            'n_estimators': [50, 100, 200],                 # 트리 개수
            'max_depth': [3, 5],                          # 최대 깊이
            'min_samples_split': [2, 5, 10],                # 노드 분할 최소 샘플 : 값이 클수록 트리가 덜 복잡해져 과적합을 줄이는 효과
            'min_samples_leaf': [1, 2, 4],                  # 리프 노드 최소 샘플 : 값이 크면 모델이 단순 (클래스 불균형이 심하면 크게 설정)
            'max_features': ['sqrt', 'log2', 0.3, 0.5],     # 특성 샘플링 비율 : 각 트리에서 사용할 특성의 최대 개수 (무작위성을 높여 모델의 다양성을 증가)
            'class_weight' : ['balanced']                   # 클래스 가중치 : 클래스 불균형을 해결하기 위해 클래스에 가중치를 부여
        },
        "XGBoost": {
            'n_estimators': [50, 100, 200],                 # 트리 개수
            'max_depth': [1, 3, 5, 10],                     # 최대 깊이 : XGBoost는 깊이가 얕아도 잘 작동한다!
            'learning_rate': [0.01, 0.05, 0.1, 0.3],        # 학습률
            'subsample': [0.6, 0.8, 1.0],                   # 각 트리 학습에 사용할 데이터 샘플 비율 : 값이 낮을수록 과적합 방지
        },
        "LightGBM": {
            'n_estimators': [50, 100, 200],
            'max_depth': [2, 5, 10],
            'learning_rate': [0.01, 0.05, 0.1, 0.3],
            'num_leaves': [20, 31, 50],                     # 한 트리의 최대 리프 노드 수 : 2^(max_depth)보다 작아야 과적합을 줄이는 데 유리
            'reg_lambda' : [0.1, 1.0]                       # L2 규제 : 과적합을 방지하고 모델을 안정화
        },
    }

    # 모델 학습 및 평가
    for name, model in models.items():
        # 파라미터 학습
        print(f"\nGridSearchCV Search Best Params for {name}..............................")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, scoring='f1', n_jobs=-1, verbose=1)
        grid_search.fit(X_train_resample, y_train_resample)
        
        # 최적 모델 선정
        best_model = grid_search.best_estimator_    
        print(f">>>> Best Parameters for {name}\n{grid_search.best_params_}")
        
        # 교차 검증
        print(f"Cross Val Score : {cross_val_score(best_model, X_train_resample, y_train_resample, scoring='f1', cv=5)}")

        # 예측
        y_pred = best_model.predict(X_test)
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # ROC-AUC 계산을 위한 확률값
        
        # 평가 지표 출력
        print(f"{name} ========== Optimization Param")
        # print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
        # print(f"Precision : {precision_score(y_test, y_pred):.4f}")
        # print(f"Recall : {recall_score(y_test, y_pred):.4f}")
        # print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")
        # print(f"ROC-AUC : {roc_auc_score(y_test, y_pred_proba):.4f}")
        print(f"\n>>>> Classification Report\n{classification_report(y_test, y_pred)}")

        # 특성 중요도 확인
        # if (name == "Logistic Regression"):
        #     coef_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': best_model.coef_[0]})
        #     print("\n>>>> Feature Coefficients\n", coef_importance.sort_values(by='Coefficient', ascending=False))
        # else:
        # feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': best_model.feature_importances_})
        # print("\n>>>> Feature Importance\n", feature_importance.sort_values(by='Importance', ascending=False))

        print("\n" + "="*100 + "\n")





 random_state = 0

>>>> Classification Report
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1701
           1       0.82      0.77      0.79       325

    accuracy                           0.94      2026
   macro avg       0.89      0.87      0.88      2026
weighted avg       0.93      0.94      0.94      2026




GridSearchCV Search Best Params for XGBoost..............................
Fitting 5 folds for each of 144 candidates, totalling 720 fits
>>>> Best Parameters for XGBoost
{'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 200, 'subsample': 1.0}
Cross Val Score : [0.97736256 0.97736256 0.97560976 0.97769784 0.97736256]

>>>> Classification Report
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1701
           1       0.83      0.80      0.82       325

    accuracy                           0.94      2026
   macro avg       0.90      0.88      0.89      2

### 스케일링

In [16]:
# # 스케일링
# scaler = StandardScaler()

# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

### 모델 학습 및 평가

In [17]:
# # 모델 선정
# models = {
#     # "Logistic Regression": LogisticRegression(random_state=random_state),
#     # "Random Forest": RandomForestClassifier(random_state=random_state),
#     "XGBoost": XGBClassifier(random_state=random_state),
#     # "LightGBM" : LGBMClassifier(random_state=random_state)
# }

# # 모델 학습 및 평가
# for name, model in models.items():
#     model.fit(X_train, y_train)    
#     y_pred = model.predict(X_test)
#     y_pred_proba = model.predict_proba(X_test)[:, 1]  # ROC-AUC 계산을 위한 확률값
    
#     # 평가 지표 출력
#     print(f"{name} ==========")
#     print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
#     print(f"Precision : {precision_score(y_test, y_pred):.4f}")
#     print(f"Recall : {recall_score(y_test, y_pred):.4f}")
#     print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")
#     print(f"ROC-AUC : {roc_auc_score(y_test, y_pred_proba):.4f}")
#     print(f"\n>>>> Classification Report\n{classification_report(y_test, y_pred)}")

#     # 특성 중요도 확인
#     # if (name == 'Logistic Regression'):
#     #     coef_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_[0]})
#     #     print("\n>>>> Feature Coefficients\n", coef_importance.sort_values(by='Coefficient', ascending=False))
#     # else:
#     feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
#     print("\n>>>> Feature Importance\n", feature_importance.sort_values(by='Importance', ascending=False))

#     print("\n" + "="*100 + "\n")

- ### 오버 샘플링 및 하이퍼 파라미터 튜닝

In [18]:
# # 오버샘플링 적용 (평균기준)
# # 복제, 생성, 
# smote = RandomOverSampler(random_state=random_state)
# X_train_resample, y_train_resample = smote.fit_resample(X_train, y_train)

# # 모델 선정
# models = {
#     # "Logistic Regression": LogisticRegression(random_state=random_state),
#     # "Random Forest": RandomForestClassifier(random_state=random_state),
#     "XGBoost": XGBClassifier(random_state=random_state),
#     # "LightGBM" : LGBMClassifier(random_state=random_state)
# }

# # 파라미터 설정
# param_grids = {
#     # "Logistic Regression": {
#     #     'C': [0.01, 0.1, 1, 10],              # 규제 강도
#     #     'penalty': ['l1', 'l2'],               # 규제 유형
#     #     'solver': ['liblinear']                # l1과 l2 모두 지원하는 solver
#     # },
#     "Random Forest": {
#         'n_estimators': [50, 100, 200],                 # 트리 개수
#         'max_depth': [3, 5],                          # 최대 깊이
#         'min_samples_split': [2, 5, 10],                # 노드 분할 최소 샘플 : 값이 클수록 트리가 덜 복잡해져 과적합을 줄이는 효과
#         'min_samples_leaf': [1, 2, 4],                  # 리프 노드 최소 샘플 : 값이 크면 모델이 단순 (클래스 불균형이 심하면 크게 설정)
#         'max_features': ['sqrt', 'log2', 0.3, 0.5],     # 특성 샘플링 비율 : 각 트리에서 사용할 특성의 최대 개수 (무작위성을 높여 모델의 다양성을 증가)
#         'class_weight' : ['balanced']                   # 클래스 가중치 : 클래스 불균형을 해결하기 위해 클래스에 가중치를 부여
#     },
#     "XGBoost": {
#         'n_estimators': [50, 100, 200],                 # 트리 개수
#         'max_depth': [1, 3, 5, 10],                            # 최대 깊이 : XGBoost는 깊이가 얕아도 잘 작동한다!
#         'learning_rate': [0.01, 0.05, 0.1, 0.3],        # 학습률
#         'subsample': [0.6, 0.8, 1.0],                   # 각 트리 학습에 사용할 데이터 샘플 비율 : 값이 낮을수록 과적합 방지
#     },
#     "LightGBM": {
#         'n_estimators': [50, 100, 200],
#         'max_depth': [2, 5, 10],
#         'learning_rate': [0.01, 0.05, 0.1, 0.3],
#         'num_leaves': [20, 31, 50],                     # 한 트리의 최대 리프 노드 수 : 2^(max_depth)보다 작아야 과적합을 줄이는 데 유리
#         'reg_lambda' : [0.1, 1.0]                       # L2 규제 : 과적합을 방지하고 모델을 안정화
#     },
# }

# # 모델 학습 및 평가
# for name, model in models.items():
#     # 파라미터 학습
#     print(f"\nGridSearchCV Search Best Params for {name}..............................")
#     grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, scoring='f1', n_jobs=-1, verbose=1)
#     grid_search.fit(X_train_resample, y_train_resample)
    
#     # 최적 모델 선정
#     best_model = grid_search.best_estimator_    
#     print(f">>>> Best Parameters for {name}\n{grid_search.best_params_}")
    
#     # 교차 검증
#     print(f"Cross Val Score : {cross_val_score(best_model, X_train_resample, y_train_resample, scoring='f1', cv=5)}")

#     # 예측
#     y_pred = best_model.predict(X_test)
#     y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # ROC-AUC 계산을 위한 확률값
    
#     # 평가 지표 출력
#     print(f"{name} ==========")
#     print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
#     print(f"Precision : {precision_score(y_test, y_pred):.4f}")
#     print(f"Recall : {recall_score(y_test, y_pred):.4f}")
#     print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")
#     print(f"ROC-AUC : {roc_auc_score(y_test, y_pred_proba):.4f}")
#     print(f"\n>>>> Classification Report\n{classification_report(y_test, y_pred)}")

#     # 특성 중요도 확인
#     # if (name == "Logistic Regression"):
#     #     coef_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': best_model.coef_[0]})
#     #     print("\n>>>> Feature Coefficients\n", coef_importance.sort_values(by='Coefficient', ascending=False))
#     # else:
#     feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': best_model.feature_importances_})
#     print("\n>>>> Feature Importance\n", feature_importance.sort_values(by='Importance', ascending=False))

#     print("\n" + "="*100 + "\n")

### 최종 모델 선정 및 저장

In [19]:
# dump(model, "model.joblib")