In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/Preprocessed_Bank_Customer_Churn_Prediction.csv')

# # 사용할 피처 선택 (customer_id 제거)
features = ['credit_score', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary']

# def country_encoding(country):
#     if country == 0:
#         return 'France'
#     elif country == 1:
#         return 'Germany'
#     else:
#         return 'Spain'
    
# df['country'] = df['country'].apply(country_encoding)

In [41]:
from sklearn.metrics import roc_curve, auc

def auc_plot(model, X_test, y_test):
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.4f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [42]:
# 'balance'와 'estimated_salary' 컬럼을 skewed_columns 리스트에 추가
skewed_columns = ['age', 'balance', 'estimated_salary', 'credit_score']  # 'balance'와 'estimated_salary' 추가
# skewed_columns = ['balance', 'estimated_salary', 'credit_score']  # 'balance'와 'estimated_salary' 추가
# skewed_columns = ['age', 'balance', 'estimated_salary']  # 'balance'와 'estimated_salary' 추가
skewed_columns = ['age']  # 'balance'와 'estimated_salary' 추가

# log1p 변환 함수
def log1p_transform_columns(df, columns):
    for column in columns:
        df[column] = np.log1p(df[column])  # np.log1p는 log(x+1)
    return df

# log1p 변환 수행
df = log1p_transform_columns(df, skewed_columns)

df.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,0,1,3.7612,2,0.0,1,1,1,101348.88,1
1,608,2,1,3.73767,1,83807.86,1,0,1,112542.58,0
2,502,0,1,3.7612,8,159660.8,3,1,0,113931.57,1
3,699,0,1,3.688879,1,0.0,2,0,0,93826.63,0
4,850,2,1,3.78419,2,125510.82,1,1,1,79084.1,0


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 정규화 (Normalization) - 0과 1 사이로 변환
scaler = MinMaxScaler()

X = df[features]
y = df['churn']

X.iloc[:, :-1] = scaler.fit_transform(X.iloc[:, :-1])

print(X.head())

# 학습 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

   credit_score       age  tenure   balance  products_number  credit_card  \
0         0.538  0.514281     0.2  0.000000         0.000000            1   
1         0.516  0.499465     0.1  0.334031         0.000000            0   
2         0.304  0.514281     0.8  0.636357         0.666667            1   
3         0.698  0.468744     0.1  0.000000         0.333333            0   
4         1.000  0.528757     0.2  0.500246         0.000000            1   

   active_member  estimated_salary  
0              1         101348.88  
1              1         112542.58  
2              0         113931.57  
3              0          93826.63  
4              1          79084.10  


  X.iloc[:, :-1] = scaler.fit_transform(X.iloc[:, :-1])
  X.iloc[:, :-1] = scaler.fit_transform(X.iloc[:, :-1])
  X.iloc[:, :-1] = scaler.fit_transform(X.iloc[:, :-1])


In [57]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# 앙상블 적용
from sklearn.ensemble import StackingClassifier
ada_best_params = {}
ada = AdaBoostClassifier(random_state=42, **ada_best_params)
# VotingClassifier 정의
cat_best_params = {'depth': 4, 'iterations': 100, 'learning_rate': 0.1}
cat = CatBoostClassifier(random_state=42, **cat_best_params)
# 최적 Recall 하이퍼파라미터 설정
xgb_best_params = {
    'colsample_bytree': 0.998835926756326,
    'learning_rate': 0.05918736154295684,
    'max_depth': 4,
    'min_child_weight': 4,
    'n_estimators': 152,
    'subsample': 0.7171956369176462,
    'scale_pos_weight': 1.9,  # Recall 최적화된 비율
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
xgb = XGBClassifier(random_state=42, **xgb_best_params)
rf_best_params = {'n_estimators':200, 'max_depth':15, 'min_samples_leaf':2, 'min_samples_split':5}
rf = RandomForestClassifier(random_state=42, **rf_best_params)

estimators = [
    ('cat', cat),
    ('xgb', xgb),
    ('rf', rf)
]

stacking = StackingClassifier(estimators=estimators, final_estimator=CatBoostClassifier(random_state=42, **cat_best_params))

stacking.fit(X_train, y_train)

0:	learn: 0.6464634	total: 2.3ms	remaining: 228ms
1:	learn: 0.6053348	total: 4.28ms	remaining: 210ms
2:	learn: 0.5745701	total: 5.37ms	remaining: 174ms
3:	learn: 0.5467781	total: 6.34ms	remaining: 152ms
4:	learn: 0.5238689	total: 7.22ms	remaining: 137ms
5:	learn: 0.5037690	total: 8.07ms	remaining: 126ms
6:	learn: 0.4861531	total: 9.6ms	remaining: 128ms
7:	learn: 0.4710029	total: 10.9ms	remaining: 125ms
8:	learn: 0.4580213	total: 12.2ms	remaining: 123ms
9:	learn: 0.4468045	total: 64.9ms	remaining: 584ms
10:	learn: 0.4368222	total: 66.6ms	remaining: 538ms
11:	learn: 0.4281312	total: 67.5ms	remaining: 495ms
12:	learn: 0.4208029	total: 80.6ms	remaining: 540ms
13:	learn: 0.4155846	total: 82.1ms	remaining: 505ms
14:	learn: 0.4089106	total: 83.4ms	remaining: 473ms
15:	learn: 0.4035765	total: 96ms	remaining: 504ms
16:	learn: 0.3992297	total: 99ms	remaining: 483ms
17:	learn: 0.3945264	total: 100ms	remaining: 457ms
18:	learn: 0.3909941	total: 102ms	remaining: 433ms
19:	learn: 0.3877919	total: 10

In [58]:
from sklearn.metrics import classification_report

y_pred = stacking.predict(X_test)

print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8799    0.9627    0.9194      2389
           1     0.7694    0.4861    0.5958       611

    accuracy                         0.8657      3000
   macro avg     0.8247    0.7244    0.7576      3000
weighted avg     0.8574    0.8657    0.8535      3000

