In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 

# Load the raw credit-card churn table.
df = pd.read_csv('../data/raw/BankChurners.csv')

# Binary target: 1 when the customer is flagged as attrited, 0 otherwise.
is_attrited = df['Attrition_Flag'] == 'Attrited Customer'
df['Attrition_Binary'] = is_attrited.astype(int)

# Columns to remove before modelling: the client identifier, the original
# text label (now encoded in Attrition_Binary) and the two
# Naive_Bayes_Classifier_* columns shipped with the file.
unwanted_cols = [
    'CLIENTNUM',
    'Attrition_Flag',
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
]
# Only drop the ones that are actually present so the cell never raises on a
# file that lacks some of them.
drop_cols = [name for name in unwanted_cols if name in df.columns]
df = df.drop(columns=drop_cols)

print(df.shape)

(10127, 20)


In [22]:
# 2. Create new features

# Average amount per transaction. The +1 in the denominator keeps the ratio
# finite when a customer has zero transactions.
amount_per_transaction = df["Total_Trans_Amt"] / (df["Total_Trans_Ct"] + 1)

# Credit limit relative to the length of the relationship (months on book + 1).
limit_per_month_on_book = df["Credit_Limit"] / (df["Months_on_book"] + 1)

df["Avg_Transaction_Amount"] = amount_per_transaction
df["Credit_Utilization"] = limit_per_month_on_book

# Disabled experiment: contacts per inactive month (intensity of bank outreach).
# df['Contact_Per_Inactive'] = df['Contacts_Count_12_mon'] / (df['Months_Inactive_12_mon'] + 1)

print(df.shape)

(10127, 22)


In [None]:
# 3. Group columns by type and define the preprocessing pipelines

# Target vector and feature matrix.
y = df['Attrition_Binary']
X = df.drop(columns='Attrition_Binary')

# Column groups are derived from dtypes: numeric columns vs. object (string) columns.
numerical_cols = list(X.select_dtypes(include=np.number).columns)
categorical_cols = list(X.select_dtypes(include='object').columns)

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: dense one-hot encoding; categories not seen during fit
# are ignored at transform time instead of raising.
categorical_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

In [None]:
# 4. Split the data and apply preprocessing

# Stratified 80/20 split so both partitions keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessor on the training partition only, then reuse the fitted
# statistics on the test partition.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

feature_names = preprocessor.get_feature_names_out()

# Keep the original row labels: without index=..., the frames get a fresh
# RangeIndex while y_train / y_test keep the labels from df, so any
# index-aligned operation (concat, join, boolean masks built from y) would
# pair features with the wrong targets.
X_train_final = pd.DataFrame(
    X_train_processed, columns=feature_names, index=X_train.index
)
X_test_final = pd.DataFrame(
    X_test_processed, columns=feature_names, index=X_test.index
)

print("4단계 완료: 전처리 적용 후 Feature 개수:", X_train_final.shape[1])
print("\n전처리된 데이터 Feature 목록 (일부):", X_train_final.columns.tolist()[:10])

4단계 완료: 전처리 적용 후 Feature 개수: 39

전처리된 데이터 Feature 목록 (일부): ['num__Customer_Age', 'num__Dependent_count', 'num__Months_on_book', 'num__Total_Relationship_Count', 'num__Months_Inactive_12_mon', 'num__Contacts_Count_12_mon', 'num__Credit_Limit', 'num__Total_Revolving_Bal', 'num__Avg_Open_To_Buy', 'num__Total_Amt_Chng_Q4_Q1']


In [None]:
from sklearn.svm import SVC

# Baseline: preprocessing followed by an SVC with default settings.
clf = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('model', SVC()),
    ]
)

# Left as the last expression so the notebook renders the fitted pipeline.
clf.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


# 3. 하이퍼 파라미터 튜닝

In [41]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Shared 5-fold stratified splitter (shuffled, fixed seed) used by the grid searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space for the random forest; keys use the '<step>__<param>' form so
# they address the 'rf' step of the pipeline below.
rf_params = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [5, 10, 15, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__class_weight': ['balanced', None]
}

# Preprocessing lives inside the pipeline so it is re-fit on each CV training fold.
rf_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
)

# Exhaustive search scored by ROC-AUC, run on all available cores.
rf_grid = GridSearchCV(
    rf_model,
    rf_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
rf_grid.fit(X_train, y_train)

print("Best RF Params:", rf_grid.best_params_)
print("Best RF Score:", rf_grid.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100; total time=   2.0s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100; total time=   2.1s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100; total time=   2.2s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100; total time=   2.1s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100; total time=   2.1s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=200; total time=   3.8s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=200; total time=   3.8s
[CV] END rf__class_weight=balanced, rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=200; total

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Search space: XGBoost hyper-parameters plus the SMOTE neighbourhood size.
xgb_params = {
    'xgb__n_estimators': [100, 200],
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [5, 7],
    'smote__k_neighbors': [3, 5]  
}

# imblearn pipeline: preprocess -> oversample the minority class with SMOTE
# (sampling_strategy=0.8) -> XGBoost classifier.
smote_xgb_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.8)),
    ('xgb', XGBClassifier(random_state=42, eval_metric='logloss')),
]
smote_xgb_model = ImbPipeline(steps=smote_xgb_steps)

# Same splitter and metric as the random-forest search, so scores are comparable.
xgb_grid = GridSearchCV(
    smote_xgb_model,
    xgb_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
xgb_grid.fit(X_train, y_train)

print("Best XGB Params:", xgb_grid.best_params_)
print("Best XGB Score:", xgb_grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=100; total time=   1.7s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=100; total time=   1.7s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=100; total time=   1.7s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=100; total time=   1.8s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=100; total time=   1.8s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=200; total time=   2.6s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=200; total time=   2.6s
[CV] END smote__k_neighbors=3, xgb__learning_rate=0.05, xgb__max_depth=5, xgb__n_estimators=200; total time=   2.5s
[CV] END sm

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

# Evaluate the refit best pipeline from the XGBoost grid search on the held-out test set.
best_model = xgb_grid.best_estimator_

# Column 1 of predict_proba is taken as the positive-class (churn) score.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

roc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)

print(f"정확도: {acc:.4f}")
print(f"ROC-AUC: {roc:.4f}")
print("\n분류 리포트:")
print(classification_report(y_test, y_pred))

정확도: 0.9704
ROC-AUC: 0.9926

분류 리포트:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1701
           1       0.93      0.88      0.91       325

    accuracy                           0.97      2026
   macro avg       0.95      0.93      0.94      2026
weighted avg       0.97      0.97      0.97      2026



모델 성능 향상을 위해 Random Forest와 XGBoost를 대상으로 GridSearchCV와 Stratified K-Fold 기반 하이퍼파라미터 튜닝을 수행.
튜닝 기준 평가는 ROC-AUC를 사용했고, 그 결과 XGBoost가 가장 높은 성능을 보여 최종 모델로 선택.

# 4. ensemble

In [None]:
from sklearn.ensemble import VotingClassifier

# Base learners for the ensembles.
# NOTE(review): these hyper-parameters are hard-coded rather than read from
# rf_grid.best_params_ / xgb_grid.best_params_ — confirm they match the searches.
best_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42
)

best_xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    # Negative-to-positive ratio of the training labels, used to re-weight the
    # minority (churn) class. Vectorised .sum() instead of the builtin sum().
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric="logloss",
    random_state=42
)

# X_train still contains the raw object-dtype categorical columns, which
# neither RandomForestClassifier nor XGBClassifier can consume directly.
# The ensemble is therefore wrapped in a pipeline that scales/encodes first,
# the same way the grid-search pipelines do. predict / predict_proba keep
# working on raw frames such as X_test.
voting_model = Pipeline([
    ('preprocessor', preprocessor),
    ('voting', VotingClassifier(
        estimators=[
            ('rf', best_rf),
            ('xgb', best_xgb)
        ],
        voting='soft',    # combine predicted probabilities rather than hard labels
        weights=[1, 2]    # XGBoost counts twice as much as the random forest
    ))
])

voting_model.fit(X_train, y_train)

print("Voting Ensemble Training Done")

Voting Ensemble Training Done


In [None]:
# voting 모델 성능 확인
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

# Score the voting ensemble on the held-out test set.
# Column 1 of predict_proba is taken as the positive-class (churn) score.
y_proba = voting_model.predict_proba(X_test)[:, 1]
y_pred = voting_model.predict(X_test)

roc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)

print(f"정확도: {acc:.4f}")
print(f"ROC-AUC: {roc:.4f}")
print("\n분류 리포트:")
print(classification_report(y_test, y_pred))

정확도: 0.9625
ROC-AUC: 0.9918

분류 리포트:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1701
           1       0.86      0.92      0.89       325

    accuracy                           0.96      2026
   macro avg       0.92      0.94      0.93      2026
weighted avg       0.96      0.96      0.96      2026



In [None]:
# stacking classifier 
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# X_train still contains the raw object-dtype categorical columns, which the
# tree-based base learners cannot consume directly, so the stacking ensemble
# is wrapped in a pipeline that applies `preprocessor` first. Scaling and
# encoding are fit once on the full training split, before StackingClassifier
# runs its internal cv=5 to build the meta-features. Because preprocessing is
# part of the object, predict / predict_proba (and a saved copy of the model)
# accept raw feature frames such as X_test.
stacking_model = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', StackingClassifier(
        estimators=[
            ('rf', best_rf),
            ('xgb', best_xgb)
        ],
        # Meta-learner trained on the base learners' predicted probabilities.
        final_estimator=LogisticRegression(max_iter=1000),
        stack_method='predict_proba',
        cv=5,
        n_jobs=-1
    ))
])

stacking_model.fit(X_train, y_train)

print("Stacking Model Training Done")

Stacking Model Training Done


In [None]:
# Evaluate the stacking model on the held-out test set.
# Column 1 of predict_proba is taken as the positive-class (churn) score.
y_proba_stack = stacking_model.predict_proba(X_test)[:, 1]
y_pred_stack = stacking_model.predict(X_test)

roc_stack = roc_auc_score(y_test, y_proba_stack)
acc_stack = accuracy_score(y_test, y_pred_stack)

print(f"정확도: {acc_stack:.4f}")
print(f"ROC-AUC: {roc_stack:.4f}")
print("\n분류 리포트:")
print(classification_report(y_test, y_pred_stack))

정확도: 0.9719
ROC-AUC: 0.9921

분류 리포트:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1701
           1       0.92      0.91      0.91       325

    accuracy                           0.97      2026
   macro avg       0.95      0.95      0.95      2026
weighted avg       0.97      0.97      0.97      2026



어떤 게 최종 모델?
| 모델 | 장점 | 추천 |
|---|---|---|
| Soft Voting | 안정적, 과적합 적음 | 데이터 균형 있으면 좋은 선택 |
| Stacking (추천) | 성능 가장 잘 나오는 경우 많음 | 불균형 데이터 + 비선형 모델 → Best |

“하이퍼파라미터 튜닝된 RandomForest와 XGBoost 모델을 기반으로
Voting과 Stacking 앙상블을 적용했습니다.
Soft Voting은 두 모델의 확률 평균으로 예측하는 방식이고,
Stacking은 두 모델의 예측 결과를 Logistic Regression이 다시 학습하여 최종 판단하는 구조입니다.
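평가 결과 Stacking 모델이 가장 높은 정확도(0.9719)와 macro 평균 Recall(0.95)을 보여 최종 모델로 선정했습니다. 다만 ROC-AUC는 튜닝된 XGBoost(0.9926), Stacking(0.9921), Voting(0.9918) 순으로 차이가 매우 작고, 이탈 고객(클래스 1) Recall은 Voting(0.92)이 Stacking(0.91)보다 약간 높습니다.”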

In [None]:
import joblib

# Persist the final (stacking) model to disk in the current working directory.
joblib.dump(value=stacking_model, filename="final_churn_model.pkl")

print("💾 Model Saved: final_churn_model.pkl")