In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('./data/Preprocessed_Bank_Customer_Churn_Prediction.csv')

# 사용할 피처 선택 (customer_id 제거)
features = ['credit_score', 'age', 'tenure', 'balance', 'products_number', 
            'credit_card', 'active_member', 'estimated_salary']

X = df[features]
y = df['churn']

# 학습 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

In [4]:
from sklearn.metrics import classification_report

# 테스트 데이터로 예측
y_pred = clf.predict(X_test)

# 정확도 출력
print('리포트 :\n', classification_report(y_test, y_pred, digits=4))

리포트 :
               precision    recall  f1-score   support

           0     0.8648    0.8349    0.8496      1593
           1     0.4307    0.4889    0.4580       407

    accuracy                         0.7645      2000
   macro avg     0.6477    0.6619    0.6538      2000
weighted avg     0.7764    0.7645    0.7699      2000



In [5]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': range(1, 11),
    # 'criterion': ['gini', 'entropy'],
    # 'splitter': ['best', 'random'],
    'min_samples_split': range(2, 11),
    'min_samples_leaf': range(1, 11),
}

grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

print('최적의 하이퍼파라미터 :', grid_search.best_params_)

최적의 하이퍼파라미터 : {'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 2}


In [6]:
from sklearn.metrics import classification_report
# 최적의 하이퍼파라미터로 학습
clf = grid_search.best_estimator_

# 테스트 데이터로 예측
y_pred = clf.predict(X_test)

# 정확도 출력
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8681    0.9705    0.9164      1593
           1     0.7854    0.4226    0.5495       407

    accuracy                         0.8590      2000
   macro avg     0.8267    0.6966    0.7330      2000
weighted avg     0.8512    0.8590    0.8418      2000



In [7]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import classification_report

# 테스트 데이터로 예측
y_pred = model.predict(X_test)

# 정확도 출력
print('리포트 :\n', classification_report(y_test, y_pred, digits=4))

리포트 :
               precision    recall  f1-score   support

           0     0.8682    0.9636    0.9134      1593
           1     0.7500    0.4275    0.5446       407

    accuracy                         0.8545      2000
   macro avg     0.8091    0.6956    0.7290      2000
weighted avg     0.8442    0.8545    0.8384      2000



In [9]:
from sklearn.model_selection import GridSearchCV

# 그리드 탐색을 위한 파라미터 설정

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
}

# 랜덤 포레스트 모델 생성

rf = RandomForestClassifier(random_state=42)

# 그리드 탐색 수행

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

# 최적의 파라미터 출력

print('최적의 파라미터 :', grid_search.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf

In [10]:
model = grid_search.best_estimator_

y_pred = model.predict(X_test)

# 피처 중요도 출력
feature_importances = model.feature_importances_

print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8679    0.9692    0.9158      1593
           1     0.7783    0.4226    0.5478       407

    accuracy                         0.8580      2000
   macro avg     0.8231    0.6959    0.7318      2000
weighted avg     0.8497    0.8580    0.8409      2000



In [13]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [15]:
# imbalanced-learn 패키지
from imblearn.over_sampling import SMOTE

# 검증 데이터나 테스트 데이터가 아닌 학습데이터에서만 오버샘플링 사용할 것
smote = SMOTE(random_state=42)

# 오버샘플링 적용
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [None]:
dt = DecisionTreeClassifier(random_state=42)

# 학습
dt.fit(X_train_over, y_train_over)

# 예측
y_pred = dt.predict(X_test)

# 정확도 출력
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8748    0.7458    0.8052      1593
           1     0.3692    0.5823    0.4519       407

    accuracy                         0.7125      2000
   macro avg     0.6220    0.6640    0.6285      2000
weighted avg     0.7719    0.7125    0.7333      2000



In [19]:
rf = RandomForestClassifier(random_state=42, max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=200)

# 학습
rf.fit(X_train_over, y_train_over)

# 예측
y_pred = rf.predict(X_test)

# 정확도 출력
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9013    0.8255    0.8617      1593
           1     0.4861    0.6462    0.5549       407

    accuracy                         0.7890      2000
   macro avg     0.6937    0.7358    0.7083      2000
weighted avg     0.8168    0.7890    0.7993      2000



In [23]:
# 클래스 가중치 설정

# 클래스 불균형이 심할 경우 클래스 가중치를 설정하여 학습

# 클래스 불균형이 심한 경우 클래스 가중치 설정
class_weight = {
    0: 1,
    1: 4
}

# 랜덤 포레스트 모델 생성

rf = RandomForestClassifier(random_state=42, max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=200, class_weight=class_weight)

rf.fit(X_train, y_train)

# 예측
y_pred = rf.predict(X_test)

# 정확도 출력
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8836    0.9146    0.8988      1593
           1     0.6125    0.5283    0.5673       407

    accuracy                         0.8360      2000
   macro avg     0.7481    0.7214    0.7331      2000
weighted avg     0.8284    0.8360    0.8314      2000



In [None]:
# threshold 조정

# 임계값을 조정하여 클래스의 확률을 기반으로 예측
# 클래스 1의 임계값을 0.3으로 설정
y_pred = (rf.predict_proba(X_test)[:, 1] >= 0.3).astype(int)

# 정확도 출력
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9210    0.7464    0.8245      1593
           1     0.4302    0.7494    0.5466       407

    accuracy                         0.7470      2000
   macro avg     0.6756    0.7479    0.6856      2000
weighted avg     0.8211    0.7470    0.7680      2000

