# Random Forest 모델 테스트

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

## 1-1. SMOTENC 적용 X, 소득 & 나이 수치형 변환 X

In [10]:
# Scenario 1-1: load the dataset, hold out a test split, and run a randomized
# hyperparameter search for a random forest on the training split.
no_SMOTENC_no_convert_df = pd.read_csv('no_SMOTE_no_convert.csv')

# Separate the 'is_churned' target from the remaining feature columns.
y1 = no_SMOTENC_no_convert_df['is_churned']
X1 = no_SMOTENC_no_convert_df.drop(columns='is_churned')

# Stratified hold-out split; test_size=0.25 spells out the scikit-learn default.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.25,
    stratify=y1,
    random_state=42,
)

# Base estimator whose hyperparameters are tuned below.
rf_clf1 = RandomForestClassifier(random_state=42)

# Candidate values for every tuned hyperparameter.
params1 = dict(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_depth=[None, 5, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],  # if a warning appears, use 1.0 instead of None
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Randomized search: 100 sampled candidates, each scored by 5-fold CV on F1.
random_search1 = RandomizedSearchCV(
    estimator=rf_clf1,
    param_distributions=params1,
    n_iter=100,               # e.g. sample 100 candidates
    scoring='f1',             # 'roc_auc' is worth trying as well if needed
    cv=5,
    return_train_score=True,  # also record training-fold scores in cv_results_
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

random_search1.fit(X1_train, y1_train)

print("Best params:", random_search1.best_params_)
print("Best CV F1:", random_search1.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'n_estimators': 300, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 10, 'class_weight': None, 'bootstrap': True}
Best CV F1: 0.9097745808153261


In [11]:
# Evaluate the best estimator found by the search on the held-out test split.
best_rf = random_search1.best_estimator_
y_pred = best_rf.predict(X1_test)
# Keep only the second predict_proba column (the score for label 1) for ROC-AUC.
y_proba = best_rf.predict_proba(X1_test)[:, 1]

# sep="" stops print() from inserting a space in front of the report, which
# would push its header row one column out of line with the rows beneath it.
print("\nTest classification report:\n", classification_report(y1_test, y_pred), sep="")
print("Test ROC-AUC:", roc_auc_score(y1_test, y_proba))


Test classification report:
               precision    recall  f1-score   support

           0       0.70      0.42      0.52       751
           1       0.87      0.96      0.91      3109

    accuracy                           0.85      3860
   macro avg       0.79      0.69      0.72      3860
weighted avg       0.84      0.85      0.84      3860

Test ROC-AUC: 0.8567489514356114


---

## 1-2. SMOTENC 적용 X, 소득 & 나이 수치형 변환 O

In [14]:
# Scenario 1-2: load the dataset, hold out a test split, and run a randomized
# hyperparameter search for a random forest on the training split.
no_SMOTENC_yes_convert_df = pd.read_csv('no_SMOTE_yes_convert.csv')

# Separate the 'is_churned' target from the remaining feature columns.
y2 = no_SMOTENC_yes_convert_df['is_churned']
X2 = no_SMOTENC_yes_convert_df.drop(columns='is_churned')

# Stratified hold-out split; test_size=0.25 spells out the scikit-learn default.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.25,
    stratify=y2,
    random_state=42,
)

# Base estimator whose hyperparameters are tuned below.
rf_clf2 = RandomForestClassifier(random_state=42)

# Candidate values for every tuned hyperparameter.
params2 = dict(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_depth=[None, 5, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],  # if a warning appears, use 1.0 instead of None
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Randomized search: 100 sampled candidates, each scored by 5-fold CV on F1.
random_search2 = RandomizedSearchCV(
    estimator=rf_clf2,
    param_distributions=params2,
    n_iter=100,               # e.g. sample 100 candidates
    scoring='f1',             # 'roc_auc' is worth trying as well if needed
    cv=5,
    return_train_score=True,  # also record training-fold scores in cv_results_
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

random_search2.fit(X2_train, y2_train)

print("Best params:", random_search2.best_params_)
print("Best CV F1:", random_search2.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'n_estimators': 300, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 10, 'class_weight': None, 'bootstrap': True}
Best CV F1: 0.9099169681029972


In [15]:
# Evaluate the best estimator found by the search on the held-out test split.
best_rf2 = random_search2.best_estimator_
y_pred2 = best_rf2.predict(X2_test)
# Keep only the second predict_proba column (the score for label 1) for ROC-AUC.
y_proba2 = best_rf2.predict_proba(X2_test)[:, 1]

# sep="" stops print() from inserting a space in front of the report, which
# would push its header row one column out of line with the rows beneath it.
print("\nTest classification report:\n", classification_report(y2_test, y_pred2), sep="")
print("Test ROC-AUC:", roc_auc_score(y2_test, y_proba2))


Test classification report:
               precision    recall  f1-score   support

           0       0.71      0.42      0.53       751
           1       0.87      0.96      0.91      3109

    accuracy                           0.85      3860
   macro avg       0.79      0.69      0.72      3860
weighted avg       0.84      0.85      0.84      3860

Test ROC-AUC: 0.8544725827127033


---
---

## 1-3. SMOTENC 적용 O, 소득 & 나이 수치형 변환 X

In [16]:
# Scenario 1-3: load the dataset, hold out a test split, and run a randomized
# hyperparameter search for a random forest on the training split.
yes_SMOTENC_no_convert_df = pd.read_csv('yes_SMOTENC_no_convert.csv')

# Separate the 'is_churned' target from the remaining feature columns.
y3 = yes_SMOTENC_no_convert_df['is_churned']
X3 = yes_SMOTENC_no_convert_df.drop(columns='is_churned')

# Stratified hold-out split; test_size=0.25 spells out the scikit-learn default.
# NOTE(review): the recorded test report for this scenario shows near-equal
# class supports (3109 vs 3108), so the CSV appears to have been oversampled
# before this split. If so, synthetic rows land in both the CV folds and the
# test split, and the scores are not comparable with the non-SMOTENC
# scenarios -- confirm how the CSV was produced.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    test_size=0.25,
    stratify=y3,
    random_state=42,
)

# Base estimator whose hyperparameters are tuned below.
rf_clf3 = RandomForestClassifier(random_state=42)

# Candidate values for every tuned hyperparameter.
params3 = dict(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_depth=[None, 5, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],  # if a warning appears, use 1.0 instead of None
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Randomized search: 100 sampled candidates, each scored by 5-fold CV on F1.
random_search3 = RandomizedSearchCV(
    estimator=rf_clf3,
    param_distributions=params3,
    n_iter=100,               # e.g. sample 100 candidates
    scoring='f1',             # 'roc_auc' is worth trying as well if needed
    cv=5,
    return_train_score=True,  # also record training-fold scores in cv_results_
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

random_search3.fit(X3_train, y3_train)

print("Best params:", random_search3.best_params_)
print("Best CV F1:", random_search3.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'class_weight': None, 'bootstrap': False}
Best CV F1: 0.8748531154139547


In [17]:
# Evaluate the best estimator found by the search on the held-out test split.
# NOTE(review): this test split looks balanced (supports 3109 vs 3108 in the
# recorded output), which suggests it was drawn from already-oversampled data;
# if so these metrics are optimistic -- confirm before comparing scenarios.
best_rf3 = random_search3.best_estimator_
y_pred3 = best_rf3.predict(X3_test)
# Keep only the second predict_proba column (the score for label 1) for ROC-AUC.
y_proba3 = best_rf3.predict_proba(X3_test)[:, 1]

# sep="" stops print() from inserting a space in front of the report, which
# would push its header row one column out of line with the rows beneath it.
print("\nTest classification report:\n", classification_report(y3_test, y_pred3), sep="")
print("Test ROC-AUC:", roc_auc_score(y3_test, y_proba3))


Test classification report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.89      3109
           1       0.90      0.88      0.89      3108

    accuracy                           0.89      6217
   macro avg       0.89      0.89      0.89      6217
weighted avg       0.89      0.89      0.89      6217

Test ROC-AUC: 0.9598237959045292


---

## 1-4. SMOTENC 적용 O, 소득 & 나이 수치형 변환 O

In [18]:
# Scenario 1-4: load the dataset, hold out a test split, and run a randomized
# hyperparameter search for a random forest on the training split.
yes_SMOTENC_yes_convert_df = pd.read_csv('yes_SMOTENC_yes_convert.csv')

# Separate the 'is_churned' target from the remaining feature columns.
y4 = yes_SMOTENC_yes_convert_df['is_churned']
X4 = yes_SMOTENC_yes_convert_df.drop(columns='is_churned')

# Stratified hold-out split; test_size=0.25 spells out the scikit-learn default.
# NOTE(review): the recorded test report for this scenario shows near-equal
# class supports (3109 vs 3108), so the CSV appears to have been oversampled
# before this split. If so, synthetic rows land in both the CV folds and the
# test split, and the scores are not comparable with the non-SMOTENC
# scenarios -- confirm how the CSV was produced.
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4,
    y4,
    test_size=0.25,
    stratify=y4,
    random_state=42,
)

# Base estimator whose hyperparameters are tuned below.
rf_clf4 = RandomForestClassifier(random_state=42)

# Candidate values for every tuned hyperparameter.
params4 = dict(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_depth=[None, 5, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],  # if a warning appears, use 1.0 instead of None
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Randomized search: 100 sampled candidates, each scored by 5-fold CV on F1.
random_search4 = RandomizedSearchCV(
    estimator=rf_clf4,
    param_distributions=params4,
    n_iter=100,               # e.g. sample 100 candidates
    scoring='f1',             # 'roc_auc' is worth trying as well if needed
    cv=5,
    return_train_score=True,  # also record training-fold scores in cv_results_
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

random_search4.fit(X4_train, y4_train)

print("Best params:", random_search4.best_params_)
print("Best CV F1:", random_search4.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'class_weight': None, 'bootstrap': False}
Best CV F1: 0.8595365346443499


In [19]:
# Evaluate the best estimator found by the search on the held-out test split.
# NOTE(review): this test split looks balanced (supports 3109 vs 3108 in the
# recorded output), which suggests it was drawn from already-oversampled data;
# if so these metrics are optimistic -- confirm before comparing scenarios.
best_rf4 = random_search4.best_estimator_
y_pred4 = best_rf4.predict(X4_test)
# Keep only the second predict_proba column (the score for label 1) for ROC-AUC.
y_proba4 = best_rf4.predict_proba(X4_test)[:, 1]

# sep="" stops print() from inserting a space in front of the report, which
# would push its header row one column out of line with the rows beneath it.
print("\nTest classification report:\n", classification_report(y4_test, y_pred4), sep="")
print("Test ROC-AUC:", roc_auc_score(y4_test, y_proba4))


Test classification report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87      3109
           1       0.87      0.87      0.87      3108

    accuracy                           0.87      6217
   macro avg       0.87      0.87      0.87      6217
weighted avg       0.87      0.87      0.87      6217

Test ROC-AUC: 0.94346368723178
