# Random Forest 모델 테스트

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

---

## SMOTENC 적용 X

In [None]:
# Load the dataset that was prepared WITHOUT SMOTENC oversampling.
no_SMOTENC_yes_convert_df = pd.read_csv('./data_ml/no_SMOTENC.csv')

# Split the frame into the target column and the remaining feature columns.
y2 = no_SMOTENC_yes_convert_df['is_churned']
X2 = no_SMOTENC_yes_convert_df.drop(columns='is_churned')

# Stratified hold-out split: class proportions of `is_churned` are preserved
# in both parts; test_size is left at the library default.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    stratify=y2,
    random_state=42,
)

# Base estimator; the seed keeps the forest reproducible across runs.
rf_clf2 = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values. The full grid has
# 6 * 6 * 4 * 4 * 3 * 2 * 3 = 10,368 combinations, so a random subset is
# sampled below instead of searching it exhaustively.
params2 = dict(
    n_estimators=range(100, 601, 100),
    max_depth=[None, 5, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],   # if a warning appears, replace None with 1.0
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Randomized search with 5-fold cross-validation on the training split only.
random_search2 = RandomizedSearchCV(
    rf_clf2,
    params2,
    n_iter=100,              # number of sampled candidates
    cv=5,
    scoring='f1',            # 'roc_auc' is worth trying as an alternative
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search2.fit(X2_train, y2_train)

# Report the winning configuration and its mean cross-validated F1.
print("Best params:", random_search2.best_params_)
print("Best CV F1:", random_search2.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'n_estimators': 300, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 10, 'class_weight': None, 'bootstrap': True}
Best CV F1: 0.9099169681029972


In [15]:
# Evaluate on the held-out test split.
# `best_estimator_` is the model selected by the randomized search above.
best_rf2 = random_search2.best_estimator_

# Second column of predict_proba -> score for class 1 (labels are 0/1 in the
# classification report); hard predictions come from predict().
y_proba2 = best_rf2.predict_proba(X2_test)[:, 1]
y_pred2 = best_rf2.predict(X2_test)

# Compute both summaries first, then print them.
test_report2 = classification_report(y2_test, y_pred2)
test_auc2 = roc_auc_score(y2_test, y_proba2)

print("\nTest classification report:\n", test_report2)
print("Test ROC-AUC:", test_auc2)


Test classification report:
               precision    recall  f1-score   support

           0       0.71      0.42      0.53       751
           1       0.87      0.96      0.91      3109

    accuracy                           0.85      3860
   macro avg       0.79      0.69      0.72      3860
weighted avg       0.84      0.85      0.84      3860

Test ROC-AUC: 0.8544725827127033


---
---

## SMOTENC 적용 O

In [None]:
# Load the dataset that was prepared WITH SMOTENC oversampling.
# NOTE(review): the file name suggests the oversampling was applied before the
# train/test split below. If so, the test split contains synthetic rows and
# its metrics are not comparable with the non-SMOTENC run -- TODO confirm and,
# if needed, resample the training split only.
yes_SMOTENC_yes_convert_df = pd.read_csv('./data_ml/yes_SMOTENC.csv')

# Split the frame into the target column and the remaining feature columns.
y4 = yes_SMOTENC_yes_convert_df['is_churned']
X4 = yes_SMOTENC_yes_convert_df.drop(columns='is_churned')

# Stratified hold-out split: class proportions of `is_churned` are preserved
# in both parts; test_size is left at the library default.
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4,
    y4,
    stratify=y4,
    random_state=42,
)

# Base estimator; the seed keeps the forest reproducible across runs.
rf_clf4 = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values. The full grid has
# 6 * 6 * 4 * 4 * 3 * 2 * 3 = 10,368 combinations, so a random subset is
# sampled below instead of searching it exhaustively.
params4 = dict(
    n_estimators=range(100, 601, 100),
    max_depth=[None, 5, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],   # if a warning appears, replace None with 1.0
    bootstrap=[True, False],
    class_weight=[None, 'balanced', 'balanced_subsample'],
)

# Randomized search with 5-fold cross-validation on the training split only.
random_search4 = RandomizedSearchCV(
    rf_clf4,
    params4,
    n_iter=100,              # number of sampled candidates
    cv=5,
    scoring='f1',            # 'roc_auc' is worth trying as an alternative
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search4.fit(X4_train, y4_train)

# Report the winning configuration and its mean cross-validated F1.
print("Best params:", random_search4.best_params_)
print("Best CV F1:", random_search4.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'class_weight': None, 'bootstrap': False}
Best CV F1: 0.8595365346443499


In [19]:
# Evaluate on the held-out test split.
# `best_estimator_` is the model selected by the randomized search above.
best_rf4 = random_search4.best_estimator_

# Second column of predict_proba -> score for class 1 (labels are 0/1 in the
# classification report); hard predictions come from predict().
y_proba4 = best_rf4.predict_proba(X4_test)[:, 1]
y_pred4 = best_rf4.predict(X4_test)

# Compute both summaries first, then print them.
test_report4 = classification_report(y4_test, y_pred4)
test_auc4 = roc_auc_score(y4_test, y_proba4)

print("\nTest classification report:\n", test_report4)
print("Test ROC-AUC:", test_auc4)


Test classification report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87      3109
           1       0.87      0.87      0.87      3108

    accuracy                           0.87      6217
   macro avg       0.87      0.87      0.87      6217
weighted avg       0.87      0.87      0.87      6217

Test ROC-AUC: 0.94346368723178
