In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [49]:
# Load the four pre-built dataset variants in one pass. The names follow the
# file names: n/y + "s" = without/with SMOTE(NC), n/y + "c" = without/with convert.
data_nsnc, data_nsyc, data_ysnc, data_ysyc = map(
    pd.read_csv,
    (
        'data/no_SMOTE_no_convert.csv',
        'data/no_SMOTE_yes_convert.csv',
        'data/yes_SMOTENC_no_convert.csv',
        'data/yes_SMOTENC_yes_convert.csv',
    ),
)

In [50]:
from sklearn.metrics import classification_report

def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Print a classification report for ``model`` at a custom decision threshold.

    Predictions are derived from the second column of ``model.predict_proba``:
    a row is labelled 1 when that probability is ``>= threshold`` and 0
    otherwise. Nothing is returned; the report is written to stdout.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature rows to score.
        y_test: True labels for ``X_test``. Assumed to be encoded as 0/1 so
            that they line up with the integer predictions built here
            (TODO confirm for datasets other than the ones in this notebook).
        threshold: Cut-off applied to the column-1 probability. Defaults to 0.5.
    """
    # Column index 1 of predict_proba is thresholded; the comparison yields a
    # boolean array that is cast to 0/1 integer labels.
    positive_scores = model.predict_proba(X_test)[:, 1]
    predicted_labels = (positive_scores >= threshold).astype(int)

    # Header first, then the report, so the header is shown even if the
    # report computation fails.
    print(f"Classification report (threshold={threshold}):")
    report_text = classification_report(y_test, predicted_labels)
    print(report_text)

In [51]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier, plot_importance

# Candidate models as (display name, estimator, hyper-parameter grid) triples;
# the training loops unpack each entry in exactly this order.
models = [
    (
        'Decision Tree',
        # Fixed seed: the tree builder permutes features and breaks ties between
        # equally good splits at random, so an unseeded tree can give different
        # grid-search winners and reports from run to run. The seed matches the
        # random_state used for train_test_split.
        DecisionTreeClassifier(random_state=0),
        {
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 5, 10],
        },
    ),
]

### 1. NSNC (no SMOTE, no conversion)

In [52]:
# Separate the target from the features of the NSNC data, then hold out a
# stratified 20% test split with a fixed seed.
y = data_nsnc['is_churned']
X = data_nsnc.drop(columns='is_churned')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [53]:
# Tune every candidate in `models` on the current training split, then report
# test-set metrics for the best estimator at several decision thresholds.
for name, model, params in models:
    print(f"Training {name} ...")
    # 5-fold cross-validated grid search; hyper-parameters are ranked by the
    # 'f1' scorer.
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    for thr in [0.3, 0.5, 0.7]:
        # Label the report with the model actually being evaluated instead of a
        # hard-coded string, so the output stays correct if `models` grows.
        print(name)
        print('No Smote No Convert')
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training Decision Tree ...
Decision Tree
No Smote No Convert
Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.75      0.26      0.39       601
           1       0.85      0.98      0.91      2487

    accuracy                           0.84      3088
   macro avg       0.80      0.62      0.65      3088
weighted avg       0.83      0.84      0.81      3088

Decision Tree
No Smote No Convert
Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.68      0.41      0.51       601
           1       0.87      0.95      0.91      2487

    accuracy                           0.85      3088
   macro avg       0.77      0.68      0.71      3088
weighted avg       0.83      0.85      0.83      3088

Decision Tree
No Smote No Convert
Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.56      0.54      0.55  

### 2. NSYC (no SMOTE, with conversion)

In [54]:
# Separate the target from the features of the NSYC data, then hold out a
# stratified 20% test split with a fixed seed.
y = data_nsyc['is_churned']
X = data_nsyc.drop(columns='is_churned')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [55]:
# Tune every candidate in `models` on the current training split, then report
# test-set metrics for the best estimator at several decision thresholds.
for name, model, params in models:
    print(f"Training {name} ...")
    # 5-fold cross-validated grid search; hyper-parameters are ranked by the
    # 'f1' scorer.
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    for thr in [0.3, 0.5, 0.7]:
        # Label the report with the model actually being evaluated instead of a
        # hard-coded string, so the output stays correct if `models` grows.
        print(name)
        print('No Smote Yes Convert')
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training Decision Tree ...
Decision Tree
No Smote Yes Convert
Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.71      0.26      0.38       601
           1       0.84      0.97      0.91      2487

    accuracy                           0.84      3088
   macro avg       0.78      0.62      0.64      3088
weighted avg       0.82      0.84      0.80      3088

Decision Tree
No Smote Yes Convert
Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.66      0.39      0.49       601
           1       0.87      0.95      0.91      2487

    accuracy                           0.84      3088
   macro avg       0.76      0.67      0.70      3088
weighted avg       0.82      0.84      0.82      3088

Decision Tree
No Smote Yes Convert
Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.54      0.57      0.5

### 3. YSNC (SMOTENC applied, no conversion)

In [56]:
# Separate the target from the features of the YSNC data, then hold out a
# stratified 20% test split with a fixed seed.
# NOTE(review): the recorded reports for this section show a 2487/2487 test
# support, versus 601/2487 in the non-SMOTE sections, so this held-out split
# appears to contain oversampled rows. If resampling happened before this
# split, the scores are likely optimistic and not comparable with the
# non-SMOTE sections -- TODO confirm how the CSV was produced.
y = data_ysnc['is_churned']
X = data_ysnc.drop(columns='is_churned')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [57]:
# Tune every candidate in `models` on the current training split, then report
# test-set metrics for the best estimator at several decision thresholds.
for name, model, params in models:
    print(f"Training {name} ...")
    # 5-fold cross-validated grid search; hyper-parameters are ranked by the
    # 'f1' scorer.
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    for thr in [0.3, 0.5, 0.7]:
        # Label the report with the model actually being evaluated instead of a
        # hard-coded string, so the output stays correct if `models` grows.
        print(name)
        print('Yes Smote No Convert')
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training Decision Tree ...
Decision Tree
Yes Smote No Convert
Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      2487
           1       0.83      0.81      0.82      2487

    accuracy                           0.82      4974
   macro avg       0.82      0.82      0.82      4974
weighted avg       0.82      0.82      0.82      4974

Decision Tree
Yes Smote No Convert
Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      2487
           1       0.83      0.81      0.82      2487

    accuracy                           0.82      4974
   macro avg       0.82      0.82      0.82      4974
weighted avg       0.82      0.82      0.82      4974

Decision Tree
Yes Smote No Convert
Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.81      0.84      0.8

### 4. YSYC (SMOTENC applied, with conversion)

In [58]:
# Separate the target from the features of the YSYC data, then hold out a
# stratified 20% test split with a fixed seed.
# NOTE(review): the recorded reports for this section show a 2487/2487 test
# support, versus 601/2487 in the non-SMOTE sections, so this held-out split
# appears to contain oversampled rows. If resampling happened before this
# split, the scores are likely optimistic and not comparable with the
# non-SMOTE sections -- TODO confirm how the CSV was produced.
y = data_ysyc['is_churned']
X = data_ysyc.drop(columns='is_churned')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [59]:
# Tune every candidate in `models` on the current training split, then report
# test-set metrics for the best estimator at several decision thresholds.
for name, model, params in models:
    print(f"Training {name} ...")
    # 5-fold cross-validated grid search; hyper-parameters are ranked by the
    # 'f1' scorer.
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    for thr in [0.3, 0.5, 0.7]:
        # Label the report with the model actually being evaluated instead of a
        # hard-coded string, so the output stays correct if `models` grows.
        print(name)
        print('Yes Smote Yes Convert')
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training Decision Tree ...
Decision Tree
Yes Smote Yes Convert
Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.86      0.69      0.76      2487
           1       0.74      0.89      0.81      2487

    accuracy                           0.79      4974
   macro avg       0.80      0.79      0.79      4974
weighted avg       0.80      0.79      0.79      4974

Decision Tree
Yes Smote Yes Convert
Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.81      0.82      0.81      2487
           1       0.82      0.80      0.81      2487

    accuracy                           0.81      4974
   macro avg       0.81      0.81      0.81      4974
weighted avg       0.81      0.81      0.81      4974

Decision Tree
Yes Smote Yes Convert
Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.75      0.90      

In [None]:
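# # Feature importance analysis (permutation_importance)
# # NOTE(review): `best_model_result` is not defined in the cells shown here; confirm its source before re-enabling this cell.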
# print("\nPermutation Importance on Test Set:")
# perm_importance = permutation_importance(best_model_result['Best Estimator'], X_test, y_test, n_repeats=30, random_state=42, n_jobs=-1)

# perm_df = pd.DataFrame({'feature': X_test.columns, 'importance_mean': perm_importance.importances_mean})
# perm_df = perm_df.sort_values(by='importance_mean', ascending=False)

# print(perm_df.head(15))