1. No smote, no convert

In [39]:
import pandas as pd
import numpy as np

In [40]:
data_nsnc = pd.read_csv('data/no_SMOTE_no_convert.csv')
data_nsyc = pd.read_csv('data/no_SMOTE_yes_convert.csv')
data_ysnc = pd.read_csv('data/yes_SMOTENC_no_convert.csv')
data_ysyc = pd.read_csv('data/yes_SMOTENC_yes_convert.csv')

In [41]:
from sklearn.metrics import classification_report

def evaluate_model(model, X_test, y_test, threshold=0.5):
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    print(f"Classification report (threshold={threshold}):")
    print(classification_report(y_test, y_pred))

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier, plot_importance

models = [

    ('XGBoost', XGBClassifier(), {
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10]
    })
]

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier


y = data_nsnc['is_churned']
X = data_nsnc.drop(columns=['is_churned'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

from xgboost import XGBClassifier
xgb_clf = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', tree_method='hist', n_jobs=-1, random_state=0
)
xgb_clf.fit(X_train, y_train)


from sklearn.metrics import precision_score, recall_score, f1_score, r2_score

# 예측
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)


print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

0.9390283400809717
0.8581606217616581
[[ 278  323]
 [ 115 2372]]
              precision    recall  f1-score   support

           0       0.71      0.46      0.56       601
           1       0.88      0.95      0.92      2487

    accuracy                           0.86      3088
   macro avg       0.79      0.71      0.74      3088
weighted avg       0.85      0.86      0.85      3088



1. gridsearchcv O

In [44]:
for name, model, params in models:
    print(f"Training {name} ...NSNC")
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    
    for thr in [0.3, 0.5, 0.7]:
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training XGBoost ...NSNC
Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.74      0.29      0.42       601
           1       0.85      0.98      0.91      2487

    accuracy                           0.84      3088
   macro avg       0.80      0.63      0.66      3088
weighted avg       0.83      0.84      0.81      3088

Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.69      0.45      0.54       601
           1       0.88      0.95      0.91      2487

    accuracy                           0.85      3088
   macro avg       0.78      0.70      0.73      3088
weighted avg       0.84      0.85      0.84      3088

Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.56      0.63      0.59       601
           1       0.91      0.88      0.89      2487

    accuracy                           0

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


-----------
2. no smote yes convert

In [45]:
y = data_nsyc['is_churned']
X = data_nsyc.drop(columns=['is_churned'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

from xgboost import XGBClassifier
xgb_clf = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', tree_method='hist', n_jobs=-1, random_state=0
)
xgb_clf.fit(X_train, y_train)


from sklearn.metrics import precision_score, recall_score, f1_score, r2_score

# 예측
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)


print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

0.9446963562753037
0.8520077720207254
[[ 269  332]
 [ 125 2362]]
              precision    recall  f1-score   support

           0       0.68      0.45      0.54       601
           1       0.88      0.95      0.91      2487

    accuracy                           0.85      3088
   macro avg       0.78      0.70      0.73      3088
weighted avg       0.84      0.85      0.84      3088



- GridSearchCV O

In [46]:
for name, model, params in models:
    print(f"Training {name} ...NSYC")
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    
    for thr in [0.3, 0.5, 0.7]:
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training XGBoost ...NSYC
Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.75      0.26      0.39       601
           1       0.85      0.98      0.91      2487

    accuracy                           0.84      3088
   macro avg       0.80      0.62      0.65      3088
weighted avg       0.83      0.84      0.81      3088

Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.66      0.44      0.53       601
           1       0.88      0.95      0.91      2487

    accuracy                           0.85      3088
   macro avg       0.77      0.69      0.72      3088
weighted avg       0.83      0.85      0.84      3088

Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.54      0.62      0.58       601
           1       0.90      0.87      0.89      2487

    accuracy                           0

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


-----
3. yes_SMOTENC_no_convert

In [47]:
y = data_ysnc['is_churned']
X = data_ysnc.drop(columns=['is_churned'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

from xgboost import XGBClassifier
xgb_clf = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', tree_method='hist', n_jobs=-1, random_state=0
)
xgb_clf.fit(X_train, y_train)


from sklearn.metrics import precision_score, recall_score, f1_score, r2_score

# 예측
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)


print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

0.9269629033879562
0.86389223964616
[[2206  281]
 [ 396 2091]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2487
           1       0.88      0.84      0.86      2487

    accuracy                           0.86      4974
   macro avg       0.86      0.86      0.86      4974
weighted avg       0.86      0.86      0.86      4974



- GridSearchCV O

In [48]:
for name, model, params in models:
    print(f"Training {name} ...YSNC")
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    
    for thr in [0.3, 0.5, 0.7]:
        evaluate_model(best_model, X_test, y_test, threshold=thr)

Training XGBoost ...YSNC


Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      2487
           1       0.86      0.90      0.88      2487

    accuracy                           0.88      4974
   macro avg       0.88      0.88      0.88      4974
weighted avg       0.88      0.88      0.88      4974

Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      2487
           1       0.90      0.87      0.88      2487

    accuracy                           0.88      4974
   macro avg       0.88      0.88      0.88      4974
weighted avg       0.88      0.88      0.88      4974

Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      2487
           1       0.92      0.82      0.87      2487

    accuracy                           0.88      4974
   macro av

------
4. yes smotenc yes convert

In [49]:
y = data_ysyc['is_churned']
X = data_ysyc.drop(columns=['is_churned'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

from xgboost import XGBClassifier
xgb_clf = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', tree_method='hist', n_jobs=-1, random_state=0
)
xgb_clf.fit(X_train, y_train)


from sklearn.metrics import precision_score, recall_score, f1_score, r2_score

# 예측
y_pred_train = xgb_clf.predict(X_train)
y_pred_test = xgb_clf.predict(X_test)


print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

0.9312858148185382
0.864897466827503
[[2179  308]
 [ 364 2123]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2487
           1       0.87      0.85      0.86      2487

    accuracy                           0.86      4974
   macro avg       0.87      0.86      0.86      4974
weighted avg       0.87      0.86      0.86      4974



In [50]:
for name, model, params in models:
    print(f"Training {name} ... YSYC")
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    
    for thr in [0.3, 0.5, 0.7]:
        evaluate_model(best_model, X_test, y_test, threshold=thr)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

Training XGBoost ... YSYC


Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification report (threshold=0.3):
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      2487
           1       0.85      0.90      0.88      2487

    accuracy                           0.87      4974
   macro avg       0.87      0.87      0.87      4974
weighted avg       0.87      0.87      0.87      4974

Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      2487
           1       0.89      0.86      0.87      2487

    accuracy                           0.88      4974
   macro avg       0.88      0.88      0.88      4974
weighted avg       0.88      0.88      0.88      4974

Classification report (threshold=0.7):
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      2487
           1       0.91      0.81      0.86      2487

    accuracy                           0.87      4974
   macro av