In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import joblib


In [2]:
# Load the gym churn dataset
df = pd.read_csv('Data/gym_churn_us.csv')

# Separate the target from the features; the target and the columns
# excluded from modelling are dropped from the feature frame
y = df['Churn']
X = df.drop(columns=['Churn', 'Phone', 'Month_to_end_contract', 'Avg_class_frequency_current_month'])
print(X.info())

# Stratified train / validation split (25% held out for validation)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

# Class ratios (churn N / Y) for the full, train and validation targets;
# np.unique returns the counts in sorted label order
for target in (y, y_train, y_valid):
    print(np.unique(target, return_counts=True)[1]/target.size)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gender                        4000 non-null   int64  
 1   Near_Location                 4000 non-null   int64  
 2   Partner                       4000 non-null   int64  
 3   Promo_friends                 4000 non-null   int64  
 4   Contract_period               4000 non-null   int64  
 5   Group_visits                  4000 non-null   int64  
 6   Age                           4000 non-null   int64  
 7   Avg_additional_charges_total  4000 non-null   float64
 8   Lifetime                      4000 non-null   int64  
 9   Avg_class_frequency_total     4000 non-null   float64
dtypes: float64(2), int64(8)
memory usage: 312.6 KB
None
[0.73475 0.26525]
[0.73466667 0.26533333]
[0.735 0.265]


In [3]:
# Baseline models trained with library-default hyperparameters
# (Gradient Boosting, Random Forest, KNN, XGBoost)
models = [
    ("GB", GradientBoostingClassifier(random_state=0)),
    ("RF", RandomForestClassifier(random_state=0)),
    ("KNN", KNeighborsClassifier()),
    ("XGB", XGBClassifier(random_state=0))
]

# Row labels of the summary table, in the same order the scores are stored
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc-auc']
baseline_result_valid = {}

for name, model in models:
    model.fit(X_train, y_train)

    # Hard labels feed the threshold metrics; the positive-class probability feeds ROC-AUC
    pred_valid = model.predict(X_valid)
    pred_proba_valid = model.predict_proba(X_valid)[:, 1]

    baseline_result_valid[name] = [
        accuracy_score(y_valid, pred_valid),
        precision_score(y_valid, pred_valid),
        recall_score(y_valid, pred_valid),
        f1_score(y_valid, pred_valid),
        roc_auc_score(y_valid, pred_proba_valid),
    ]

# One column per model, one row per validation metric
pd.DataFrame(baseline_result_valid, index=metric_names)

Unnamed: 0,GB,RF,KNN,XGB
accuracy,0.899,0.888,0.857,0.888
precision,0.833333,0.804781,0.731061,0.802372
recall,0.773585,0.762264,0.728302,0.766038
f1,0.802348,0.782946,0.729679,0.783784
roc-auc,0.954614,0.942685,0.87636,0.944777


In [4]:
# Model tuning / training helper (Grid Search or Randomized Search, chosen by the
# caller depending on the size of the parameter space). Saving the returned model
# is left to the caller.
def cv(t, model, params):
    """Tune ``model`` with a cross-validated search and return the best estimator.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and the
    winning estimator is scored (ROC-AUC) on both the training split and the
    notebook-level ``X_valid`` / ``y_valid``. Best parameters, best estimator,
    best CV score and both ROC-AUC scores are printed.

    Parameters
    ----------
    t : str
        Search strategy: ``'grid'`` for ``GridSearchCV`` or ``'rand'`` for
        ``RandomizedSearchCV`` (60 sampled candidates, ``random_state=0``).
    model : estimator
        Unfitted estimator passed to the search; it must support
        ``predict_proba`` because ROC-AUC is computed from class probabilities.
    params : dict
        Parameter grid (``'grid'``) or parameter distributions (``'rand'``).

    Returns
    -------
    estimator
        The search's ``best_estimator_``.

    Raises
    ------
    ValueError
        If ``t`` is neither ``'grid'`` nor ``'rand'``.
    """
    # Both strategies use 4-fold CV, ROC-AUC scoring and all available cores.
    if t == 'grid':
        search = GridSearchCV(
            estimator=model,
            param_grid=params,
            scoring='roc_auc',
            cv=4,
            n_jobs=-1
        )
    elif t == 'rand':
        search = RandomizedSearchCV(
            model, params,
            cv=4,
            scoring='roc_auc',
            n_jobs=-1,
            n_iter=60,
            random_state=0
        )
    else:
        # Fail fast with a clear message instead of hitting an unbound local below.
        raise ValueError(f"Unknown search type {t!r}: expected 'grid' or 'rand'")

    search.fit(X_train, y_train)
    print('Best Parameters:', search.best_params_)
    print('Best Estimator:', search.best_estimator_)
    print('Best Score:', search.best_score_)

    best_model = search.best_estimator_

    # Column 1 of predict_proba is the positive-class probability used for ROC-AUC.
    pred_train_proba = best_model.predict_proba(X_train)
    pred_valid_proba = best_model.predict_proba(X_valid)

    train_score = roc_auc_score(y_train, pred_train_proba[:,1])
    valid_score = roc_auc_score(y_valid, pred_valid_proba[:,1])

    print('Train Score:', train_score)
    print('Valid Score:', valid_score)
    return best_model


In [5]:
# Random Forest: exhaustive grid search over the number of trees and tree depth
params_rf = dict(
    n_estimators=list(range(100, 701, 100)),
    max_depth=list(range(2, 8)),
)
model_rf = RandomForestClassifier(random_state=0)

# Tune, then persist the winning estimator to disk
best_rf = cv('grid', model_rf, params_rf)
joblib.dump(best_rf, 'models/best_rf.pkl')


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': 7, 'n_estimators': 300}
Best Estimator: RandomForestClassifier(max_depth=7, n_estimators=300, random_state=0)
Best Score: 0.9547168692828936
Train Score: 0.9802010278251512
Valid Score: 0.9508150429983315


['models/best_rf.pkl']

In [6]:
# Gradient Boosting: randomized search over the candidate values below
params_gb = dict(
    learning_rate=[0.001, 0.01, 0.1, 0.2],
    n_estimators=[1000, 2000, 3000, 4000, 5000],
    max_depth=range(1, 6),
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
)
model_gb = GradientBoostingClassifier(random_state=0)

# Tune, then persist the winning estimator to disk
best_gb = cv('rand', model_gb, params_gb)
joblib.dump(best_gb, 'models/best_gb.pkl')


Best Parameters: {'subsample': 1, 'n_estimators': 4000, 'max_depth': 1, 'learning_rate': 0.01}
Best Estimator: GradientBoostingClassifier(learning_rate=0.01, max_depth=1, n_estimators=4000,
                           random_state=0, subsample=1)
Best Score: 0.9591571742560352
Train Score: 0.9665298475134291
Valid Score: 0.9539211911179566


['models/best_gb.pkl']

In [7]:
# XGBoost: randomized search over the candidate values below
params_xgb = dict(
    learning_rate=[0.001, 0.01, 0.1, 0.2],
    n_estimators=[1000, 2000, 3000, 4000, 5000],
    max_depth=range(1, 6),
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
)
model_xgb = XGBClassifier(random_state=0)

# Tune, then persist the winning estimator to disk
best_xgb = cv('rand', model_xgb, params_xgb)
joblib.dump(best_xgb, 'models/best_xgb.pkl')


Best Parameters: {'subsample': 1, 'n_estimators': 4000, 'max_depth': 1, 'learning_rate': 0.01}
Best Estimator: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=1, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=4000, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)
Best Score: 0.9601808954026029
Train Score: 0.9663768023420187
Valid Score: 0.9550378642022846


['models/best_xgb.pkl']

In [10]:
# Rank the tuned XGBoost model's features by importance.
# The path uses the same lowercase 'models/' directory the model was dumped to,
# so the load also works on case-sensitive filesystems.
model = joblib.load("models/best_xgb.pkl")

# Label each importance with its feature name up front so sorting keeps the
# name/score pairs together, without a manual positional lookup.
fi = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Print every feature rather than a hard-coded count of 10; iterating the
# underlying array keeps the values as NumPy scalars, so they print as before.
for feature, importance in zip(fi.index, fi.values):
    print(feature, ":", importance)

Lifetime : 0.37678197
Contract_period : 0.28499743
Age : 0.13168238
Avg_class_frequency_total : 0.07265435
Avg_additional_charges_total : 0.049443886
Group_visits : 0.047559522
Promo_friends : 0.022872498
Near_Location : 0.014007936
gender : 0.0
Partner : 0.0


In [15]:
# Load the sample dataset used to try out the saved model
df = pd.read_csv('Data/gym_churn_sample.csv')

# Separate the target from the features, dropping the same columns as for training
y = df['Churn']
X = df.drop(columns=['Churn', 'Phone', 'Month_to_end_contract', 'Avg_class_frequency_current_month'])
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gender                        20 non-null     int64  
 1   Near_Location                 20 non-null     int64  
 2   Partner                       20 non-null     int64  
 3   Promo_friends                 20 non-null     int64  
 4   Contract_period               20 non-null     int64  
 5   Group_visits                  20 non-null     int64  
 6   Age                           20 non-null     int64  
 7   Avg_additional_charges_total  20 non-null     float64
 8   Lifetime                      20 non-null     int64  
 9   Avg_class_frequency_total     20 non-null     float64
dtypes: float64(2), int64(8)
memory usage: 1.7 KB
None


In [16]:
# Reload the tuned XGBoost model from the path the randomized-search cell dumped it to
model = joblib.load('models/best_xgb.pkl')

In [17]:
# Predict class labels for the sample features
pred = model.predict(X)

In [18]:
# Display the predicted labels
pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [19]:
# Display the actual Churn labels of the sample for comparison with the predictions
y

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    1
14    0
15    0
16    0
17    0
18    0
19    1
Name: Churn, dtype: int64

In [20]:
# Predict per-class probabilities for the sample features (one column per class)
pred_proba = model.predict_proba(X)

In [21]:
# Display the predicted class probabilities
pred_proba

array([[9.5015842e-01, 4.9841568e-02],
       [9.9961537e-01, 3.8464557e-04],
       [3.5418630e-01, 6.4581370e-01],
       [9.9976838e-01, 2.3163031e-04],
       [9.9791932e-01, 2.0806785e-03],
       [8.8658261e-01, 1.1341736e-01],
       [9.9870986e-01, 1.2901254e-03],
       [9.4080824e-01, 5.9191741e-02],
       [9.2832935e-01, 7.1670637e-02],
       [6.0618842e-01, 3.9381155e-01],
       [9.9738795e-01, 2.6120637e-03],
       [9.9516219e-01, 4.8378347e-03],
       [9.8681474e-01, 1.3185289e-02],
       [6.3931137e-01, 3.6068863e-01],
       [9.8650402e-01, 1.3495977e-02],
       [9.9529535e-01, 4.7046267e-03],
       [9.9999291e-01, 7.0671690e-06],
       [9.9551511e-01, 4.4849003e-03],
       [9.8200154e-01, 1.7998435e-02],
       [3.1095326e-02, 9.6890467e-01]], dtype=float32)