In [1]:
import os
import pandas as pd
import numpy as np 

from scipy.stats import uniform, randint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline


from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
from sklearn.compose import make_column_transformer

In [3]:
df_raw = pd.read_csv('./data/Customer-Churn-Records.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RowNumber           10000 non-null  int64  
 1   CustomerId          10000 non-null  int64  
 2   Surname             10000 non-null  object 
 3   CreditScore         10000 non-null  int64  
 4   Geography           10000 non-null  object 
 5   Gender              10000 non-null  object 
 6   Age                 10000 non-null  int64  
 7   Tenure              10000 non-null  int64  
 8   Balance             10000 non-null  float64
 9   NumOfProducts       10000 non-null  int64  
 10  HasCrCard           10000 non-null  int64  
 11  IsActiveMember      10000 non-null  int64  
 12  EstimatedSalary     10000 non-null  float64
 13  Exited              10000 non-null  int64  
 14  Complain            10000 non-null  int64  
 15  Satisfaction Score  10000 non-null  int64  
 16  Card 

In [4]:
df_bank = df_raw[[ 'Exited', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',  'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'Satisfaction Score', 'Geography', 'Gender', 'Card Type' ]].copy()


In [5]:
data = df_bank[['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',  'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'Satisfaction Score', 'Geography', 'Gender', 'Card Type']].to_numpy()

target = df_bank['Exited'].to_numpy()
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)

In [6]:
data[3,10]

'France'

In [7]:
numeric_features = [num for num in range(0,10)] 
numeric_transformer = StandardScaler()

categorical_features = [10, 11 ,12]
categorical_transformer = OneHotEncoder()

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features)
)

dists = {
    'gradientboostingclassifier__min_impurity_decrease': uniform(0.0001, 0.001),
    'gradientboostingclassifier__max_depth' : randint(5, 30),
    'gradientboostingclassifier__max_features' : randint(3, 12),
    'gradientboostingclassifier__n_estimators' : [60, 80, 100, 120, 140, 180]
}

pipe = make_pipeline(
    # StandardScaler(),  
    preprocessor,
    GradientBoostingClassifier() 
)
rscv= RandomizedSearchCV(
    pipe,
    param_distributions=dists, # 파라미터 입력
    n_iter = 200,   # random search 탐색 횟수
    cv = 5,        # cv 검증을 위한 분할 검증 횟수
    scoring='accuracy',  # 오차 평가방법
    verbose=1,     # 진행상황
    random_state = 2,
    n_jobs=-1
  )

rscv.fit(train_input, train_target)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [8]:
print('best_params_: ', rscv.best_params_)


pred_test = rscv.predict(test_input)
print('accuracy     ' , accuracy_score(test_target, pred_test))
print('precision    ' , precision_score(test_target, pred_test))
print('recall       ' , recall_score(test_target, pred_test))
print('F1           ',f1_score(test_target, pred_test))

best_params_:  {'gradientboostingclassifier__max_depth': 16, 'gradientboostingclassifier__max_features': 11, 'gradientboostingclassifier__min_impurity_decrease': np.float64(0.00042053643711160727), 'gradientboostingclassifier__n_estimators': 80}
accuracy      0.999
precision     0.9974554707379135
recall        0.9974554707379135
F1            0.9974554707379135


In [61]:
import pickle
with open('rscv_gbc_model.pkl', 'wb') as f:
    pickle.dump(rscv, f)



In [62]:
with open('rscv_gbc_model.pkl', 'rb') as f:
    model = pickle.load(f)

pred_test2 = model.predict(test_input)

print('accuracy     ' , accuracy_score(test_target, pred_test2))
print('precision    ' , precision_score(test_target, pred_test2))
print('recall       ' , recall_score(test_target, pred_test2))
print('F1           ',f1_score(test_target, pred_test2))

accuracy      0.999
precision     0.9974554707379135
recall        0.9974554707379135
F1            0.9974554707379135
