In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from collections import Counter   

In [3]:
data_dir = "D:\\study\\sknetworks\\team_project\\sk_2nd\\git\\"
df = pd.read_csv(data_dir + 'data_totla_split_non_index.csv')
df.head(2)

Unnamed: 0,msno,city,bd,registered_via,is_churn,is_back,payment_plan_sum,plan_list_price,actual_amount_paid,discount_rate,is_auto_renew,is_cancel,transaction_count,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,registration_init_time_year,registration_init_time_month,registration_init_time_day,membership_expire_date_year,membership_expire_date_month,membership_expire_date_day,log_end_year,log_end_month,log_end_day,log_start_year,log_start_month,log_start_day,gender_encoded
0,6597367,4,30.0,9,0,0,603,2980,3129,0.0,1.0,0.0,21,77,22,16,44,658,287,171508.646,2011,9,16,2017,5,17,2017,3,31,2015,2,9,1
1,3629318,5,34.0,9,1,0,360,1072,1072,0.0,0.0,0.0,2,145,84,58,42,1678,1158,448313.054,2011,9,16,2017,8,23,2017,3,30,2016,8,22,1


In [12]:
X = df.drop(['msno','is_churn'],axis=1)
y = df['is_churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
skf = StratifiedKFold(n_splits=5)
dt_clf = DecisionTreeClassifier(random_state=42)
params = {
    # "max_depth": [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
    "max_depth": [1,2,3,4,5,6,7,8,9,10,11,12]
}
gscv_tree = GridSearchCV (dt_clf, params, scoring ='accuracy', cv = skf)
gscv_tree.fit(X_train_scaled, y_train)
print(gscv_tree.best_estimator_)

DecisionTreeClassifier(max_depth=9, random_state=42)


In [14]:
stratified_kfold = StratifiedKFold(n_splits=5)
dt_clf = DecisionTreeClassifier(max_depth=9, random_state=42)

dt_accuracy = []
dt_precision = []
dt_recall = []

for train_index, val_index in stratified_kfold.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    dt_clf.fit(X_train, y_train)
    y_pred_dt = dt_clf.predict(X_val)   # 검증 데이터로 예측

    dt_accuracy.append(accuracy_score(y_val, y_pred_dt)) 
    
    dt_precision.append(precision_score(y_val, y_pred_dt)) 
    
    dt_recall.append(recall_score(y_val, y_pred_dt)) 

print("의사결정나무 정확도:", np.mean(dt_accuracy))
print("의사결정나무 정밀도:", np.mean(dt_precision))
print("의사결정나무 재현율:", np.mean(dt_recall))

의사결정나무 정확도: 0.9396757285244833
의사결정나무 정밀도: 0.8807587359060222
의사결정나무 재현율: 0.7589527947456112


In [16]:
counter = Counter(y)
print(counter)

ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

counter = Counter(y_res)
print(counter)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Counter({0: 303621, 1: 64589})




Counter({0: 303621, 1: 303621})


In [21]:
skf = StratifiedKFold(n_splits=5)
dt_clf = DecisionTreeClassifier(random_state=42)
params = {
    # "max_depth": [50,55,60,65,70,75,80,85,90,95,100]
    "max_depth": [46,47,48,49,50,51,52,53,54]
    # "max_depth": [26,27,28,29,30,35,40,45,50]
    # "max_depth": [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
    # "max_depth": [1,2,3,4,5,6,7,8,9,10,11,12]
}
gscv_tree = GridSearchCV (dt_clf, params, scoring ='accuracy', cv = skf)
gscv_tree.fit(X_train_scaled, y_train)
print(gscv_tree.best_estimator_)

DecisionTreeClassifier(max_depth=48, random_state=42)


In [23]:
stratified_kfold = StratifiedKFold(n_splits=5)
dt_clf = DecisionTreeClassifier(max_depth=48, random_state=42)

dt_accuracy = []
dt_precision = []
dt_recall = []

for train_index, val_index in stratified_kfold.split(X_res, y_res):
    X_train, y_train = X_res.iloc[train_index], y_res.iloc[train_index]
    X_val, y_val = X_res.iloc[val_index], y_res.iloc[val_index]

    dt_clf.fit(X_train, y_train)
    y_pred_dt = dt_clf.predict(X_val)   # 검증 데이터로 예측

    dt_accuracy.append(accuracy_score(y_val, y_pred_dt)) 
    
    dt_precision.append(precision_score(y_val, y_pred_dt)) 
    
    dt_recall.append(recall_score(y_val, y_pred_dt)) 

print("의사결정나무 정확도:", np.mean(dt_accuracy))
print("의사결정나무 정밀도:", np.mean(dt_precision))
print("의사결정나무 재현율:", np.mean(dt_recall))

의사결정나무 정확도: 0.9672041753545024
의사결정나무 정밀도: 0.9398506157794057
의사결정나무 재현율: 0.9983235675545007
