# 성능튜닝

## 1.환경준비

### (1) import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 모델링
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import * 

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) 데이터 준비

* 변수설명
    * COLLEGE : 대학 졸업여부
    * INCOME : 연수입
    * OVERAGE : 월평균 초과사용 시간(분)
    * LEFTOVER : 월평균 잔여시간비율(%)
    * HOUSE : 집값
    * HANDSET_PRICE : 스마트폰 가격
    * OVER_15MINS_CALLS_PER_MONTH : 월평균 장기통화(15분이상) 횟수
    * AVERAGE_CALL_DURATION : 평균 통화 시간
    * REPORTED_SATISFACTION : 만족도 설문조사 결과
    * REPORTED_USAGE_LEVEL : 사용도 자가진단 결과
    * CONSIDERING_CHANGE_OF_PLAN : 향후 변경계획 설문조사 결과
    * CHURN : 이탈(번호이동) 여부 (1-이탈, 0-잔류, Target 변수)


In [2]:
# 데이터를 불러옵시다.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
data = data.sample(5000, random_state = 2022)
data['CHURN'] = data['CHURN'].map({'LEAVE':1, 'STAY':0})
# sklearn에서는 y 가변수화 할 필요 없다.
# 그런데 statsmodels 라이브러리에서는 반드시 1, 0 이어야함.
# 그리고 tensorflow, keras에서도 1, 0 으로 만들어야 함.
# 여기서는 왜 필요? 전진선택법에서 aic기반 선택할 때 statsmodel의 logistic 회귀를 가져왔기 때문
# (forward_stepwise_logistic 함수)

data.head()

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
3178,3179,0,119512,51,31,248566,229,5,2,very_sat,very_high,considering,1
14926,14927,1,142144,192,15,774317,581,29,4,unsat,very_little,never_thought,1
15116,15117,1,142308,0,79,306426,497,1,1,sat,little,considering,0
12733,12734,1,113385,0,0,333599,819,1,6,very_unsat,very_high,considering,1
14032,14033,1,90348,209,10,637286,360,26,4,unsat,little,actively_looking_into_it,0


## 2.데이터 준비

### (1) 데이터 정리

In [3]:
drop_cols = ['id']
data.drop(drop_cols, axis = 1, inplace = True )

### (2) 데이터분할1 : x, y 나누기

In [4]:
target = 'CHURN'
x = data.drop(target, axis = 1)
y = data.loc[:, target]

### (3) NA 조치

### (4) 가변수화

In [5]:
dumm_cols = ['REPORTED_SATISFACTION','REPORTED_USAGE_LEVEL','CONSIDERING_CHANGE_OF_PLAN']
x = pd.get_dummies(x, columns = dumm_cols, drop_first = True)

### (5) 데이터분할2 : train : validation 나누기

In [6]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 20)

### (6) Scaling

In [7]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

## 3.선형모델 튜닝

Logistic Regression : 전진선택법
* 변수를 하나씩 늘려가면서
* AIC를 가장 낮추는 모델 찾기

### (1) 전진선택을 수행할 함수 만들기( **로지스틱 회귀** 용)

In [8]:
# 아래 함수는 로지스틱 회귀를 위한 전진선택법 함수 입니다.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train): # 전진선택법 로지스틱 회귀

    # 변수목록, 선택된 변수 목록, 단계별 모델과 AIC 저장소 정의
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({ 'step':[], 'feature':[],'aic':[]})

    # 
    for s in range(0, len(features)) :
        result =  { 'step':[], 'feature':[],'aic':[]}

        # 변수 목록에서 변수 한개씩 뽑아서 모델에 추가
        for f in features :
            vars = selected + [f]
            x_tr = x_train[vars]
            model = sm.Logit(y_train, x_tr).fit()
            result['step'].append(s+1)
            result['feature'].append(vars)
            result['aic'].append(model.aic)
        
        # 모델별 aic 집계
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop = True)

        # 만약 이전 aic보다 새로운 aic 가 크다면 멈추기
        if step_df['aic'].min() < temp['aic'].min() :
            break
        step_df = pd.concat([step_df, temp], axis = 0).reset_index(drop = True)

        # 선택된 변수 제거
        v = temp.loc[0,'feature'][s]
        features.remove(v)

        selected.append(v)
    
    # 선택된 변수와 step_df 결과 반환
    return selected, step_df

### (2) 전진선택법 수행

In [9]:
vars, result = forward_stepwise_logistic(x_train, y_train)
#vars: 선택된 변수들, result: 어떻게 선택했는지 DF에 저장하는 변수

Optimization terminated successfully.
         Current function value: 0.693075
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693062
         Iterations 2
Optimization terminated successfully.
         Current function value: 0.683528
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693007
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.684909
         Iterations 2
Optimization terminated successfully.
         Current function value: 0.693042
         Iterations 2
Optimization terminated successfully.
         Current function value: 0.686899
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692565
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692806
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693137
  

Optimization terminated successfully.
         Current function value: 0.633888
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633674
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633874
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633866
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633847
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633112
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.632641
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633102
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.632954
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.632697
  

* 선택된 변수

In [10]:
vars

['OVERAGE',
 'HOUSE',
 'HANDSET_PRICE',
 'LEFTOVER',
 'REPORTED_SATISFACTION_very_sat',
 'INCOME',
 'REPORTED_SATISFACTION_sat']

In [11]:
list(x_train)
# 원래 변수는 이렇게 많음.

['COLLEGE',
 'INCOME',
 'OVERAGE',
 'LEFTOVER',
 'HOUSE',
 'HANDSET_PRICE',
 'OVER_15MINS_CALLS_PER_MONTH',
 'AVERAGE_CALL_DURATION',
 'REPORTED_SATISFACTION_sat',
 'REPORTED_SATISFACTION_unsat',
 'REPORTED_SATISFACTION_very_sat',
 'REPORTED_SATISFACTION_very_unsat',
 'REPORTED_USAGE_LEVEL_high',
 'REPORTED_USAGE_LEVEL_little',
 'REPORTED_USAGE_LEVEL_very_high',
 'REPORTED_USAGE_LEVEL_very_little',
 'CONSIDERING_CHANGE_OF_PLAN_considering',
 'CONSIDERING_CHANGE_OF_PLAN_never_thought',
 'CONSIDERING_CHANGE_OF_PLAN_no',
 'CONSIDERING_CHANGE_OF_PLAN_perhaps']

In [12]:
result

Unnamed: 0,step,feature,aic
0,1.0,[OVERAGE],4786.699456
1,1.0,[HOUSE],4796.363859
2,1.0,[OVER_15MINS_CALLS_PER_MONTH],4810.294604
3,1.0,[REPORTED_SATISFACTION_very_sat],4845.064834
4,1.0,[AVERAGE_CALL_DURATION],4849.951663
...,...,...,...
114,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.309949
115,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.315315
116,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.316816
117,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.448187


In [13]:
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

### (3) 모델링

* 전체 변수 

In [14]:
m1 = LogisticRegression()
m1.fit(x_train, y_train)
p1 = m1.predict(x_val)

print(accuracy_score(y_val, p1))
print(classification_report(y_val, p1))

0.6333333333333333
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       738
           1       0.65      0.59      0.62       762

    accuracy                           0.63      1500
   macro avg       0.63      0.63      0.63      1500
weighted avg       0.64      0.63      0.63      1500



* 전진선택법 변수

In [15]:
m2 = LogisticRegression()
m2.fit(x_train[vars], y_train)
p2 = m2.predict(x_val[vars])

print(accuracy_score(y_val, p2))
print(classification_report(y_val, p2))

0.634
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       738
           1       0.66      0.59      0.62       762

    accuracy                           0.63      1500
   macro avg       0.64      0.63      0.63      1500
weighted avg       0.64      0.63      0.63      1500



## 4.하이퍼파라미터 튜닝

### (1) 필요한 함수 불러오기 

In [16]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

### (2) Random Search

① 값의 범위를 지정한다.  
② 모델 선언(시도 횟수 지정)  
③ 모델링(값의 범위 내에서 시도 횟수만큼 랜덤하게 선택해서 시도한다.)  
④ 가장 성능이 좋은 값을 선정


#### ① 값의 범위를 지정한다.

In [17]:
# dictionary형태로 선언
params = { 'n_neighbors' : range(1,51), 'metric' : ['euclidean', 'manhattan']  }
params

{'n_neighbors': range(1, 51), 'metric': ['euclidean', 'manhattan']}

#### ② 모델 선언

In [18]:
# 기본모델
model = KNeighborsClassifier()

# Random Search 설정.
model_rs = RandomizedSearchCV(model
                            , params              # hyperparameter 범위 지정.
                            , cv=5                    # k-fold Cross Validation
                            , n_iter=5                # Random하게 시도할 횟수
                            )

#### ③ 모델링

In [19]:
# 학습 : model이 아니라 model_rs
model_rs.fit(x_train_s, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=5,
                   param_distributions={'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': range(1, 51)})

In [20]:
# 튜닝 결과
model_rs.cv_results_

#params하고 mean_test_score보자.

{'mean_fit_time': array([0.00138106, 0.00189905, 0.00298867, 0.00180182, 0.00172544]),
 'std_fit_time': array([0.00050243, 0.00058234, 0.00108816, 0.0007399 , 0.00074265]),
 'mean_score_time': array([0.09201694, 0.107972  , 0.10778399, 0.09021649, 0.11508393]),
 'std_score_time': array([0.00444361, 0.00868424, 0.00484474, 0.0051229 , 0.01292402]),
 'param_n_neighbors': masked_array(data=[26, 45, 25, 2, 6],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_metric': masked_array(data=['euclidean', 'euclidean', 'euclidean', 'euclidean',
                    'manhattan'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 26, 'metric': 'euclidean'},
  {'n_neighbors': 45, 'metric': 'euclidean'},
  {'n_neighbors': 25, 'metric': 'euclidean'},
  {'n_neighbors': 2, 'metric': 'euclidean'},
  {'n_neighbors': 6, 'metric': 'manhattan'}],
 'split0_test

In [21]:
model_rs.cv_results_['params']

[{'n_neighbors': 26, 'metric': 'euclidean'},
 {'n_neighbors': 45, 'metric': 'euclidean'},
 {'n_neighbors': 25, 'metric': 'euclidean'},
 {'n_neighbors': 2, 'metric': 'euclidean'},
 {'n_neighbors': 6, 'metric': 'manhattan'}]

In [22]:
model_rs.cv_results_['mean_test_score']

array([0.58657143, 0.59      , 0.58457143, 0.56457143, 0.57857143])

In [23]:
# 최적의 파라미터
model_rs.best_params_

{'n_neighbors': 45, 'metric': 'euclidean'}

In [24]:
# 그때의 성능
model_rs.best_score_

0.5900000000000001

In [25]:
# best 모델로 예측 및 평가
pred = model_rs.predict(x_val_s) #랜덤서치로 튜닝시도한 모델, .predict => 뭘로 시도하지? 가장 좋은걸로 지가 알아서 시도해줌.
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.58      0.70      0.63       738
           1       0.63      0.50      0.56       762

    accuracy                           0.60      1500
   macro avg       0.60      0.60      0.60      1500
weighted avg       0.60      0.60      0.60      1500



### (3) 실습 : Random Search

* decision tree로 튜닝을 시도해 봅시다.
    * max_depth : 1~10
    * min_samples_leaf : 10 ~ 100

#### ① 값의 범위를 지정한다.

In [26]:
params = {'max_depth':range(1, 11), 'min_samples_leaf':range(10, 101)}
params

{'max_depth': range(1, 11), 'min_samples_leaf': range(10, 101)}

#### ② 모델 선언

In [27]:
model = DecisionTreeClassifier()
model_rs2 = RandomizedSearchCV(model, params, cv=5, n_iter=5)

#### ③ 모델링

In [28]:
model_rs2.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=5,
                   param_distributions={'max_depth': range(1, 11),
                                        'min_samples_leaf': range(10, 101)})

In [29]:
pred2 = model_rs2.predict(x_val)

In [30]:
print(classification_report(y_val, pred2))

              precision    recall  f1-score   support

           0       0.69      0.68      0.68       738
           1       0.69      0.70      0.69       762

    accuracy                           0.69      1500
   macro avg       0.69      0.69      0.69      1500
weighted avg       0.69      0.69      0.69      1500



### (4) Grid Search

① 값의 범위를 지정한다.  
② 모델링(값의 범위 내에서 모든 조합을 다 시도한다.)  
③ 가장 성능이 좋은 값을 선정


#### ① 값의 범위를 지정한다.

In [31]:
# dictionary형태로 선언
params = { 'n_neighbors' : range(3,31,2), 'metric' : ['euclidean', 'manhattan']  }
params
# 정수 순차 증가 : range
# 소수 순차 증가 : numpy의 linearspace

{'n_neighbors': range(3, 31, 2), 'metric': ['euclidean', 'manhattan']}

#### ② 모델 선언

In [32]:
# 기본모델
model = KNeighborsClassifier()

# Random Search 설정.
model_gs = GridSearchCV(model, params, cv=5)

#### ③ 모델링

In [33]:
# 학습 : model이 아니라 model_gs
model_gs.fit(x_train_s, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(3, 31, 2)})

In [34]:
# 튜닝 결과
model_gs.cv_results_

{'mean_fit_time': array([0.00350904, 0.00181975, 0.00173383, 0.00149984, 0.00215364,
        0.00179567, 0.0015326 , 0.0015933 , 0.00171347, 0.00170259,
        0.00155649, 0.00158939, 0.00240107, 0.00293903, 0.00163531,
        0.00172234, 0.00148883, 0.00180912, 0.00198264, 0.00127578,
        0.0019268 , 0.00159225, 0.00185447, 0.0016571 , 0.00176024,
        0.0019453 , 0.00153208, 0.00157638]),
 'std_fit_time': array([0.00398777, 0.00041509, 0.00073345, 0.00045183, 0.0004234 ,
        0.00074677, 0.00040074, 0.00048973, 0.00063682, 0.00040476,
        0.00050213, 0.00081784, 0.0018109 , 0.00202323, 0.00051643,
        0.00075696, 0.00051397, 0.00021116, 0.00062534, 0.00029913,
        0.00052717, 0.00049281, 0.00046179, 0.00035313, 0.00039178,
        0.00062229, 0.00045114, 0.00047777]),
 'mean_score_time': array([0.13120441, 0.09027948, 0.08511977, 0.08263245, 0.09669704,
        0.09404168, 0.09652243, 0.09237299, 0.10776124, 0.09260616,
        0.09745083, 0.10592351, 0.110703

In [35]:
model_gs.cv_results_['params']

[{'metric': 'euclidean', 'n_neighbors': 3},
 {'metric': 'euclidean', 'n_neighbors': 5},
 {'metric': 'euclidean', 'n_neighbors': 7},
 {'metric': 'euclidean', 'n_neighbors': 9},
 {'metric': 'euclidean', 'n_neighbors': 11},
 {'metric': 'euclidean', 'n_neighbors': 13},
 {'metric': 'euclidean', 'n_neighbors': 15},
 {'metric': 'euclidean', 'n_neighbors': 17},
 {'metric': 'euclidean', 'n_neighbors': 19},
 {'metric': 'euclidean', 'n_neighbors': 21},
 {'metric': 'euclidean', 'n_neighbors': 23},
 {'metric': 'euclidean', 'n_neighbors': 25},
 {'metric': 'euclidean', 'n_neighbors': 27},
 {'metric': 'euclidean', 'n_neighbors': 29},
 {'metric': 'manhattan', 'n_neighbors': 3},
 {'metric': 'manhattan', 'n_neighbors': 5},
 {'metric': 'manhattan', 'n_neighbors': 7},
 {'metric': 'manhattan', 'n_neighbors': 9},
 {'metric': 'manhattan', 'n_neighbors': 11},
 {'metric': 'manhattan', 'n_neighbors': 13},
 {'metric': 'manhattan', 'n_neighbors': 15},
 {'metric': 'manhattan', 'n_neighbors': 17},
 {'metric': 'manha

In [36]:
model_gs.cv_results_['mean_test_score'] 

array([0.57885714, 0.57942857, 0.57714286, 0.57342857, 0.568     ,
       0.57457143, 0.58085714, 0.58257143, 0.57914286, 0.58057143,
       0.586     , 0.58457143, 0.57971429, 0.58028571, 0.57314286,
       0.59142857, 0.59171429, 0.59142857, 0.59828571, 0.59771429,
       0.59942857, 0.60142857, 0.60171429, 0.61057143, 0.61      ,
       0.61142857, 0.61857143, 0.618     ])

In [37]:
# 최적의 파라미터
model_gs.best_params_

{'metric': 'manhattan', 'n_neighbors': 27}

In [38]:
# 그때의 성능
model_gs.best_score_

0.6185714285714285

In [39]:
# best 모델로 예측 및 평가
pred = model_gs.predict(x_val_s)
print(classification_report(y_val, pred))

#accuracy : 정분류율

              precision    recall  f1-score   support

           0       0.59      0.72      0.65       738
           1       0.65      0.51      0.57       762

    accuracy                           0.61      1500
   macro avg       0.62      0.61      0.61      1500
weighted avg       0.62      0.61      0.61      1500



### (5) 실습 : Grid Search

* decision tree로 튜닝을 시도해 봅시다.
    * max_depth : 1~10
    * min_samples_leaf : 10 ~ 100

#### ① 값의 범위를 지정한다.

In [40]:
params = {'max_depth':range(1, 11), 'min_samples_leaf':range(10, 101, 10)}
params

{'max_depth': range(1, 11), 'min_samples_leaf': range(10, 101, 10)}

#### ② 모델 선언

In [51]:
model = DecisionTreeClassifier()
model_gs2 = GridSearchCV(model, params, cv=5, verbose=2)
# verbose : 1보다 크면(2) 시간만 보여줌
#2보다 크면 score
#3보다 크면 시간+score

#### ③ 모델링

In [52]:
model_gs2.fit(x_train, y_train)
# 하이퍼 파라미터 값을 조정하면서 모델만듦
# 모델 만들고 좋은 모델인지 아닌지 알려면 검증해야함
# 검증하면 성능나옴 (best_score_)

# .fit하면 조정->모델만들고->평가 함.
# 튜닝하면서 검증한 것 중 가장 성능이 좋은 애가 best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ...................max_depth=1, min_samples_leaf=10; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=10; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=10; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=10; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=10; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=1, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=1, min_s

[CV] END ...................max_depth=3, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=3, min_samples_leaf=40; total time=   0.0s
[CV] END ...................

[CV] END ...................max_depth=5, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=20; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=30; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=5, min_samples_leaf=40; total time=   0.0s
[CV] END ...................

[CV] END ...................max_depth=7, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=40; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=7, min_samples_leaf=60; total time=   0.0s
[CV] END ...................

[CV] END ...................max_depth=9, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=50; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=60; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=70; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=70; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=70; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=70; total time=   0.0s
[CV] END ...................max_depth=9, min_samples_leaf=70; total time=   0.0s
[CV] END ...................

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(10, 101, 10)},
             verbose=2)

In [43]:
model_gs2.cv_results_

{'mean_fit_time': array([0.00693169, 0.00584555, 0.00530314, 0.00520706, 0.00604177,
        0.00558372, 0.00573707, 0.00580063, 0.00579801, 0.00597653,
        0.00782967, 0.00736585, 0.00813313, 0.00735502, 0.00786867,
        0.00800309, 0.00741339, 0.00719814, 0.0076704 , 0.00730762,
        0.00988288, 0.00829854, 0.00911789, 0.00871925, 0.0093061 ,
        0.00958619, 0.00934649, 0.01001916, 0.01141081, 0.0143167 ,
        0.01228786, 0.01286716, 0.01223888, 0.01061211, 0.01081963,
        0.01162362, 0.01305203, 0.01159658, 0.01087236, 0.01042953,
        0.01212044, 0.01279464, 0.01409078, 0.01268468, 0.0119483 ,
        0.01429372, 0.01517897, 0.01186371, 0.01227722, 0.01143355,
        0.0145299 , 0.01365714, 0.01321006, 0.01495323, 0.01256204,
        0.01294026, 0.012044  , 0.0116374 , 0.017594  , 0.01649156,
        0.02195392, 0.02131963, 0.01963868, 0.01448493, 0.0144033 ,
        0.01597538, 0.012498  , 0.01297464, 0.01205463, 0.01245193,
        0.01553721, 0.01595817,

In [44]:
model_gs2.cv_results_['params']

[{'max_depth': 1, 'min_samples_leaf': 10},
 {'max_depth': 1, 'min_samples_leaf': 20},
 {'max_depth': 1, 'min_samples_leaf': 30},
 {'max_depth': 1, 'min_samples_leaf': 40},
 {'max_depth': 1, 'min_samples_leaf': 50},
 {'max_depth': 1, 'min_samples_leaf': 60},
 {'max_depth': 1, 'min_samples_leaf': 70},
 {'max_depth': 1, 'min_samples_leaf': 80},
 {'max_depth': 1, 'min_samples_leaf': 90},
 {'max_depth': 1, 'min_samples_leaf': 100},
 {'max_depth': 2, 'min_samples_leaf': 10},
 {'max_depth': 2, 'min_samples_leaf': 20},
 {'max_depth': 2, 'min_samples_leaf': 30},
 {'max_depth': 2, 'min_samples_leaf': 40},
 {'max_depth': 2, 'min_samples_leaf': 50},
 {'max_depth': 2, 'min_samples_leaf': 60},
 {'max_depth': 2, 'min_samples_leaf': 70},
 {'max_depth': 2, 'min_samples_leaf': 80},
 {'max_depth': 2, 'min_samples_leaf': 90},
 {'max_depth': 2, 'min_samples_leaf': 100},
 {'max_depth': 3, 'min_samples_leaf': 10},
 {'max_depth': 3, 'min_samples_leaf': 20},
 {'max_depth': 3, 'min_samples_leaf': 30},
 {'max_de

In [45]:
model_gs2.cv_results_['mean_test_score'] 
# mean_test_score : 5개 CV 세트에서 검증용 데이터 세트의 정확도 평균 수치

array([0.614     , 0.614     , 0.614     , 0.614     , 0.614     ,
       0.614     , 0.614     , 0.614     , 0.614     , 0.614     ,
       0.66142857, 0.66142857, 0.66142857, 0.66142857, 0.66142857,
       0.66142857, 0.66142857, 0.66142857, 0.66142857, 0.66142857,
       0.692     , 0.69314286, 0.69314286, 0.69314286, 0.69314286,
       0.69314286, 0.69314286, 0.69314286, 0.69314286, 0.69514286,
       0.68942857, 0.68857143, 0.68942857, 0.68971429, 0.68942857,
       0.68914286, 0.69285714, 0.69342857, 0.69742857, 0.69314286,
       0.686     , 0.69285714, 0.69514286, 0.69314286, 0.68857143,
       0.688     , 0.69057143, 0.69085714, 0.69771429, 0.69342857,
       0.67885714, 0.68428571, 0.68714286, 0.68657143, 0.69114286,
       0.68742857, 0.69142857, 0.68971429, 0.69571429, 0.69228571,
       0.67428571, 0.68828571, 0.70028571, 0.69628571, 0.68971429,
       0.686     , 0.69      , 0.69114286, 0.69542857, 0.69228571,
       0.668     , 0.688     , 0.694     , 0.694     , 0.69228

In [46]:
model_gs2.best_params_

{'max_depth': 7, 'min_samples_leaf': 30}

In [47]:
model_gs2.best_score_

0.7002857142857143

In [48]:
pred_gs2 = model_gs2.predict(x_val)

In [53]:
print(confusion_matrix(y_val, pred_gs2))

[[517 221]
 [261 501]]


In [49]:
print(classification_report(y_val, pred_gs2))

# 왜 best_score_ 와 classification_report의 accuracy가 다르게 나올까?
# train과 validation 데이터가 다르니까!

# best_score_는 train 중 검증된 것 중 가장 성능이 좋은 것들
# classification_report는 validation으로 predict 후 보여주는 거니까 best_score_과 값이 다를 수 있음

# 차이가 많이 나는 것은 ? 
# 데이터가 잘 섞이지 않았거나 데이터가 너무 적어서 오차가 많이 나는 것들

              precision    recall  f1-score   support

           0       0.66      0.70      0.68       738
           1       0.69      0.66      0.68       762

    accuracy                           0.68      1500
   macro avg       0.68      0.68      0.68      1500
weighted avg       0.68      0.68      0.68      1500

