In [1]:
import warnings
warnings.filterwarnings('ignore')

### 교차 검증과 그리드 서치
- 머신러닝을 사용할 때 모델의 정확도를 측정하기 위해 반드시 사용해야 하는 방법 <br><br>
- 딥러닝시에는 데이터의 크기가 크므로 이 방법은 사용할 필요가 없다.

In [2]:
import pandas as pd

wine = pd.read_csv('../Data/wine.csv')
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
data = wine.iloc[:,:3].to_numpy()
target = wine['class'].to_numpy()

### 검증 세트 추가
- 훈련 : 60% 검증 : 20% 테스트 : 20%

In [4]:
from sklearn.model_selection import train_test_split

train_input , test_input , train_target , test_target = \
    train_test_split(data,target,test_size=0.2,random_state=42)

In [5]:
sub_input , val_input , sub_target , val_target = \
    train_test_split(train_input,train_target,test_size=0.2,random_state=42)

In [6]:
# 세트별 크기 비교
print('train : ', sub_input.shape)
print('val : ', val_input.shape)
print('test : ', test_input.shape)

train :  (4157, 3)
val :  (1040, 3)
test :  (1300, 3)


In [7]:
# 모델 tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input,sub_target)

print('train : ', dt.score(sub_input,sub_target))
print('valid : ',dt.score(val_input,val_target))


train :  0.9971133028626413
valid :  0.864423076923077


In [8]:
# 교차 검증 : 최소 5번은 해야 된다. 
from sklearn.model_selection import cross_validate
scores = cross_validate(dt, train_input, train_target)
scores

{'fit_time': array([0.004498  , 0.00748801, 0.00470185, 0.00515771, 0.00444818]),
 'score_time': array([0.00053191, 0.00067616, 0.00048995, 0.0005703 , 0.00047088]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [9]:
# 교차 검증후의 정확도 판단
import numpy as np
np.mean(scores['test_score'])

0.855300214703487

### KFold를 이용한 방법

In [10]:
from sklearn.model_selection import StratifiedKFold
splitter = StratifiedKFold()
scores = cross_validate(dt,train_input,train_target,cv=splitter)
scores

{'fit_time': array([0.0045619 , 0.00542378, 0.00539994, 0.00450516, 0.00469708]),
 'score_time': array([0.00051308, 0.00105   , 0.00054216, 0.00046682, 0.00062299]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [11]:
# KFold의 Fold 중 10개로 나누어서 교차검증
splitter = StratifiedKFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_validate(dt,train_input,train_target,cv=splitter)
scores

{'fit_time': array([0.00619102, 0.00507331, 0.0051291 , 0.00776386, 0.00509405,
        0.00578904, 0.00518918, 0.00493693, 0.0073421 , 0.00493121]),
 'score_time': array([0.00048804, 0.00042987, 0.00041389, 0.00044012, 0.00045204,
        0.00053   , 0.00039983, 0.00054884, 0.00042295, 0.00039482]),
 'test_score': array([0.83461538, 0.87884615, 0.85384615, 0.85384615, 0.84615385,
        0.87307692, 0.85961538, 0.85549133, 0.85163776, 0.86705202])}

---
### 그리드 서치(Grid Search)를 이용한 최적의 Hyper Parameter 값 찾기

In [12]:
from sklearn.model_selection import GridSearchCV
params = {'min_impurity_decrease':[0.0001 , 0.0002 , 0.0003 , 0.0004 , 0.0005]}

In [13]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params , n_jobs=-1)

In [14]:
gs.fit(train_input,train_target)

In [15]:
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [16]:
gs.best_params_

{'min_impurity_decrease': 0.0001}

In [17]:
# 교차 검증
gs.cv_results_['mean_test_score'].mean()

0.8666152735618569