## 다양한 교차검증
---
 - model_selection 모듈
   - cross_validate()
   - cross_val_score()
   - cross_val_predict()

In [8]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

### [1] 데이터 로딩

In [3]:
# Load the iris dataset; the returned object is kept whole so its fields can be pulled out below.
iris=load_iris()

In [4]:
# The loaded dataset is a Bunch (dict-like container), so every field
# is reachable as an attribute as well as by key.
data = iris.data
target = iris.target
featureName = iris.feature_names  # column names
className = iris.target_names     # class names

In [7]:
# Display the feature (column) names together with the class names.
featureName, className

(['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 array(['setosa', 'versicolor', 'virginica'], dtype='<U10'))

### [2] 모델 생성

In [11]:
# max_iter defaults to 100; it is raised to 500 here.
# NOTE(review): the original note describes max_iter as the number of epochs
# (full passes over the training samples). In scikit-learn it is the maximum
# number of solver iterations - confirm the intended wording.
lrModel=LogisticRegression(max_iter=500)

In [24]:
# Train and evaluate the model with cross-validation:
# the data is split into `cv` folds and each fold is used once for validation.
# cv defaults to 5 when omitted; 10 folds are requested here.
result = cross_val_score(estimator=lrModel, X=data, y=target, cv=10)

# One accuracy score per fold (cv scores in total).
result

array([1.        , 0.93333333, 1.        , 1.        , 0.93333333,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [27]:
# cross_validate() returns a dict with fit/score times and the per-fold test
# scores; return_train_score=True adds each fold's training-set score.
allResult = cross_validate(
    estimator=lrModel,
    X=data,
    y=target,
    cv=10,
    return_train_score=True,
)
allResult

{'fit_time': array([0.06582332, 0.0602386 , 0.06482911, 0.04488301, 0.04590487,
        0.07798314, 0.05884552, 0.04488039, 0.04088974, 0.02992225]),
 'score_time': array([0.00099754, 0.00099421, 0.00099778, 0.00099421, 0.00099707,
        0.0009973 , 0.00199151, 0.00099707, 0.00099945, 0.00099587]),
 'test_score': array([1.        , 0.93333333, 1.        , 1.        , 0.93333333,
        0.93333333, 0.93333333, 1.        , 1.        , 1.        ]),
 'train_score': array([0.97037037, 0.97777778, 0.97037037, 0.97037037, 0.97777778,
        0.97777778, 0.98518519, 0.97037037, 0.97037037, 0.97777778])}

In [28]:
# Tabulate the cross_validate() result dict: one row per fold, one column per key.
pd.DataFrame(allResult)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.065823,0.000998,1.0,0.97037
1,0.060239,0.000994,0.933333,0.977778
2,0.064829,0.000998,1.0,0.97037
3,0.044883,0.000994,1.0,0.97037
4,0.045905,0.000997,0.933333,0.977778
5,0.077983,0.000997,0.933333,0.977778
6,0.058846,0.001992,0.933333,0.985185
7,0.04488,0.000997,1.0,0.97037
8,0.04089,0.000999,1.0,0.97037
9,0.029922,0.000996,1.0,0.977778


In [32]:
# Build an explicit splitter object - KFold()
# KFold cuts the samples into n_splits folds of (nearly) equal size without
# looking at the class labels; shuffle=True randomises the sample order first.
# NOTE: no random_state is given, so the folds (and scores) differ between runs.
from sklearn.model_selection import KFold, StratifiedKFold

kSplitter = KFold(shuffle=True, n_splits=7)

# return_train_score (default False): also report each fold's training-set score
# return_estimator (default False): also return the estimator fitted on each fold
allResult = cross_validate(
    estimator=lrModel,
    X=data,
    y=target,
    cv=kSplitter,
    return_estimator=True,
    return_train_score=True,
)
pd.DataFrame(allResult)

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,0.036899,0.000998,LogisticRegression(max_iter=500),0.954545,0.976562
1,0.033911,0.000998,LogisticRegression(max_iter=500),1.0,0.976562
2,0.043881,0.000999,LogisticRegression(max_iter=500),0.954545,0.976562
3,0.036901,0.000998,LogisticRegression(max_iter=500),0.952381,0.968992
4,0.036899,0.000998,LogisticRegression(max_iter=500),0.952381,0.968992
5,0.035905,0.000996,LogisticRegression(max_iter=500),0.952381,0.984496
6,0.041926,0.000991,LogisticRegression(max_iter=500),0.952381,0.976744
