<a href="https://colab.research.google.com/github/Seungkyu-Han/colab_ml/blob/main/k_fold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 교차 검증

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

In [2]:
from sklearn.datasets import load_iris

# Load the iris dataset and keep the feature matrix and target vector
# under the names the following cell reads.
iris = load_iris()
features, label = iris.data, iris.target

# Classifier with a fixed seed, and a 5-fold splitter (no shuffle argument given).
dt_clf = DecisionTreeClassifier(random_state=156)
kfold = KFold(n_splits=5)

# Per-fold accuracies are collected into this list by the fold loop.
cv_accuracy = list()

print('붓꽃 데이터 세트 크기', len(features))

붓꽃 데이터 세트 크기 150


In [3]:
# Reset the per-fold state in this cell so that re-running it on its own
# starts from scratch instead of appending to accuracies left over from a
# previous run (which would skew the mean printed at the end).
n_iter = 0
cv_accuracy = []

for train_index, test_index in kfold.split(features):
  # Fancy indexing: select this fold's rows using the index arrays.
  X_train, X_test = features[train_index], features[test_index]
  y_train, y_test = label[train_index], label[test_index]

  # Fit on the training rows, then predict the held-out rows.
  dt_clf.fit(X_train, y_train)
  pred = dt_clf.predict(X_test)
  n_iter += 1

  # Accuracy for this fold, rounded to 4 decimals, plus the split sizes.
  accuracy = np.round(accuracy_score(y_test, pred), 4)
  train_size = X_train.shape[0]
  test_size = X_test.shape[0]

  print(f'{n_iter} 교차 검증 정확도: {accuracy}, 학습 데이터 크기: {train_size}, 검증 데이터 크기: {test_size}')
  print(f'train_index: {train_index} test_index:{test_index}\n')
  print(f'{n_iter} 검증 세트 인덱스: {test_index}\n')
  cv_accuracy.append(accuracy)

# Mean of the per-fold accuracies collected above.
print('\n평균 검증 정확도: ', np.mean(cv_accuracy))

1 교차 검증 정확도: 1.0, 학습 데이터 크기: 120, 검증 데이터 크기: 30
train_index: [ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149] test_index:[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]

1 검증 세트 인덱스: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]

2 교차 검증 정확도: 0.9667, 학습 데이터 크기: 120, 검증 데이터 크기: 30
train_index: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  60  61  62  63  64  65
  66  67  68  69  70  71  72 

### Stratified K 폴드

In [4]:
import pandas as pd

iris = load_iris()

# Build the feature frame and attach the target as a 'label' column in one chain.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)

# Displayed as the cell output: number of rows per label value.
iris_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,50
1,50
2,50


In [5]:
# 3-fold split without stratification; print the label counts of each
# training partition.
kfold = KFold(n_splits=3)

n_iter = 0

for n_iter, (train_index, test_index) in enumerate(kfold.split(iris_df), start=1):
  # Rows of this fold, then their 'label' column.
  label_train = iris_df.iloc[train_index]['label']
  label_test = iris_df.iloc[test_index]['label']

  print('### 교차 검증: ', n_iter)
  print('학습 레이블 데이터 분포: \n', label_train.value_counts())

### 교차 검증:  1
학습 레이블 데이터 분포: 
 label
1    50
2    50
Name: count, dtype: int64
### 교차 검증:  2
학습 레이블 데이터 분포: 
 label
0    50
2    50
Name: count, dtype: int64
### 교차 검증:  3
학습 레이블 데이터 분포: 
 label
0    50
1    50
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import StratifiedKFold

# 3-fold stratified split; the label column is passed as `y` so the splitter
# can use it. Print the label counts of each training partition.
skf = StratifiedKFold(n_splits=3)
n_iter = 0

for n_iter, (train_index, test_index) in enumerate(skf.split(iris_df, iris_df['label']), start=1):
  # Rows of this fold, then their 'label' column.
  label_train = iris_df.iloc[train_index]['label']
  label_test = iris_df.iloc[test_index]['label']

  print('### 교차 검증: ', n_iter)
  print('학습 레이블 데이터 분포: \n', label_train.value_counts())

### 교차 검증:  1
학습 레이블 데이터 분포: 
 label
2    34
0    33
1    33
Name: count, dtype: int64
### 교차 검증:  2
학습 레이블 데이터 분포: 
 label
1    34
0    33
2    33
Name: count, dtype: int64
### 교차 검증:  3
학습 레이블 데이터 분포: 
 label
0    34
1    33
2    33
Name: count, dtype: int64


### cross_val_score()

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import numpy as np

iris_data = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

data, label = iris_data.data, iris_data.target

# Evaluate with k = 3 folds, scored by accuracy.
scores = cross_val_score(
    estimator=dt_clf,
    X=data,
    y=label,
    scoring='accuracy',
    cv=3,
)

# Per-fold accuracies, then their mean, both rounded to 4 decimals.
print('교차 검증별 정확도: ', np.round(scores, 4))
print('평균 검증 정확도: ', np.round(scores.mean(), 4))

교차 검증별 정확도:  [0.98 0.94 0.98]
평균 검증 정확도:  0.9667


### GridSearchCV

In [8]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset and split from this same object, so the cell does not
# depend on a variable (`iris_data`) that only exists if another cell ran first.
iris = load_iris()

# Hold out 20% of the rows as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=121
)

# NOTE(review): no random_state is set here, unlike the other classifiers in
# this notebook (random_state=156) — confirm whether that is intended.
d_tree = DecisionTreeClassifier()

# Hyperparameter grid to search: 3 depths x 2 split thresholds = 6 combinations.
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

In [12]:
import pandas as pd

# Search every combination in `parameters` with 3-fold CV, also recording
# training scores, and refit on the full training set afterwards.
grid_dtree = GridSearchCV(
    estimator=d_tree,
    param_grid=parameters,
    cv=3,
    refit=True,
    return_train_score=True,
)
grid_dtree.fit(X_train, y_train)

# One row per parameter combination; shown as the cell output.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.00125,8.1e-05,0.000933,2.5e-05,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
1,0.001231,7.3e-05,0.000918,1.2e-05,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
2,0.001209,2.8e-05,0.000926,1.2e-05,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
3,0.001198,2.8e-05,0.000872,9e-06,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
4,0.001191,4.6e-05,0.000873,1.5e-05,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511
5,0.001138,3.9e-05,0.000867,2.7e-05,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511
