### sklearn.model_selection.KFold

* class sklearn.model_selection.KFold(n_splits=5, *, shuffle=False, random_state=None)[source]¶


Parameters:

n_splitsint, default=5

Number of folds. Must be at least 2.

Changed in version 0.22: n_splits default value changed from 3 to 5.

shufflebool, default=False

Whether to shuffle the data before splitting into batches. Note that the samples within each split will not be shuffled.

random_stateint, RandomState instance or None, default=None

When shuffle is True, random_state affects the ordering of the indices, which controls the randomness of each fold. Otherwise, this parameter has no effect. Pass an int for reproducible output across multiple function calls. See Glossary.

In [16]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np

iris = load_iris()
features = iris.data
label = iris.target
df_clf = DecisionTreeClassifier(random_state = 156)
iris_data = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size = 0.2, random_state=100)

Kfold = KFold(n_splits=5)
cv_accuracy = []
print('붓꽃 데이터 세트 크기:', features.shape[0])

붓꽃 데이터 세트 크기: 150


In [None]:
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
iris

In [7]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

### pandas.DataFrame
* class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)

In [8]:
import pandas as pd

iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [11]:
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [9]:
iris_df['label'] = iris.target

In [10]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   label              150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


In [22]:
dt_clf = DecisionTreeClassifier(random_state=0)
kf = KFold(n_splits=5)

### sklearn.model_selection.train_test_split
sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)[source]

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

In [24]:
kf.split(X_train)

<generator object _BaseKFold.split at 0x000001660AE6E560>

### Q : object를 어떻게 풀지? -> For 문

In [25]:
for i in kf.split(X_train):
    print(i)

(array([ 24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,
        37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,
        50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
        63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
        76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
        89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
       102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
       115, 116, 117, 118, 119]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23]))
(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  48,  49,
        50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
        63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
        76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86

### -> 편향 되어있음 -> stratified 하면 편향 없어짐

In [26]:
for i, j in kf.split(X_train):
    print(i)

[ 24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41
  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  72  73  74  75  76  77

In [27]:
i

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95])

In [28]:
cv_accuracy = []

In [29]:
cv_accuracy


[]

In [30]:
n_iter = 0

In [37]:
cv_accuracy = []
n_iter = 0
# KFold객체의 split() 호출하면 폴드 별 학습용, 검증용 테스트으 로우 인덱스를 array로 반환
for train_index, valid_index in kf.split(X_train):
    # kfold,split()으로 변환된 인덱스를 이용하여 학습용, 검증용 테스트 데이터 추출
    X_train1, X_valid = X_train[train_index], X_train[valid_index]
    y_train1, y_valid = y_train[train_index], y_train[valid_index]
    # 학습 및 예측
    dt_clf.fit(X_train1, y_train1)
    pred = dt_clf.predict(X_valid)
    n_iter += 1
    # 반복 시마다 정확도 측정
    # accuracy = np.round(np.mean(pred == y_valid), 4)와 같다.
    accuracy = np.round(accuracy_score(y_valid,pred), 4)
    print(accuracy)
    cv_accuracy.append(accuracy)
    train_size = X_train1.shape[0]
    test_size = X_valid.shape[0]
    print(f'{n_iter} 교차검증 정확도 : {accuracy}, 학습데이터의 크기 : {train_size} , 검증데이터의 크기 : {test_size}')
    print(f'{n_iter} 검증 세트 인덱스{valid_index}')    

    

0.9583
1 교차검증 정확도 : 0.9583, 학습데이터의 크기 : 96 , 검증데이터의 크기 : 24
1 검증 세트 인덱스[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
1.0
2 교차검증 정확도 : 1.0, 학습데이터의 크기 : 96 , 검증데이터의 크기 : 24
2 검증 세트 인덱스[24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47]
0.8333
3 교차검증 정확도 : 0.8333, 학습데이터의 크기 : 96 , 검증데이터의 크기 : 24
3 검증 세트 인덱스[48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
0.9583
4 교차검증 정확도 : 0.9583, 학습데이터의 크기 : 96 , 검증데이터의 크기 : 24
4 검증 세트 인덱스[72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95]
0.9167
5 교차검증 정확도 : 0.9167, 학습데이터의 크기 : 96 , 검증데이터의 크기 : 24
5 검증 세트 인덱스[ 96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]


In [38]:
import numpy as np
np.mean(cv_accuracy)

0.9333199999999999

### sklearn.model_selection.StratifiedKFold
* class sklearn.model_selection.StratifiedKFold(n_splits=5, *, shuffle=False, random_state=None)[source]

Parameters:

**n_splits**int, default=5

Number of folds. Must be at least 2.

Changed in version 0.22: `n_splits`  default value changed from 3 to 5.

**shuffle**bool, default=False

Whether to shuffle each class’s samples before splitting into batches. Note that the samples within each split will not be shuffled.

**random_state**int, RandomState instance or None, default=None

When  `shuffle`  is True,  `random_state`  affects the ordering of the indices, which controls the randomness of each fold for each class. Otherwise, leave  `random_state`  as  `None`. Pass an int for reproducible output across multiple function calls. See  [Glossary](https://scikit-learn.org/stable/glossary.html#term-random_state).

In [47]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)
n_iter = 0

for train_index, test_index in skf.split(iris.data, iris_df['label']):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print('## 교차 검증: {0}'.format(n_iter))
    print('학습 레이블 데이터 분포:\n', label_train.value_counts())
    print('검증 레이블 데이터 분포:\n', label_test.value_counts())
    print(train_index)

## 교차 검증: 1
학습 레이블 데이터 분포:
 0    40
1    40
2    40
Name: label, dtype: int64
검증 레이블 데이터 분포:
 0    10
1    10
2    10
Name: label, dtype: int64
[ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
## 교차 검증: 2
학습 레이블 데이터 분포:
 0    40
1    40
2    40
Name: label, dtype: int64
검증 레이블 데이터 분포:
 0    10
1    10
2    10
Name: label, dtype: int64
[  0   1   2   3   4   5   6   7   8   9  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  50  51  52  53  54  55  56  57  58  59  70  71  72  73
  74 

In [48]:
cv_accuracy = []
n_iter = 0
skf = StratifiedKFold(n_splits = 5)
# KFold객체의 split() 호출하면 폴드 별 학습용, 검증용 테스트으 로우 인덱스를 array로 반환
for train_index, valid_index in skf.split(iris.data, iris.target):
    # kfold,split()으로 변환된 인덱스를 이용하여 학습용, 검증용 테스트 데이터 추출
    X_train1, X_valid = iris.data[train_index], iris.data[valid_index]
    y_train1, y_valid = iris.target[train_index], iris.target[valid_index]
    # 학습 및 예측
    dt_clf.fit(X_train1, y_train1)
    pred = dt_clf.predict(X_valid)
    n_iter += 1
    
    # 반복 시마다 정확도 측정
    # accuracy = np.round(np.mean(pred == y_valid), 4)와 같다.
    accuracy = np.round(accuracy_score(y_valid,pred), 4)
    print(accuracy)
    cv_accuracy.append(accuracy)
    train_size = X_train1.shape[0]
    test_size = X_valid.shape[0]
    print('\n#{0} 교차검증 정확도 : {1}, 학습데이터의 크기 : {2} , 검증데이터의 크기 : {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print( '#{0} 검증 세트 인덱스{1}'.format(n_iter, test_index)) 
    
# 교차 검증별 정확도 및 평균 정확도 계산
print('\n## 교차검증 정확도:', np.round(cv_accuracy, 4))
print('## 평균 검증 정확도:', np.mean(cv_accuracy))

0.9667

#1 교차검증 정확도 : 0.9667, 학습데이터의 크기 : 120 , 검증데이터의 크기 : 30
#1 검증 세트 인덱스[ 40  41  42  43  44  45  46  47  48  49  90  91  92  93  94  95  96  97
  98  99 140 141 142 143 144 145 146 147 148 149]
0.9667

#2 교차검증 정확도 : 0.9667, 학습데이터의 크기 : 120 , 검증데이터의 크기 : 30
#2 검증 세트 인덱스[ 40  41  42  43  44  45  46  47  48  49  90  91  92  93  94  95  96  97
  98  99 140 141 142 143 144 145 146 147 148 149]
0.9

#3 교차검증 정확도 : 0.9, 학습데이터의 크기 : 120 , 검증데이터의 크기 : 30
#3 검증 세트 인덱스[ 40  41  42  43  44  45  46  47  48  49  90  91  92  93  94  95  96  97
  98  99 140 141 142 143 144 145 146 147 148 149]
0.9667

#4 교차검증 정확도 : 0.9667, 학습데이터의 크기 : 120 , 검증데이터의 크기 : 30
#4 검증 세트 인덱스[ 40  41  42  43  44  45  46  47  48  49  90  91  92  93  94  95  96  97
  98  99 140 141 142 143 144 145 146 147 148 149]
1.0

#5 교차검증 정확도 : 1.0, 학습데이터의 크기 : 120 , 검증데이터의 크기 : 30
#5 검증 세트 인덱스[ 40  41  42  43  44  45  46  47  48  49  90  91  92  93  94  95  96  97
  98  99 140 141 142 143 144 145 146 147 148 149]

## 교차검증 정확도: [0.9667 

In [50]:
import numpy as np
np.mean(cv_accuracy)

0.9600200000000001