## 運用scikit-learn API 實現K-fold分割資料

---

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

### 讀入資料

In [2]:
dataset = pd.read_csv(r'./D19_Social_Network_Ads.csv')
display(dataset)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0
...,...,...,...,...,...
395,15691863,Female,46.0,41000.0,1
396,15706071,Male,51.0,23000.0,1
397,15654296,Female,50.0,20000.0,1
398,15755018,Male,36.0,33000.0,0


### 取出訓練特徵與標註

In [3]:
X = dataset[['User ID', 'Gender', 'Age', 'EstimatedSalary']].values
Y = dataset['Purchased'].values

---

### 將訓練資料按照順序切割成10等分

In [4]:
kf = KFold(10)
kf.get_n_splits(X)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


### 將訓練資料隨機切割成10等分

In [5]:
kf = KFold(10, shuffle=True)
kf.get_n_splits(X)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)


---

### 取出 切割資料對應位置

In [6]:
train_split = kf.split(X)
next(train_split)

(array([  0,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  21,  23,  24,  25,  26,  27,  28,  29,  30,
         32,  33,  34,  35,  36,  38,  39,  40,  41,  42,  43,  44,  45,
         46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,
         59,  60,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,
         74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,
         87,  88,  89,  90,  91,  93,  94,  95,  96,  97,  99, 100, 101,
        102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 160, 161, 162, 163, 164, 165, 166, 167, 169, 170, 171,
        172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 183, 184, 185,
        186, 187, 188, 189, 191, 192, 193, 194, 196

### Or

In [7]:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "\nTEST:", test_index,
          sep = '\n',end = '\n'*4)

TRAIN:
[  0   1   3   4   5   6   7   8  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  25  26  27  28  30  31  32  33  34  35  36  37  38  40
  41  42  43  44  45  46  47  48  50  51  52  53  54  55  56  57  58  59
  61  62  63  64  65  66  67  69  70  71  72  73  74  75  76  77  78  79
  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97
  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
 117 118 119 120 121 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
 154 155 157 158 160 161 162 163 165 166 167 168 171 172 173 174 175 176
 177 178 179 181 182 183 184 185 186 187 188 189 190 191 192 193 195 196
 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 252
 253 254 255 256 257 258 260 261 263 264 266

### 取出切割資料：trainset / testset 特徵(x_train/x_test)/標註(y_train/y_test)

In [8]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    for i in ['train', 'test']:
        print(f'Data of {i}:')
        X_df = pd.DataFrame(globals()['X_'+i], columns=['User ID_X', 'Gender_X', 'Age_X', 'EstimatedSalary_X'])
        Y_df = pd.DataFrame(globals()['Y_'+i], columns=['Purchased_Y'])
        display(pd.merge(X_df, Y_df, how='left', left_index=True, right_index=True))
    print('\n')

Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15668575,Female,26,43000,0
2,15603246,Female,27,57000,0
3,15804002,Male,19,76000,0
4,15728773,Male,27,58000,0
...,...,...,...,...,...
355,15757632,Female,39,59000,0
356,15691863,Female,46,41000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15810944,Male,35,20000,0
1,15600575,Male,25,33000,0
2,15697686,Male,29,80000,0
3,15631159,Male,47,20000,1
4,15633531,Female,47,30000,1
5,15724858,Male,27,90000,0
6,15782806,Female,27,31000,0
7,15671655,Female,35,23000,0
8,15654901,Female,27,58000,0
9,15775562,Female,23,48000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15804002,Male,19,76000,0
4,15728773,Male,27,58000,0
...,...,...,...,...,...
355,15757632,Female,39,59000,0
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15603246,Female,27,57000,0
1,15570769,Female,26,80000,0
2,15798850,Male,25,79000,0
3,15595324,Female,31,68000,0
4,15774179,Female,22,27000,0
5,15711218,Male,35,88000,0
6,15636428,Female,30,116000,0
7,15675949,Female,33,149000,1
8,15672091,Female,19,21000,0
9,15801658,Male,21,72000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15728773,Male,27,58000,0
...,...,...,...,...,...
355,15691863,Female,46,41000,1
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15804002,Male,19,76000,0
1,15727311,Female,35,65000,0
2,15704583,Male,46,28000,1
3,15649487,Male,45,22000,1
4,15715541,Female,28,84000,0
5,15793813,Female,31,89000,0
6,15744919,Female,29,83000,0
7,15726358,Female,24,27000,0
8,15768816,Male,26,81000,0
9,15582492,Male,28,123000,1




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15810944,Male,35,20000,0
1,15668575,Female,26,43000,0
2,15603246,Female,27,57000,0
3,15804002,Male,19,76000,0
4,15598044,Female,27,84000,0
...,...,...,...,...,...
355,15691863,Female,46,41000,1
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15728773,Male,27,58000,0
2,15729054,Female,27,137000,1
3,15776348,Female,27,54000,0
4,15649136,Female,24,55000,0
5,15583681,Male,32,120000,1
6,15709476,Male,20,49000,0
7,15631912,Female,28,85000,0
8,15756820,Female,28,37000,0
9,15619653,Male,38,61000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15603246,Female,27,57000,0
3,15804002,Male,19,76000,0
4,15728773,Male,27,58000,0
...,...,...,...,...,...
355,15691863,Female,46,41000,1
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15668575,Female,26,43000,0
1,15598044,Female,27,84000,0
2,15694829,Female,32,150000,1
3,15606274,Female,26,52000,0
4,15705113,Male,46,23000,1
5,15671766,Female,26,72000,0
6,15639277,Male,23,20000,0
7,15814004,Male,27,20000,0
8,15592877,Male,32,18000,0
9,15667265,Female,28,87000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
355,15691863,Female,46,41000,1
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15704987,Male,32,18000,0
1,15714658,Male,48,41000,1
2,15713144,Male,35,27000,0
3,15690188,Female,33,28000,0
4,15764419,Female,27,17000,0
5,15772798,Male,35,108000,0
6,15792008,Male,30,15000,0
7,15727696,Male,30,135000,1
8,15595228,Female,20,23000,0
9,15782530,Female,33,113000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
355,15691863,Female,46,41000,1
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15628972,Male,18,82000,0
1,15617482,Male,45,26000,1
2,15736760,Female,47,49000,1
3,15599081,Female,45,22000,1
4,15669656,Male,31,18000,0
5,15573452,Female,21,16000,0
6,15776733,Female,28,44000,0
7,15689425,Male,30,49000,0
8,15591915,Female,33,51000,0
9,15642885,Male,22,18000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
355,15757632,Female,39,59000,0
356,15691863,Female,46,41000,1
357,15706071,Male,51,23000,1
358,15654296,Female,50,20000,1


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15792818,Male,49,28000,1
1,15744529,Male,29,43000,0
2,15577178,Female,22,63000,0
3,15595917,Male,30,80000,0
4,15684801,Male,22,81000,0
5,15809823,Male,26,15000,0
6,15697997,Female,38,80000,0
7,15574305,Male,35,53000,0
8,15697020,Female,39,61000,0
9,15800061,Female,28,55000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
355,15635893,Male,60,42000,1
356,15757632,Female,39,59000,0
357,15691863,Female,46,41000,1
358,15706071,Male,51,23000,1


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15746139,Male,20,86000,0
1,15733883,Male,47,25000,1
2,15581198,Male,31,74000,0
3,15764195,Female,18,44000,0
4,15605000,Female,59,83000,0
5,15756932,Male,25,80000,0
6,15655123,Female,26,17000,0
7,15786993,Female,29,83000,0
8,15706185,Female,26,35000,0
9,15660200,Female,37,71000,0




Data of train:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
355,15691863,Female,46,41000,1
356,15706071,Male,51,23000,1
357,15654296,Female,50,20000,1
358,15755018,Male,36,33000,0


Data of test:


Unnamed: 0,User ID_X,Gender_X,Age_X,EstimatedSalary_X,Purchased_Y
0,15621083,Female,48,29000,1
1,15694395,Female,24,32000,0
2,15807481,Male,28,79000,0
3,15673619,Male,25,87000,0
4,15595135,Female,23,66000,0
5,15679760,Male,24,19000,0
6,15654574,Female,23,82000,0
7,15682268,Male,35,50000,0
8,15593014,Male,28,59000,0
9,15789863,Male,27,89000,0




