# [教學目標]
學習使用 sklearn 中的 train_test_split 等套件，進行資料的切分 <br />
<br />
## [範例重點]
了解函數中各個參數的意義 <br />
<br />
引入我們需要的套件 <br />
<br />
# 用 numpy 生成範例資料
<br />
# 使用 train_test_split 函數進行切分
請參考 train_test_split 函數的說明，了解函數裡的參數意義 <br />
<br />
test_size 一定只能小於 1 嗎？ <br />
random_state 不設置會怎麼樣呢？<br />
<br />
<br />
# 使用 K-fold Cross-validation 來切分資料<br />
請參考 KFold 函數的說明，了解各參數的意義。K 可根據資料大小自行決定，K=5 是蠻常用的大小<br />
<br />
如果使用 shuffle=True 會怎麼樣?<br />

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold

# Toy dataset: 10 samples x 5 features holding the integers 0..49 row by row.
x = np.arange(10 * 5).reshape(-1, 5)
# Binary labels: the first five samples are class 1, the last five class 0.
y = np.concatenate((np.ones(5), np.zeros(5)))

# Print each array's shape and contents, each followed by a 20-dash separator.
for _caption, _array in (("Shape of X : ", x), ("Shape of Y : ", y)):
    print(_caption, _array.shape)
    print(_array)
    print("-" * 20)

# Split the data twice, holding out 33% of the samples for testing each time:
#   1. with random_state=42, so the shuffle is seeded and the same partition
#      is produced on every run;
#   2. without random_state, so the shuffle is not seeded and the partition
#      will most likely differ from run to run.
for _seed_kwargs in ({"random_state": 42}, {}):
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.33, **_seed_kwargs
    )
    print(train_x)
    print(train_y)
    print("-" * 20)

# K-fold cross-validation with 5 folds.
# With shuffle=True the samples are shuffled before being assigned to folds,
# so a fold's test indices are no longer one consecutive block; every sample
# still ends up in exactly one test fold. No random_state is passed, so the
# fold membership can change from run to run.
kf = KFold(shuffle=True, n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(x), start=1):
    # Materialise this fold's partitions from the index arrays.
    train_x, train_y = x[train_index], y[train_index]
    test_x, test_y = x[test_index], y[test_index]

    print("FOLD {} : ".format(i))
    for _caption, _value in (
        ("train_index : ", train_index),
        ("test_index : ", test_index),
        ("x_test : ", test_x),
        ("y_test : ", test_y),
    ):
        print(_caption, _value)
    print("-" * 30)

Shape of X :  (10, 5)
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]
 [45 46 47 48 49]]
--------------------
Shape of Y :  (10,)
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
--------------------
[[35 36 37 38 39]
 [10 11 12 13 14]
 [45 46 47 48 49]
 [20 21 22 23 24]
 [15 16 17 18 19]
 [30 31 32 33 34]]
[0. 1. 0. 1. 1. 0.]
--------------------
[[45 46 47 48 49]
 [15 16 17 18 19]
 [25 26 27 28 29]
 [35 36 37 38 39]
 [30 31 32 33 34]
 [ 5  6  7  8  9]]
[0. 1. 0. 0. 0. 1.]
--------------------
FOLD 1 : 
train_index :  [0 1 2 3 4 5 8 9]
test_index :  [6 7]
x_test :  [[30 31 32 33 34]
 [35 36 37 38 39]]
y_test :  [0. 0.]
------------------------------
FOLD 2 : 
train_index :  [0 2 4 5 6 7 8 9]
test_index :  [1 3]
x_test :  [[ 5  6  7  8  9]
 [15 16 17 18 19]]
y_test :  [1. 1.]
------------------------------
FOLD 3 : 
train_index :  [0 1 3 4 5 6 7 8]
test_index :  [2 9]
x_test :  [[10 11 12 1

# [作業重點]
觀察函數說明，要如何切出固定大小的測試集? <br />
<br />
# 作業
假設我們資料中類別的數量並不均衡，在評估準確率時可能會有所偏頗，試著切分出 y_test 中，0 類別與 1 類別的數量是一樣的 (亦即 y_test 的類別是均衡的)<br />

In [2]:
# Imbalanced toy dataset: 200 samples x 5 features (integers 0..999), where
# only the first 40 samples are class 1 and the remaining 160 are class 0.
x = np.arange(200 * 5).reshape(-1, 5)
y = np.concatenate((np.ones(40), np.zeros(160)))
print(y)


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:
# Build a class-balanced test set: split each class on its own and take the
# same fixed number of test samples from both. An integer test_size is an
# absolute number of samples (a float would be a proportion of the data).
n_test_per_class = 10

# Select rows by label instead of by hard-coded position, so the split stays
# correct even if the rows are reordered or the class sizes change.
is_class_1 = y == 1

# NOTE: the "_0" variables hold the class-1 samples and the "_1" variables
# hold the class-0 samples; the names are kept so later cells keep working.
train_x_0, test_x_0, train_y_0, test_y_0 = train_test_split(
    x[is_class_1], y[is_class_1], test_size=n_test_per_class
)
train_x_1, test_x_1, train_y_1, test_y_1 = train_test_split(
    x[~is_class_1], y[~is_class_1], test_size=n_test_per_class
)

# Stack the per-class parts back together (class-1 rows first, then class-0).
train_x = np.concatenate((train_x_0, train_x_1), axis=0)
train_y = np.concatenate((train_y_0, train_y_1), axis=0)
test_x = np.concatenate((test_x_0, test_x_1), axis=0)
test_y = np.concatenate((test_y_0, test_y_1), axis=0)

# Display the test labels: n_test_per_class of each class.
test_y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [9]:
# Display the combined training labels (the two per-class training parts
# concatenated).
train_y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])