# Train Test Frameworks
These are simple exercises for practicing the syntax of various sklearn functions that split data into train and test sets. The goal is to get familiar with these methods before the more complex challenge at the end of the day.

In [1]:
# import numpy
import numpy as np

In [2]:
# Synthetic data drawn from a standard normal distribution (mean 0, std 1):
# X holds 10 samples with 2 features each, y holds one target value per sample.
# No seed is set in this cell, so the values change on every run.
X = np.random.normal(loc=0, scale=1, size=20).reshape(10, 2)
y = np.random.normal(loc=0, scale=1, size=10)

* print X

In [3]:
# Bare expression on the last line of the cell: the notebook displays the array's repr.
X

array([[-0.19220936, -1.2245809 ],
       [-0.48289099,  0.00604015],
       [-0.12391915, -1.50276689],
       [-0.37498114, -0.55701297],
       [-0.22820231,  1.01622982],
       [ 0.70375012, -0.93611587],
       [ 1.36975425, -0.14032562],
       [-1.26225734,  0.05790936],
       [ 2.11085287, -0.71907783],
       [-0.59610475, -0.55031929]])

* print y

In [4]:
# Bare expression on the last line of the cell: the notebook displays the array's repr.
y

array([ 1.22857262,  0.45721824,  2.27974784,  1.22007556,  0.6826513 ,
       -0.95282021,  0.18921292,  0.44718762,  1.00460908, -1.19995924])

## HOLDOUT split

* import train_test_split function from sklearn

In [5]:
from sklearn.model_selection import train_test_split

* split the data into a train set (70%) and a test set (30%)

In [9]:
# Hold out 30% of the rows for testing and keep the remaining 70% for training.
# train_test_split shuffles by default and no random_state is given, so the
# rows that end up in each set vary from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
)

# Print every piece of the split next to its label.
for label, part in (
    ("X_train: ", X_train),
    ("y_train: ", y_train),
    ("X_test: ", X_test),
    ("y_test: ", y_test),
):
    print(label, part)

X_train:  [[ 0.70375012 -0.93611587]
 [ 2.11085287 -0.71907783]
 [-0.59610475 -0.55031929]
 [ 1.36975425 -0.14032562]
 [-1.26225734  0.05790936]
 [-0.19220936 -1.2245809 ]
 [-0.12391915 -1.50276689]]
y_train:  [-0.95282021  1.00460908 -1.19995924  0.18921292  0.44718762  1.22857262
  2.27974784]
X_test:  [[-0.22820231  1.01622982]
 [-0.48289099  0.00604015]
 [-0.37498114 -0.55701297]]
y_test:  [0.6826513  0.45721824 1.22007556]


* print X_train

In [10]:
# Display the training features produced by the most recent split.
X_train

array([[ 0.70375012, -0.93611587],
       [ 2.11085287, -0.71907783],
       [-0.59610475, -0.55031929],
       [ 1.36975425, -0.14032562],
       [-1.26225734,  0.05790936],
       [-0.19220936, -1.2245809 ],
       [-0.12391915, -1.50276689]])

* split the data again but now with shuffle=False

In [12]:
# Same 70/30 split, but with shuffling turned off so rows keep their original order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    shuffle=False,
)

* print X_train

In [13]:
# With shuffle=False the split is positional: X_train is simply the first 70%
# of the rows of X (7 of the 10), in their original order.
X_train

array([[-0.19220936, -1.2245809 ],
       [-0.48289099,  0.00604015],
       [-0.12391915, -1.50276689],
       [-0.37498114, -0.55701297],
       [-0.22820231,  1.01622982],
       [ 0.70375012, -0.93611587],
       [ 1.36975425, -0.14032562]])

* print shape of X_train and X_test

In [14]:
# Report the (rows, columns) shape of the train part, then the test part.
for part in (X_train, X_test):
    print(part.shape)

(7, 2)
(3, 2)


## K-FOLD split 

* import KFold from sklearn

In [15]:
from sklearn.model_selection import KFold

* instantiate KFold with k=5

In [16]:
# K-fold splitter with 5 folds; every other option is left at its default.
kf = KFold(n_splits=5)

* iterate over train_index and test_index in kf.split(X) and print them

In [17]:
# Walk through the folds: kf.split(X) yields one (train indices, test indices)
# pair per fold. Use them to slice X and y, then show the held-out rows.
for train_index, test_index in kf.split(X):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print("X_test: ", X_test)

X_test:  [[-0.19220936 -1.2245809 ]
 [-0.48289099  0.00604015]]
X_test:  [[-0.12391915 -1.50276689]
 [-0.37498114 -0.55701297]]
X_test:  [[-0.22820231  1.01622982]
 [ 0.70375012 -0.93611587]]
X_test:  [[ 1.36975425 -0.14032562]
 [-1.26225734  0.05790936]]
X_test:  [[ 2.11085287 -0.71907783]
 [-0.59610475 -0.55031929]]


* instantiate KFold with k=5 and shuffle=True

In [18]:
# 5-fold splitter again, this time shuffling the samples before they are
# assigned to folds. No random_state is given, so the folds differ between runs.
kf = KFold(n_splits=5,shuffle=True)

* iterate over train_index and test_index in kf.split(X) and print them

In [19]:
# Walk through the folds of the current splitter: each iteration yields a
# (train indices, test indices) pair used to slice X and y; show the held-out rows.
for train_index, test_index in kf.split(X):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print("X_test: ", X_test)

X_test:  [[ 0.70375012 -0.93611587]
 [ 1.36975425 -0.14032562]]
X_test:  [[-0.48289099  0.00604015]
 [-0.37498114 -0.55701297]]
X_test:  [[-0.22820231  1.01622982]
 [-1.26225734  0.05790936]]
X_test:  [[-0.12391915 -1.50276689]
 [-0.59610475 -0.55031929]]
X_test:  [[-0.19220936 -1.2245809 ]
 [ 2.11085287 -0.71907783]]


## LEAVE-ONE-OUT split
This is similar to the Leave-p-out technique from the previous readings, with p=1: each observation is used as the test set exactly once, on its own.
- It's a popular method for very small datasets.
- It takes a lot of time on bigger datasets and can lead to overfitting of the final model.

* import LeaveOneOut from sklearn

In [20]:
from sklearn.model_selection import LeaveOneOut

* instantiate LeaveOneOut

In [21]:
# Leave-one-out splitter: each split holds out a single sample as the test set.
loo = LeaveOneOut()

* iterate over train_index and test_index in loo.split(X) and print them

In [24]:
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html?highlight=leaveoneout#sklearn.model_selection.LeaveOneOut
# One iteration per sample: print the index arrays first, then the sliced data.
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [1 2 3 4 5 6 7 8 9] TEST: [0]
[[-0.48289099  0.00604015]
 [-0.12391915 -1.50276689]
 [-0.37498114 -0.55701297]
 [-0.22820231  1.01622982]
 [ 0.70375012 -0.93611587]
 [ 1.36975425 -0.14032562]
 [-1.26225734  0.05790936]
 [ 2.11085287 -0.71907783]
 [-0.59610475 -0.55031929]] [[-0.19220936 -1.2245809 ]] [ 0.45721824  2.27974784  1.22007556  0.6826513  -0.95282021  0.18921292
  0.44718762  1.00460908 -1.19995924] [1.22857262]
TRAIN: [0 2 3 4 5 6 7 8 9] TEST: [1]
[[-0.19220936 -1.2245809 ]
 [-0.12391915 -1.50276689]
 [-0.37498114 -0.55701297]
 [-0.22820231  1.01622982]
 [ 0.70375012 -0.93611587]
 [ 1.36975425 -0.14032562]
 [-1.26225734  0.05790936]
 [ 2.11085287 -0.71907783]
 [-0.59610475 -0.55031929]] [[-0.48289099  0.00604015]] [ 1.22857262  2.27974784  1.22007556  0.6826513  -0.95282021  0.18921292
  0.44718762  1.00460908 -1.19995924] [0.45721824]
TRAIN: [0 1 3 4 5 6 7 8 9] TEST: [2]
[[-0.19220936 -1.2245809 ]
 [-0.48289099  0.00604015]
 [-0.37498114 -0.55701297]
 [-0.22820231  1

* print the number of splits