# Train Test Frameworks
These are simple exercises for practicing syntax of various functions from sklearn that split data into the train and test sets. The goal is to get familiar with these methods prior to the more complex challenge at the end of the day.

In [6]:
# import numpy
import numpy as np

In [7]:
X = np.random.normal(0,1,20).reshape(10,2)
y = np.random.normal(0,1,10)

* print X

In [8]:
X

array([[-2.41301388,  0.74041969],
       [-0.46356324, -0.0060666 ],
       [ 0.97992978, -0.81761852],
       [ 0.18486767, -0.93222325],
       [ 1.02041508, -1.20280445],
       [ 0.13198978,  0.51058714],
       [-2.13940978,  1.2354469 ],
       [ 1.17391075, -0.30157023],
       [-0.81198027, -0.16358266],
       [ 0.63253608,  0.9780095 ]])

* print y

In [9]:
y

array([ 0.45490422, -0.06380328,  0.79564566,  0.23120931, -0.75901017,
       -0.67709686, -0.22553772, -1.68258679,  0.58374865,  0.1575569 ])

## HOLDOUT split

* import train_test_split function from sklearn

In [10]:
from sklearn.model_selection import train_test_split

* split the data to train set (70%) and test set (30%)

In [11]:
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.3, random_state = 0)

* print X_train

In [12]:
X_Train

array([[ 0.63253608,  0.9780095 ],
       [-0.46356324, -0.0060666 ],
       [-2.13940978,  1.2354469 ],
       [ 1.17391075, -0.30157023],
       [ 0.18486767, -0.93222325],
       [-2.41301388,  0.74041969],
       [ 0.13198978,  0.51058714]])

* split the data again but now with  shuttle=False

In [13]:
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.3, random_state = 0, shuffle=False)

* print X_train

In [14]:
X_Train

array([[-2.41301388,  0.74041969],
       [-0.46356324, -0.0060666 ],
       [ 0.97992978, -0.81761852],
       [ 0.18486767, -0.93222325],
       [ 1.02041508, -1.20280445],
       [ 0.13198978,  0.51058714],
       [-2.13940978,  1.2354469 ]])

* print shape of X_train and X_test

In [15]:
X_Train.shape

(7, 2)

## K-FOLD split 

In [16]:
from sklearn.model_selection import KFold

* import KFold from sklearn

In [17]:
kfold = KFold(5)

* instantiate KFold with k=5

In [18]:
for train, test in kfold.split(X):
    print('train: %s, test: %s' % (train, test))

train: [2 3 4 5 6 7 8 9], test: [0 1]
train: [0 1 4 5 6 7 8 9], test: [2 3]
train: [0 1 2 3 6 7 8 9], test: [4 5]
train: [0 1 2 3 4 5 8 9], test: [6 7]
train: [0 1 2 3 4 5 6 7], test: [8 9]


* iterate over train_index and test_index in kf.split(X) and print them

In [19]:
kfold = KFold(5, shuffle=True)

* instantiate KFold with k=5 and shuffle=True

In [20]:
for train, test in kfold.split(X):
    print('train: %s, test: %s' % (train, test))

train: [0 1 2 3 4 5 6 8], test: [7 9]
train: [0 1 2 3 4 7 8 9], test: [5 6]
train: [1 3 4 5 6 7 8 9], test: [0 2]
train: [0 1 2 3 5 6 7 9], test: [4 8]
train: [0 2 4 5 6 7 8 9], test: [1 3]


* iterate over train_index and test_index in kf.split(X) and print them

## LEAVE-ONE-OUT split
this is a similar technique like the Leave-p-out in the previous readings, with p=1. Each observation is used as test set separately.
- it's popular method for very small datasets
- takes a lot of time for bigger data and can lead to overfitting of a final model.

In [21]:
from sklearn.model_selection import LeaveOneOut

* import LeaveOneOut from sklearn

In [22]:
loo = LeaveOneOut()

* instantiate LeaveOneOut

In [24]:
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [1 2 3 4 5 6 7 8 9] TEST: [0]
[[-0.46356324 -0.0060666 ]
 [ 0.97992978 -0.81761852]
 [ 0.18486767 -0.93222325]
 [ 1.02041508 -1.20280445]
 [ 0.13198978  0.51058714]
 [-2.13940978  1.2354469 ]
 [ 1.17391075 -0.30157023]
 [-0.81198027 -0.16358266]
 [ 0.63253608  0.9780095 ]] [[-2.41301388  0.74041969]] [-0.06380328  0.79564566  0.23120931 -0.75901017 -0.67709686 -0.22553772
 -1.68258679  0.58374865  0.1575569 ] [0.45490422]
TRAIN: [0 2 3 4 5 6 7 8 9] TEST: [1]
[[-2.41301388  0.74041969]
 [ 0.97992978 -0.81761852]
 [ 0.18486767 -0.93222325]
 [ 1.02041508 -1.20280445]
 [ 0.13198978  0.51058714]
 [-2.13940978  1.2354469 ]
 [ 1.17391075 -0.30157023]
 [-0.81198027 -0.16358266]
 [ 0.63253608  0.9780095 ]] [[-0.46356324 -0.0060666 ]] [ 0.45490422  0.79564566  0.23120931 -0.75901017 -0.67709686 -0.22553772
 -1.68258679  0.58374865  0.1575569 ] [-0.06380328]
TRAIN: [0 1 3 4 5 6 7 8 9] TEST: [2]
[[-2.41301388  0.74041969]
 [-0.46356324 -0.0060666 ]
 [ 0.18486767 -0.93222325]
 [ 1.02041508 -

* iterate over train_index and test_index in loo.split(X) and print them

In [23]:
loo.get_n_splits(X)

10

* print the number of splits