In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

In [2]:
# Load the iris dataset directly as a (features, labels) pair of arrays.
X, y = datasets.load_iris(return_X_y=True)
# Show both shapes as the cell output.
X.shape, y.shape

((150, 4), (150,))

**Hold-out cross-validation without hyperparameter tuning**

Hold-out is when you split up your dataset into a ‘train’ and ‘test’ set. The training set is what the model is trained on, and the test set is used to see how well that model performs on unseen data.

In [3]:
# Hold-out split: 40% of the samples are set aside for testing; the fixed
# seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [4]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((90, 4), (90,))

In [5]:
# Shapes of the held-out test features and test labels.
X_test.shape, y_test.shape

((60, 4), (60,))

In [6]:
# Linear-kernel SVM trained on the training split only.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
clf.score(X_test, y_test)

0.9666666666666667

In [7]:
# Display the true class labels of the held-out test samples.
y_test

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2])

**5-fold cross-validation without hyperparameter tuning**

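Cross-validation or ‘k-fold cross-validation’ is when the dataset is randomly split up into ‘k’ groups. One of the groups is used as the test set and the rest are used as the training set. The model is trained on the training set and scored on the test set. Then the process is repeated until each unique group has been used as the test set. (Note: when `cross_val_score` is given an integer `cv` and a classifier, as below, scikit-learn builds the folds with a stratified k-fold splitter that does not shuffle the data by default.)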

In [8]:
from sklearn.model_selection import cross_val_score

# `random_state` is intentionally not passed: SVC only uses it to shuffle the
# data for probability estimates and ignores it when `probability=False`
# (the default), so it had no effect on these scores.
clf = svm.SVC(kernel='linear', C=1)
# The unfitted estimator is fitted and scored once per fold (5 folds);
# `scores` holds one accuracy value per fold.
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
# Summarise the per-fold scores by their mean and spread.
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.96666667 1.         0.96666667 0.96666667 1.        ]
0.98 accuracy with a standard deviation of 0.02


If I want to change the scoring metric:

In [9]:
from sklearn import metrics
# F1 score: harmonic mean of precision and recall.
# "Macro" averaging: the unweighted arithmetic mean of the per-class F1 scores.
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
scores

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

**If I want to pre-process my data...**

`StandardScaler()` standardizes features by removing the mean and scaling to unit variance.

The standard score of a sample $x$ is calculated as:

$z = (x - u) / s$

where $u$ is the _mean_ of the training samples, and $s$ is the _standard deviation_ of the training samples.

In [10]:
from sklearn import preprocessing
# Hold-out split: 40% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# The scaling statistics are learned from the training split only and then
# applied unchanged to both splits, so the test data never influences them.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Train on the standardized training data, score on the standardized test data.
clf = svm.SVC(C=1)
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

0.9333333333333333

In [11]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# The number of principal components to keep is a choice made here by hand.
# The projection is fitted on the training split only and then applied to
# both splits.
pca = PCA(n_components=3).fit(X_train)
X_train_red, X_test_red = pca.transform(X_train), pca.transform(X_test)

# Both splits now have 3 columns instead of the original feature count.
print(X_train_red.shape)
print(X_test_red.shape)


(90, 3)
(60, 3)


In [12]:
# Train an SVC on the PCA-reduced training data and report test accuracy.
clf = svm.SVC(C=1)
clf.fit(X_train_red, y_train)
print(clf.score(X_test_red, y_test))

0.9666666666666667


**Using pipelines to make things more compact:**

A pipeline sequentially applies a list of transforms followed by a final estimator.

In [13]:
from sklearn.pipeline import make_pipeline

# Scaling and classification are bundled into one estimator, so the whole
# chain (scaler + SVC) is what gets cross-validated.
clf = make_pipeline(
    preprocessing.StandardScaler(),
    svm.SVC(C=1),
)
cross_val_score(clf, X, y, cv=5)

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

k-fold CV

In [14]:
import numpy as np
from sklearn.model_selection import KFold

# Toy data: four samples with two features each.
# NOTE: this rebinds `X`, replacing the iris feature matrix loaded earlier.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
kf = KFold(n_splits=2)

# Each split is a pair of index arrays: (training indices, test indices).
for train_index, test_index in kf.split(X):
    print("%s %s" % (train_index, test_index))

[2 3] [0 1]
[0 1] [2 3]


repeated k-fold CV

In [15]:
import numpy as np
from sklearn.model_selection import RepeatedKFold

# Fixed seed so the printed splits are reproducible.
random_state = 12883823
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
# 2 folds repeated 2 times -> 4 train/test splits in total.
rkf = RepeatedKFold(random_state=random_state, n_repeats=2, n_splits=2)

for train_index, test_index in rkf.split(X):
    print("%s %s" % (train_index, test_index))

[2 3] [0 1]
[0 1] [2 3]
[0 2] [1 3]
[1 3] [0 2]


Leave One Out (LOO) CV

In [16]:
from sklearn.model_selection import LeaveOneOut

# Four samples -> four splits, each holding out exactly one sample for testing.
X = [1, 2, 3, 4]
loo = LeaveOneOut()

for train_index, test_index in loo.split(X):
    print("%s %s" % (train_index, test_index))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


Leave P Out (LPO)

In [17]:
from sklearn.model_selection import LeavePOut

X = np.ones(5)
# p=2: every split leaves 2 of the 5 samples out as the test set.
lpo = LeavePOut(p=2)

for train_index, test_index in lpo.split(X):
    print("%s %s" % (train_index, test_index))

[2 3 4] [0 1]
[1 3 4] [0 2]
[1 2 4] [0 3]
[1 2 3] [0 4]
[0 3 4] [1 2]
[0 2 4] [1 3]
[0 2 3] [1 4]
[0 1 4] [2 3]
[0 1 3] [2 4]
[0 1 2] [3 4]


Shuffle and split

In [18]:
from sklearn.model_selection import ShuffleSplit

X = np.arange(10)
# n_splits is the number of re-shuffling & splitting iterations; every
# iteration draws a fresh train/test partition of the same 10 samples.
ss = ShuffleSplit(
    n_splits=10,
    test_size=0.25,
    random_state=0,
)

for train_index, test_index in ss.split(X):
    print("%s %s" % (train_index, test_index))

[9 1 6 7 3 0 5] [2 8 4]
[2 9 8 0 6 7 4] [3 5 1]
[4 5 1 0 6 9 7] [2 3 8]
[2 7 5 8 0 3 4] [6 1 9]
[4 1 0 6 8 9 3] [5 2 7]
[5 2 6 3 7 4 0] [1 8 9]
[7 8 6 5 4 9 0] [3 2 1]
[3 5 6 7 1 8 4] [0 9 2]
[1 4 5 3 6 0 2] [7 8 9]
[0 8 3 7 9 6 4] [1 2 5]


Stratified k-fold

In [19]:
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np

# 50 samples with one constant feature each.
# np.ones: return a new array of the given shape, filled with ones.
X = np.ones((50, 1))
# Imbalanced labels: 45 samples of class 0 followed by 5 samples of class 1.
# np.repeat: repeat each element of the first argument the given number of times.
y = np.repeat([0, 1], [45, 5])

Here is an example of stratified 3-fold cross-validation on a dataset with 50 samples from two unbalanced classes.

We show the number of samples in each class and compare with KFold.

`StratifiedKFold` stratifies the data such that each fold has approximately the same class distribution as the entire dataset. It is essential for classification problems with imbalanced classes because it helps ensure that each train/test split is representative of the full dataset.

In [20]:
def print_class_counts(splitter, X, y):
    """Print the per-class sample counts of every train/test split.

    Parameters
    ----------
    splitter : object exposing ``split(X, y)`` that yields
        ``(train_indices, test_indices)`` pairs.
    X : samples handed to ``splitter.split``.
    y : array of non-negative integer class labels; it is indexed with the
        yielded index arrays and counted with ``np.bincount``.
    """
    for train, test in splitter.split(X, y):
        print('train -  {}   |   test -  {}'.format(
            np.bincount(y[train]), np.bincount(y[test])))


# Stratified folds: each split keeps both classes in train and test.
skf = StratifiedKFold(n_splits=3)
print_class_counts(skf, X, y)

print("---")

# Plain KFold for comparison: with these sorted labels some splits end up
# with a train or test part that contains only one class.
kf = KFold(n_splits=3)
print_class_counts(kf, X, y)

train -  [30  3]   |   test -  [15  2]
train -  [30  3]   |   test -  [15  2]
train -  [30  4]   |   test -  [15  1]
---
train -  [28  5]   |   test -  [17]
train -  [28  5]   |   test -  [17]
train -  [34]   |   test -  [11  5]
