In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import f1_score

In [92]:
iris = datasets.load_iris()
X, y = iris.data, iris.target 

In [93]:
rf = RandomForestClassifier()

# Simple cross_val_score

In [94]:
from sklearn.model_selection import cross_val_score

In [95]:
scores = cross_val_score(rf, X, y, cv=3, scoring='f1_micro')
print(scores)
print(np.average(scores))

[0.98039216 0.90196078 0.97916667]
0.9538398692810457




# Iterators

## KFold

In [96]:
from sklearn.model_selection import KFold

In [99]:
NSPLITS = 5
kf = KFold(n_splits=NSPLITS)
tr_score, test_score = 0, 0
for train_idx, test_idx in kf.split(X):
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/NSPLITS} \nTrain Score: {tr_score/NSPLITS}")    

Test Score: 0.9066666666666666 
Train Score: 0.9949999999999999


## Repeated KFold

In [71]:
from sklearn.model_selection import RepeatedKFold

In [100]:
NSPLITS = 3
NREPEATS = 2
kf = RepeatedKFold(n_splits = NSPLITS, n_repeats = NREPEATS)
tr_score, test_score = 0, 0
i = 0
for train_idx, test_idx in kf.split(X):
    i += 1
    print(f"Fold: {i}")
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/(NSPLITS * NREPEATS)} \nTrain Score: {tr_score/(NSPLITS * NREPEATS)}")    

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Test Score: 0.94 
Train Score: 0.995


## Leave one out

In [101]:
from sklearn.model_selection import LeaveOneOut

In [105]:
loo = LeaveOneOut()
tr_score, test_score = 0, 0
i = 0
for train_idx, test_idx in loo.split(X):
    i += 1
    print(f"Fold: {i}")
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/i} \nTrain Score: {tr_score/i}")    

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Fold: 11
Fold: 12
Fold: 13
Fold: 14
Fold: 15
Fold: 16
Fold: 17
Fold: 18
Fold: 19
Fold: 20
Fold: 21
Fold: 22
Fold: 23
Fold: 24
Fold: 25
Fold: 26
Fold: 27
Fold: 28
Fold: 29
Fold: 30
Fold: 31
Fold: 32
Fold: 33
Fold: 34
Fold: 35
Fold: 36
Fold: 37
Fold: 38
Fold: 39
Fold: 40
Fold: 41
Fold: 42
Fold: 43
Fold: 44
Fold: 45
Fold: 46
Fold: 47
Fold: 48
Fold: 49
Fold: 50
Fold: 51
Fold: 52
Fold: 53
Fold: 54
Fold: 55
Fold: 56
Fold: 57
Fold: 58
Fold: 59
Fold: 60
Fold: 61
Fold: 62
Fold: 63
Fold: 64
Fold: 65
Fold: 66
Fold: 67
Fold: 68
Fold: 69
Fold: 70
Fold: 71
Fold: 72
Fold: 73
Fold: 74
Fold: 75
Fold: 76
Fold: 77
Fold: 78
Fold: 79
Fold: 80
Fold: 81
Fold: 82
Fold: 83
Fold: 84
Fold: 85
Fold: 86
Fold: 87
Fold: 88
Fold: 89
Fold: 90
Fold: 91
Fold: 92
Fold: 93
Fold: 94
Fold: 95
Fold: 96
Fold: 97
Fold: 98
Fold: 99
Fold: 100
Fold: 101
Fold: 102
Fold: 103
Fold: 104
Fold: 105
Fold: 106
Fold: 107
Fold: 108
Fold: 109
Fold: 110
Fold: 11

In [107]:
len(X)

150

As expected, above runs 150 times


## Leave P Out (LPO)

In [108]:
from sklearn.model_selection import LeavePOut

In [110]:
lpo = LeavePOut(p=1)
tr_score, test_score = 0, 0
i = 0
for train_idx, test_idx in lpo.split(X):
    i += 1
    print(f"Fold: {i}")
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/i} \nTrain Score: {tr_score/i}")    

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Fold: 11
Fold: 12
Fold: 13
Fold: 14
Fold: 15
Fold: 16
Fold: 17
Fold: 18
Fold: 19
Fold: 20
Fold: 21
Fold: 22
Fold: 23
Fold: 24
Fold: 25
Fold: 26
Fold: 27
Fold: 28
Fold: 29
Fold: 30
Fold: 31
Fold: 32
Fold: 33
Fold: 34
Fold: 35
Fold: 36
Fold: 37
Fold: 38
Fold: 39
Fold: 40
Fold: 41
Fold: 42
Fold: 43
Fold: 44
Fold: 45
Fold: 46
Fold: 47
Fold: 48
Fold: 49
Fold: 50
Fold: 51
Fold: 52
Fold: 53
Fold: 54
Fold: 55
Fold: 56
Fold: 57
Fold: 58
Fold: 59
Fold: 60
Fold: 61
Fold: 62
Fold: 63
Fold: 64
Fold: 65
Fold: 66
Fold: 67
Fold: 68
Fold: 69
Fold: 70
Fold: 71
Fold: 72
Fold: 73
Fold: 74
Fold: 75
Fold: 76
Fold: 77
Fold: 78
Fold: 79
Fold: 80
Fold: 81
Fold: 82
Fold: 83
Fold: 84
Fold: 85
Fold: 86
Fold: 87
Fold: 88
Fold: 89
Fold: 90
Fold: 91
Fold: 92
Fold: 93
Fold: 94
Fold: 95
Fold: 96
Fold: 97
Fold: 98
Fold: 99
Fold: 100
Fold: 101
Fold: 102
Fold: 103
Fold: 104
Fold: 105
Fold: 106
Fold: 107
Fold: 108
Fold: 109
Fold: 110
Fold: 11

Produces nCp folds where n is no. of observations and p is the parameter passed

## Shuffle Split

In [111]:
from sklearn.model_selection import ShuffleSplit

In [112]:
ss = ShuffleSplit(n_splits = 5, test_size = 0.3)
tr_score, test_score = 0, 0
i = 0
for train_idx, test_idx in ss.split(X):
    i += 1
    print(f"Fold: {i}")
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/i} \nTrain Score: {tr_score/i}")    

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Test Score: 0.9377777777777778 
Train Score: 0.9942857142857143


## Stratified Kfold
Preserves the class balance in splits. Use this for classification problems.

In [113]:
from sklearn.model_selection import StratifiedKFold

In [116]:
sf = StratifiedKFold(n_splits = 3)
tr_score, test_score = 0, 0
i = 0
for train_idx, test_idx in sf.split(X, y):
    i += 1
    print(f"Fold: {i}")
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/i} \nTrain Score: {tr_score/i}")    

Fold: 1
Fold: 2
Fold: 3
Test Score: 0.960375816993464 
Train Score: 0.9966329966329966


Because the splits are made while balancing the class, this gives pretty good score. Better than any of the ones above.

## Stratified Shuffle Split

In [117]:
from sklearn.model_selection import StratifiedShuffleSplit

In [118]:
ssf = StratifiedKFold(n_splits = 3)
tr_score, test_score = 0, 0
i = 0
for train_idx, test_idx in ssf.split(X, y):
    i += 1
    print(f"Fold: {i}")
    Xtr, ytr = X[train_idx], y[train_idx]
    Xtest, ytest = X[test_idx], y[test_idx]
    rf.fit(Xtr, ytr)
    tr_score += f1_score(rf.predict(Xtr), ytr, average='micro')
    test_score += f1_score(rf.predict(Xtest), ytest, average='micro')
print(f"Test Score: {test_score/i} \nTrain Score: {tr_score/i}")    

Fold: 1
Fold: 2
Fold: 3
Test Score: 0.9669117647058822 
Train Score: 0.9933650227767875
