In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [22]:
iris = load_iris()

In [23]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [24]:
#uses stratified sampling by default. i.e., StratifiedKFold
lg = LogisticRegression(max_iter=400)
scores = cross_val_score(lg, iris.data, iris.target, cv=3)
for scores in scores:
    print(scores)
print("Average cross-validation:", scores.mean())

0.98
0.96
0.98
Average cross-validation: 0.98


In [25]:
#without stratification
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
scores = cross_val_score(lg, iris.data, iris.target, cv=kf)
for score in scores:
    print(score)
print("Average cross-validation score:", scores.mean())

0.0
0.0
0.0
Average cross-validation score: 0.0


In [26]:
import sklearn
sklearn.__version__

'1.0.2'

### Leave one out cross-validation

In [29]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(lg, iris.data, iris.target, cv=loo)
print(scores)
print("Number of cv iterations: ", len(scores))
print("Mean accuracy:", scores.mean())

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
Number of cv iterations:  150
Mean accuracy: 0.9666666666666667


In [31]:
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=.3, train_size=.5, n_splits=10)
scores = cross_val_score(lg, iris.data, iris.target, cv=shuffle_split)
for score in scores:
    print(score)

0.9777777777777777
0.9555555555555556
0.9555555555555556
0.9555555555555556
0.9111111111111111
0.9333333333333333
0.9111111111111111
0.9777777777777777
0.9777777777777777
1.0


##### scikit-learn by default uses StratifiedkFold for classification and KFold for regression. StratifiedkFold is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class. Although Scikit learn defines a StratifiedKFold class, you dont need to use it because it is used by default for classification problems.

### Cross-validation with Groups

In [32]:
from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs
gkf = GroupKFold(n_splits=3)
X, y = make_blobs(n_samples=12, random_state=0)
groups = [0,0,0,1,1,1,1,2,2,3,3,3]
scores = cross_val_score(lg, X, y, groups = groups, cv=gkf)
print("Cross-validation scores:", scores)

Cross-validation scores: [0.75       0.6        0.66666667]
