# Sklearn

## sklearn.model_selection (named sklearn.cross_validation before 0.18)

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the Iris dataset; its .data and .target attributes are what gets split
# into training and test parts further down.
iris = datasets.load_iris()

In [3]:
# Hold out 30% of the samples as a test set. No random_state is passed, so the
# partition (and every output derived from it) changes from run to run.
(train_data, test_data,
 train_labels, test_labels) = model_selection.train_test_split(
    iris.data, iris.target, test_size=0.3
)

In [4]:
# Sanity check: the test split should account for 0.3 of all samples.
test_share = len(test_labels) / (len(train_labels) + len(test_labels))
test_share

0.3

In [5]:
# Report how many objects ended up in the training and test splits.
print(
    'Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(
        len(train_data), len(test_data)
    )
)

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
# Preview the first five feature rows of each split.
for caption, rows in (('Обучающая выборка:\n', train_data),
                      ('\nТестовая выборка:\n', test_data)):
    print(caption, rows[:5])

Обучающая выборка:
 [[ 6.3  2.8  5.1  1.5]
 [ 6.7  3.1  5.6  2.4]
 [ 7.2  3.6  6.1  2.5]
 [ 5.7  2.8  4.5  1.3]
 [ 5.1  3.5  1.4  0.3]]

Тестовая выборка:
 [[ 7.1  3.   5.9  2.1]
 [ 5.4  3.   4.5  1.5]
 [ 5.1  3.4  1.5  0.2]
 [ 7.9  3.8  6.4  2. ]
 [ 5.   3.5  1.6  0.6]]


In [7]:
# Show the class labels that landed in each split, training labels first.
captions = ('Метки классов на обучающей выборке:\n',
            '\nМетки классов на тестовой выборке:\n')
for caption, labels in zip(captions, (train_labels, test_labels)):
    print(caption, labels)

Метки классов на обучающей выборке:
 [2 2 2 1 0 2 1 1 2 2 2 2 1 2 1 0 0 0 2 1 2 1 1 1 0 0 1 2 1 2 0 1 2 1 2 0 2
 0 2 2 1 0 2 0 2 1 0 0 1 1 0 2 2 1 1 1 0 1 0 0 2 0 2 1 0 2 2 1 0 0 0 1 0 0
 2 0 2 1 2 2 0 1 2 1 0 0 0 0 1 0 2 2 0 2 0 1 1 0 1 2 0 0 1 1 2]

Метки классов на тестовой выборке:
 [2 1 0 2 0 1 0 1 1 2 2 2 1 1 1 2 2 0 1 0 1 2 0 2 0 1 1 0 2 0 1 1 1 0 1 2 1
 0 1 2 2 2 0 0 0]


### Стратегии проведения кросс-валидации

#### KFold

In [8]:
# Ten toy samples; split() yields positional indices (0..9), not these values.
A = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

# shuffle is left at its default (False), so the folds are consecutive index
# ranges and the result is already deterministic. random_state is deliberately
# not passed: it has no effect without shuffling, and recent scikit-learn
# releases raise ValueError when it is set while shuffle=False.
kf = model_selection.KFold(n_splits=5)

In [9]:
# Every item produced by split() is a (train indices, test indices) pair.
for fold in kf.split(A):
    print(*fold)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [10]:
# Shuffled 10-fold split of the 10 samples in A: each test fold holds a single
# index, and the fixed random_state keeps the shuffled order reproducible.
kf = model_selection.KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)
for train_idx, test_idx in kf.split(A):
    print(train_idx, test_idx)

[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 5 6 7 8] [9]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 4 5 6 7 8 9] [3]
[1 2 3 4 5 6 7 8 9] [0]
[0 1 2 3 4 6 7 8 9] [5]


#### StratifiedKFold

In [11]:
# Balanced toy target: five samples of class 0 followed by five of class 1.
target = np.repeat([0, 1], 5)
print(target)

# Stratified folds keep both classes equally represented in each part.
# The target array doubles as the feature matrix here; only the indices matter.
skf = model_selection.StratifiedKFold(
    n_splits=2,
    shuffle=True,
    random_state=0,
)
for train_idx, test_idx in skf.split(target, target):
    print(train_idx, test_idx)

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [12]:
# Same class balance as before, but the labels now alternate 0, 1, 0, 1, ...
target = np.tile([0, 1], 5)
print(target)

# Reuses skf, the StratifiedKFold splitter created in the previous cell.
for fold in skf.split(target, target):
    print(*fold)

[0 1 0 1 0 1 0 1 0 1]
[6 7 8 9] [0 1 2 3 4 5]
[0 1 2 3 4 5] [6 7 8 9]


#### ShuffleSplit

In [13]:
# Ten random train/test partitions, each holding out 20% of the samples.
# No random_state is given, so the printed indices differ between runs.
ss = model_selection.ShuffleSplit(
    n_splits=10,
    test_size=0.2,
)

for train_idx, test_idx in ss.split(range(2, 21, 2)):
    print(train_idx, test_idx)

[2 8 1 9 3 4 5 7] [0 6]
[4 6 7 8 1 3 9 2] [5 0]
[5 7 9 8 4 1 3 0] [2 6]
[7 5 0 3 1 8 9 4] [2 6]
[0 7 8 4 3 2 5 1] [6 9]
[6 3 5 7 1 0 8 9] [4 2]
[5 3 9 1 4 7 8 2] [0 6]
[7 0 3 8 5 4 9 6] [2 1]
[0 2 4 8 3 6 5 7] [1 9]
[5 6 7 3 9 2 1 8] [0 4]


#### StratifiedShuffleSplit

In [14]:
# Balanced toy target: five samples of class 0 followed by five of class 1.
target = np.repeat([0, 1], 5)
print(target)

# Four random stratified partitions with a 20% test part each.
# No random_state is given, so the printed indices differ between runs.
sss = model_selection.StratifiedShuffleSplit(
    n_splits=4,
    test_size=0.2,
)

for fold in sss.split(target, target):
    print(*fold)

[0 0 0 0 0 1 1 1 1 1]
[7 4 0 2 1 8 6 5] [3 9]
[8 1 5 0 4 2 6 7] [9 3]
[6 8 4 2 0 7 1 5] [9 3]
[1 7 8 3 2 4 6 5] [9 0]


#### Leave-One-Out

In [15]:
# Leave-one-out: every sample is used as the single-element test set once.
loo = model_selection.LeaveOneOut()

for fold in loo.split(range(2, 21, 2)):
    print(*fold)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators