### Курс "Машинное обучение"

### Тема занятия: Кросс-валидация

In [1]:
from platform import python_version

# Report the version of the interpreter that runs this notebook.
interpreter_version = python_version()
print(interpreter_version)

3.8.8


In [12]:
# Small demo list; a slice with an explicit step of 1 covers every element.
l = [1, 2, 3, 4, 5]

every_element = l[::1]
print(every_element)


[1, 2, 3, 4, 5]


# Сравнение случайного поиска и поиска по сетке при подборе лучшего набора гиперпараметров

## Вспомогательные функции

### Разбиение базы на обучающую и тестовую

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load the iris data directly as a (features, labels) pair.
X, y = datasets.load_iris(return_X_y=True)

# Hold out 40% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

print('X.shape, y.shape: ', X.shape, y.shape)
print(
    'X_train.shape, y_train.shape, X_test.shape, y_test.shape: ',
    *(part.shape for part in (X_train, y_train, X_test, y_test)),
)
y  # last expression of the cell, so the notebook displays the label array

X.shape, y.shape:  (150, 4) (150,)
X_train.shape, y_train.shape, X_test.shape, y_test.shape:  (90, 4) (90,) (60, 4) (60,)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Посмотрим на качество работы на k блоках в k-fold кросс-валидации

In [2]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

# Linear-kernel SVM evaluated with 5-fold cross-validation on the training split only;
# `score` holds one value per fold.
clf = svm.SVC(kernel='linear', C=1)
score = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print('Validation score on folds: ', score)

Validation score on folds:  [1.         1.         1.         1.         0.94444444]


### Также можно посмотреть на результат классификации на каждом примере

Обратите внимание, что каждый пример входит ровно в один тестовый блок — именно для этого блока и выводится его предсказание.

In [None]:
from sklearn.model_selection import cross_val_predict

# Same linear SVM as before, but collect a cross-validated prediction for every
# training example instead of one score per fold.
clf = svm.SVC(kernel='linear', C=1)
classes_pred = cross_val_predict(clf, X_train, y_train, cv=5)

# Accuracy over all examples = number of matching labels / number of examples.
n_correct = (classes_pred == y_train).sum()
print('Class prediction: ', classes_pred)
print('Total accuracy: ', n_correct / y_train.shape[0])
print('Total #examples', classes_pred.shape)

### Генератор (на самом деле, итератор) разбиений для k-fold кросс-валидации

In [None]:
from sklearn.model_selection import KFold

# Tiny four-element array so the generated index sets are easy to read.
X1 = np.array(["a", "b", "c", "d"])
kf = KFold(n_splits=2)

# split() yields a (train indices, test indices) pair for each fold.
for train_idx, test_idx in kf.split(X1):
    print("train_idx: %s test_idx: %s" % (train_idx, test_idx))
    train_el, test_el = X1[train_idx], X1[test_idx]
    print(f"train_el: {train_el} test_el: {test_el}")

### Генератор разбиений для LOO кросс-валидации

In [None]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out splitter applied to the demo array X1 from the k-fold cell.
loo = LeaveOneOut()

# split() yields a (train indices, test indices) pair per iteration.
for train_idx, test_idx in loo.split(X1):
    print("train_idx: %s test_idx: %s" % (train_idx, test_idx))
    train_el, test_el = X1[train_idx], X1[test_idx]
    print(f"train_el: {train_el} test_el: {test_el}")

### Генератор разбиений для многократной k-fold кросс-валидации

In [None]:
from sklearn.model_selection import RepeatedKFold

# 2-fold splitting repeated twice; the seed fixes the generated splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=123456)

# split() yields a (train indices, test indices) pair per iteration.
for train_idx, test_idx in rkf.split(X1):
    print("train_idx: %s test_idx: %s" % (train_idx, test_idx))
    train_el, test_el = X1[train_idx], X1[test_idx]
    print(f"train_el: {train_el} test_el: {test_el}")

## Поиск гиперпараметров

### Поиск по сетке

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for an RBF-kernel SVM: 5 values of C x 5 values of gamma.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1.0, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
svc = svm.SVC()

# refit=True: once the search is done, refit the estimator with the best
# parameters found on the whole dataset passed to fit().
clf = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, refit=True)
clf.fit(X_train, y_train)  # run the cross-validated grid search

print('Best params: ', clf.best_params_)
print('Validation score: ', clf.best_score_)

# Evaluate the refitted best model on the held-out test split.
score = clf.score(X_test, y_test)
print('Test score: ', score)

### Случайный поиск по распределению

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# Sample C and gamma from log-uniform distributions spanning the same ranges
# as the grid used in the grid-search cell.
param_dist = {'C': loguniform(0.1, 1000), 'gamma': loguniform(0.0001, 1.0), 'kernel': ['rbf']}
svc = svm.SVC()

# random_state fixes the sampled candidates so the cell gives the same result on
# every run (same seed as the other randomized-search cells in this notebook).
# refit=True: refit the estimator with the best parameters found on the whole
# dataset passed to fit().
clf = RandomizedSearchCV(svc, param_dist, n_iter=25, cv=5, random_state=123456, refit=True)
clf.fit(X_train, y_train)  # run the cross-validated randomized search
print('Best params: ', clf.best_params_)
print('Validation score: ', clf.best_score_)
score = clf.score(X_test, y_test)  # evaluate the refitted best model on the test split
print('Test score: ', score)

### Сравнение подходов

Загрузим более представительную базу данных

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import datasets
from sklearn import svm
from scipy.stats import loguniform

# Load the digits data as a (features, labels) pair; the actual shapes are printed below.
X, y = datasets.load_digits(return_X_y=True)

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123456,
)

print('X.shape, y.shape: ', X.shape, y.shape)
print(
    'X_train.shape, y_train.shape, X_test.shape, y_test.shape: ',
    *(part.shape for part in (X_train, y_train, X_test, y_test)),
)

Запускаем поиск по сетке размерности 9*9 = 81

In [None]:
import time  # used to measure how long the search takes

# 9 values of C x 9 values of gamma = 81 candidate settings for an RBF-kernel SVM.
param_grid = {'C': [0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000],
              'gamma': [1.0, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001], 'kernel': ['rbf']}
svc = svm.SVC()
# refit=True: refit the estimator with the best parameters found on the whole
# dataset passed to fit().
clf = GridSearchCV(svc, param_grid, cv=5, refit=True)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and may jump if the system clock changes.
start = time.perf_counter()
clf.fit(X_train, y_train)  # run the cross-validated grid search
elapsed = time.perf_counter() - start  # seconds spent in the search

print('GridSearchCV params:', clf.best_params_)
score = clf.score(X_test, y_test)  # evaluate the refitted best model on the test split
print('GridSearchCV test score:', score)
print(f'Time: {elapsed/60} min')

Запускаем случайный поиск по лог-равномерному распределению (так же, как в сетке), всего 81 раз

In [None]:
# Sample C and gamma from log-uniform distributions spanning the same ranges as the grid.
param_dist = {'C': loguniform(0.1, 1000), 'gamma': loguniform(0.0001, 1.0), 'kernel': ['rbf']}
svc = svm.SVC()
# n_iter=9*9 draws as many candidates as the 9x9 grid had points.
# refit=True: refit the estimator with the best parameters found on the whole
# dataset passed to fit().
clf = RandomizedSearchCV(svc, param_dist, n_iter=9*9, cv=5, random_state=123456, refit=True)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and may jump if the system clock changes.
start = time.perf_counter()
clf.fit(X_train, y_train)  # run the cross-validated randomized search
elapsed = time.perf_counter() - start  # seconds spent in the search

print('RandomizedSearchCV params:', clf.best_params_)
score = clf.score(X_test, y_test)  # evaluate the refitted best model on the test split
print('RandomizedSearchCV test score:', score)
print(f'Time: {elapsed/60} min')

Как видим, получилось даже лучше.

Теперь сэкономим время и запустим случайный поиск всего на 10 итераций

In [None]:
# Sample C and gamma from log-uniform distributions spanning the same ranges as the grid.
param_dist = {'C': loguniform(0.1, 1000), 'gamma': loguniform(0.0001, 1.0), 'kernel': ['rbf']}
svc = svm.SVC()
# Only 10 sampled candidates this time, to trade search thoroughness for speed.
# refit=True: refit the estimator with the best parameters found on the whole
# dataset passed to fit().
clf = RandomizedSearchCV(svc, param_dist, n_iter=10, cv=5, random_state=123456, refit=True)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and may jump if the system clock changes.
start = time.perf_counter()
clf.fit(X_train, y_train)  # run the cross-validated randomized search
elapsed = time.perf_counter() - start  # seconds spent in the search

print('RandomizedSearchCV params:', clf.best_params_)
score = clf.score(X_test, y_test)  # evaluate the refitted best model on the test split
print('RandomizedSearchCV test score:', score)
print(f'Time: {elapsed/60} min')

**Вывод**: порой не стоит скрупулёзно перебирать всю сетку — можно сэмплировать гиперпараметры из распределения и потратить в разы меньше времени!

In [70]:
from sklearn.model_selection import KFold
import numpy as np

In [57]:
import pandas as pd  # no cell visible in this notebook imports pandas before `pd` is used

# Read the training table from the working directory and preview its first rows.
data = pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [114]:
import numpy as np
from sklearn.model_selection import KFold


# Split the rows of `data` into two complementary random halves and save them as
# two train/validation pairs with the roles swapped.
# The fixed seed makes the saved files reproducible between runs.
kf = KFold(n_splits=2, shuffle=True, random_state=123456)

# With n_splits=2 the training indices of the last split equal the test indices of
# the first split, so the two test folds are exactly the two halves the original
# loop ended up with (train_1 = first test fold, train_2 = second test fold).
train_1, train_2 = (test_idx for _, test_idx in kf.split(data))

# KFold yields positional indices, so select rows by position (.iloc), not by label (.loc).
frame_1 = data.iloc[train_1]
frame_2 = data.iloc[train_2]

# Pair 1: train on half 1, validate on half 2.
frame_1.to_csv('train_1.csv', index=False)
frame_2.to_csv('val_1.csv', index=False)
# Pair 2: train on half 2, validate on half 1.
frame_2.to_csv('train_2.csv', index=False)
frame_1.to_csv('val_2.csv', index=False)

