In [29]:
import numpy as np  
from sklearn.model_selection import KFold 

# Toy data: four 2-feature samples (rows 0/2 and rows 1/3 hold identical values).
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

# Two-way split with default settings; show the splitter's fold count and repr.
kf = KFold(n_splits=2)
print(kf.get_n_splits(X))
print(kf)

# For each fold, show the row positions and the rows they select.
for fold_train, fold_val in kf.split(X):
    print('--- idx')
    print(fold_train, fold_val)
    print(X[fold_train])
    print('---val data')
    print(X[fold_val])

2
KFold(n_splits=2, random_state=None, shuffle=False)
--- idx
[2 3] [0 1]
[[1 2]
 [3 4]]
---val data
[[1 2]
 [3 4]]
--- idx
[0 1] [2 3]
[[1 2]
 [3 4]]
---val data
[[1 2]
 [3 4]]


In [30]:
import pandas as pd  
# Semicolon-delimited wine-quality CSVs (red and white) hosted on GitHub.
red_url = ('https://raw.githubusercontent.com/PinkWink/ML_tutorial'
           '/master/dataset/winequality-red.csv')
white_url = ('https://raw.githubusercontent.com/PinkWink/ML_tutorial'
             '/master/dataset/winequality-white.csv')

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Colour flag appended as the last column: 1 = red, 0 = white.
red_wine['color'] = 1
white_wine['color'] = 0

# Stack red rows first, then white rows. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it both source frames contribute their
# own 0-based labels, leaving duplicate index values that make label-based
# (.loc) lookups and index-aligned assignments ambiguous.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

In [31]:
# Binary target: 1 when quality is above 5, otherwise 0.
# The comparison is vectorized and cast to int so the class labels stay
# integers (mixing the literals `1` and `0.` in a list would silently upcast
# the whole column to float).
wine['taste'] = (wine['quality'] > 5).astype(int)

# Features: every column except the target and the score it is derived from.
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the seed is fixed at 13.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Depth-2 decision tree fitted on the training portion only.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

# Predict on both portions so train and test accuracy can be compared.
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

train_acc = accuracy_score(y_train, y_pred_tr)
test_acc = accuracy_score(y_test, y_pred_test)
print('Train Acc : ', train_acc)
print('Test Acc : ', test_acc)

Train Acc :  0.7294593034442948
Test Acc :  0.7161538461538461


In [33]:
from sklearn.model_selection import KFold

# Depth-2 tree that will be refitted on each cross-validation fold.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

# Five-way splitter with default settings (no shuffle argument is passed).
kfold = KFold(n_splits=5)

In [34]:
# Print how many rows each split assigns to the training / held-out part.
for fold_train, fold_val in kfold.split(X):
    print(len(fold_train), len(fold_val))

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [35]:
# Manual K-fold cross-validation: refit the tree on each fold's training rows
# and record its accuracy on that fold's held-out rows.
# NOTE: this loop assigns X_train / X_test / y_train / y_test, so any split
# previously stored under those names is replaced by the last fold's data.
cv_accuracy = []
for train_idx, test_idx in kfold.split(X):
    # split() yields row positions, hence positional (.iloc) selection.
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    fold_accuracy = accuracy_score(y_test, pred)
    cv_accuracy.append(fold_accuracy)
cv_accuracy

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [36]:
# Average accuracy over the folds collected in cv_accuracy.
np.asarray(cv_accuracy).mean()

0.709578255462782

## StratifiedKFold

In [37]:
from sklearn.model_selection import StratifiedKFold 
# Fresh depth-2 tree and a five-way stratified splitter.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
skfold = StratifiedKFold(n_splits=5)

# Same manual loop as plain K-fold, except the splitter also receives y.
# NOTE: this loop assigns X_train / X_test / y_train / y_test, so any split
# previously stored under those names is replaced by the last fold's data.
cv_accuracy = []
for train_idx, test_idx in skfold.split(X, y):
    # split() yields row positions, hence positional (.iloc) selection.
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    fold_accuracy = accuracy_score(y_test, pred)
    cv_accuracy.append(fold_accuracy)
cv_accuracy

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

In [38]:
# Average accuracy over the stratified folds collected in cv_accuracy.
np.asarray(cv_accuracy).mean()

0.6888004974240539

In [39]:
from sklearn.model_selection import cross_val_score

# Stratified five-fold evaluation of a depth-2 tree, delegated to
# cross_val_score instead of a hand-written loop.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
skfold = StratifiedKFold(n_splits=5)
cross_val_score(
    wine_tree_cv,
    X,
    y,
    scoring=None,  # no explicit metric is requested
    cv=skfold,
)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [40]:
# Repeat the cross-validated scoring with a deeper (depth-5) tree, reusing
# the existing skfold splitter.
wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)
cross_val_score(
    wine_tree_cv,
    X,
    y,
    scoring=None,  # no explicit metric is requested
    cv=skfold,
)

array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772])

In [41]:
from sklearn.model_selection import cross_validate 
# cross_validate also reports fit/score timings and, because
# return_train_score=True, the per-fold training scores.
# NOTE(review): the estimator passed here is `wine_tree`, not `wine_tree_cv`
# — confirm which one is intended.
cross_validate(
    wine_tree,
    X,
    y,
    scoring=None,
    cv=skfold,
    return_train_score=True,
)

{'fit_time': array([0.00716281, 0.00688624, 0.0062685 , 0.00640106, 0.00791812]),
 'score_time': array([0.00154161, 0.00193167, 0.00158   , 0.00157475, 0.00217652]),
 'test_score': array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595]),
 'train_score': array([0.74773908, 0.74696941, 0.74317045, 0.73509042, 0.73258946])}