# KFold CV calculation on the Digits Dataset

There are different type of cross-validation iterators. In followind scikit-learn page, you can find some of them,

http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators

In this tutorial, we will work with **KFold** cross validation iterator.

In [1]:
from sklearn.datasets import load_digits

digits = load_digits()

X = digits.data
y = digits.target

In [2]:
print(y)

[0 1 2 ..., 8 9 8]


In [3]:
type(X)

numpy.ndarray

**k** is defined in **n_folds** parameter:

In [4]:
from sklearn.cross_validation import KFold

kf_digits = KFold(1797, n_folds=5, shuffle=True)
print(kf_digits)

sklearn.cross_validation.KFold(n=1797, n_folds=5, shuffle=True, random_state=None)


In [5]:
for train_index, test_index in kf_digits:
    print("Len. Test:", len(test_index))

Len. Test: 360
Len. Test: 360
Len. Test: 359
Len. Test: 359
Len. Test: 359


As you see above, **KFold** generated **n_folds** different test sets with different features. That means, **KFold** shuffled first the entire data set and then divided them to **n_folds** test sets.

In [6]:
import numpy as np
from sklearn.svm import SVC
print(SVC())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [7]:
accuracy = []
for train_index, test_index in kf_digits:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = SVC().fit(X_train, y_train)

    i_correct = 0
    for i in range(len(X_test)):
        if (clf.predict(X_test)[i] == y_test[i]):
            i_correct += 1

    accuracy.append(i_correct*100/len(y_test))

print(accuracy)
print("Accuracy: %0.2f (+/- %0.2f)" %(np.mean(accuracy), np.sqrt(np.std(accuracy))))

[44.44444444444444, 40.55555555555556, 49.02506963788301, 63.788300835654596, 56.26740947075209]
Accuracy: 50.82 (+/- 2.89)


In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')


In [9]:
accuracy = []
for train_index, test_index in kf_digits:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = knn.fit(X_train, y_train)

    i_correct = 0
    for i in range(len(X_test)):
        if (clf.predict(X_test)[i] == y_test[i]):
            i_correct += 1

    accuracy.append(i_correct*100/len(y_test))

print(accuracy)
print("Accuracy: %0.2f (+/- %0.2f)" %(np.mean(accuracy), np.sqrt(np.std(accuracy))))

[97.22222222222223, 98.33333333333333, 98.05013927576601, 98.60724233983287, 98.32869080779945]
Accuracy: 98.11 (+/- 0.69)


In [10]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
print(dtc)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


In [11]:
accuracy = []
for train_index, test_index in kf_digits:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = dtc.fit(X_train, y_train)

    i_correct = 0
    for i in range(len(X_test)):
        if (clf.predict(X_test)[i] == y_test[i]):
            i_correct += 1

    accuracy.append(i_correct*100/len(y_test))

print(accuracy)
print("Accuracy: %0.2f (+/- %0.2f)" %(np.mean(accuracy), np.sqrt(np.std(accuracy))))

[83.33333333333333, 85.83333333333333, 84.67966573816156, 88.85793871866295, 85.79387186629526]
Accuracy: 85.70 (+/- 1.35)


In [12]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [13]:
accuracy = []
for train_index, test_index in kf_digits:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = gnb.fit(X_train, y_train)

    i_correct = 0
    for i in range(len(X_test)):
        if (clf.predict(X_test)[i] == y_test[i]):
            i_correct += 1

    accuracy.append(i_correct*100/len(y_test))

print(accuracy)
print("Accuracy: %0.2f (+/- %0.2f)" %(np.mean(accuracy), np.sqrt(np.std(accuracy))))

[81.94444444444444, 79.72222222222223, 88.85793871866295, 83.84401114206128, 84.958217270195]
Accuracy: 83.87 (+/- 1.75)
