In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import set_config
from sklearn.datasets import load_digits
from sklearn.datasets import load_iris

set_config(print_changed_only=False)


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

%config InlineBackend.figure_format='svg'
plt.rcParams['figure.dpi']=120

pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)


<h1 style='color:blue' align='center'>KFold Cross Validation</h1>
<img src='Kfoldcrossvalidation.jpg' width=1000 height=600>


<a href="https://medium.com/analytics-vidhya/training-validation-and-test-set-in-machine-learning-7fab555c1080">Reference</a>

In [2]:
from sklearn.datasets import load_digits
digits = load_digits()

In [3]:
X=digits.data
y=digits.target

X_train, X_test, y_train, y_test = train_test_split(digits.data,digits.target,test_size=0.3)

**Logistic Regression**

In [4]:
lr = LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

0.9555555555555556

**SVM**

In [5]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

0.3814814814814815

**Random Forest**

In [6]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=40, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

0.9777777777777777

<h2 style='color:blue'>KFold cross validation</h2>

In [7]:
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [8]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


**Use KFold for our digits example**

In [9]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [10]:
folds = StratifiedKFold(n_splits=3)

scores_logistic = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(X, y):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], \
                                       y[train_index], y[test_index]
    scores_logistic.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), 
                                     X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), 
                                X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), 
                               X_train, X_test, y_train, y_test))

In [11]:
scores_logistic
max(scores_logistic)

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]

0.9532554257095158

In [12]:
scores_svm
max(scores_svm)

[0.3806343906510851, 0.41068447412353926, 0.5125208681135225]

0.5125208681135225

In [13]:
scores_rf
max(scores_rf)

[0.9348914858096828, 0.9482470784641068, 0.9148580968280468]

0.9482470784641068

<h2 style='color:blue'>Cross Validation Score Function</h2>

In [14]:
# Logistic regression model performance using cross_val_score
cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X, y, cv=3)

# svm model performance using cross_val_score
cross_val_score(SVC(gamma='auto'), X, y, cv=3)

# random forest performance using cross_val_score
cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=3)

array([0.89482471, 0.95325543, 0.90984975])

array([0.38063439, 0.41068447, 0.51252087])

array([0.94991653, 0.95659432, 0.92320534])

cross_val_score uses stratifield kfold by default

<h2 style='color:blue'>Parameter tunning using k fold cross validation</h2>

In [15]:
scores1 = cross_val_score(RandomForestClassifier(n_estimators=5), X, y, cv=10)
np.average(scores1)
# scores1.mean()

scores2 = cross_val_score(RandomForestClassifier(n_estimators=20), X, y, cv=10)
np.average(scores2)

scores3 = cross_val_score(RandomForestClassifier(n_estimators=30), X, y, cv=10)
np.average(scores3)

scores4 = cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=10)
np.average(scores4)

scores5 = cross_val_score(RandomForestClassifier(n_estimators=50), X, y, cv=10)
np.average(scores5)

scores6 = cross_val_score(RandomForestClassifier(n_estimators=100), X, y, cv=10)
np.average(scores6)

0.8759342023587834

0.9371166977032898

0.9443482309124767

0.9415859714463066

0.9482278088144008

0.9493451272501551

### Exercise

Use iris flower dataset from sklearn library and use cross_val_score against following
models to measure the performance of each. In the end figure out the model with best performance,
1. Logistic Regression
2. SVM
3. Decision Tree
4. Random Forest

In [16]:
iris=load_iris()

In [17]:
X=iris.data
y=iris.target

In [18]:
lr_score=cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X, y, cv=10)
lr_score.mean()

svc_score=cross_val_score(SVC(gamma='auto'), X, y, cv=10)
svc_score.mean()

dt_score=cross_val_score(DecisionTreeClassifier(), X, y, cv=10)
dt_score.mean()

rf_score=cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=10)
rf_score.mean()

0.9533333333333334

0.9800000000000001

0.96

0.96

**Best score is from SVM: 0.9800000000000001**