In [72]:
## Cross Validation for binary classification problem using LOOCV

# https://machinelearningmastery.com/k-fold-cross-validation/
# https://machinelearningmastery.com/loocv-for-evaluating-machine-learning-algorithms/

import pandas as pd

# sonar data set
# sonar data set description
# https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.names
    
    
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'

df = pd.read_csv(url, header=None )

X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]


print('df shape', df.shape)
print('X shape', X.shape)
print('y shape', y.shape)



df shape (208, 61)
X shape (208, 60)
y shape (208,)


In [84]:
y = y.replace('R', 0)
y = y.replace('M', 1)

In [96]:
# cross validation using cross_val_score method

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
# loocv evaluate random forest on the sonar dataset

from numpy import mean
from numpy import std

classifier = RandomForestClassifier(random_state=1)
loo = LeaveOneOut()

scores = cross_val_score(classifier, X, y, scoring='accuracy', cv= loo, n_jobs=-1)
print(scores)
print('Accuracy of classification with LOO mean {} and standard deviation {}'.format(mean(scores), std(scores)) )


# Notes: In this case score is eaither 1 or 0. As each validation set is just one which either matches or not.

[0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Accuracy of classification with LOO mean 0.8317307692307693 and standard deviation 0.3741051947588567


In [99]:
from sklearn.model_selection import KFold 

classifier = RandomForestClassifier(random_state=1)
kfold = KFold(n_splits=10)

scores = cross_val_score(classifier, X, y, scoring='accuracy', cv= kfold, n_jobs=-1)
print(scores)
print('Accuracy of classification with kFold mean {} and standard deviation {}'.format(mean(scores), std(scores)) )

# Note in this case each validation set is multiple record hence the value is average of total right prediction 


[0.28571429 0.66666667 0.71428571 0.61904762 0.47619048 0.38095238
 0.52380952 0.47619048 0.7        0.5       ]
Accuracy of classification with kFold mean 0.5342857142857143 and standard deviation 0.1333809438805866


In [101]:
# while we have used exactly the same estimator in case of Kfold as LeaveOneOut accuracy is very less 0.53 vs 0.83
# This probably due to reduced training data set with K=10 training set is 10% smaller. 
# This may be a huge impact due to small data set
# Let's test this hypothesis through a small simulation 
# We should test accuracy of mode for multiple value of K. K = 208 is same as LOO.


ks = [3,5,10,50,100,208] 

accs = list()

for k in ks : 
    
    classifier = RandomForestClassifier(random_state=1)
    kfold = KFold(n_splits=k)
    scores = cross_val_score(classifier, X, y, scoring='accuracy', cv= kfold, n_jobs=-1)
    accs.append(mean(scores))
    
print(accs)

# we got accuracy [0.255, 0.417, 0.534, 0.77, 0.86, 0.83]
# note for K-208 accuracy is same as LOO.

[0.25507246376811593, 0.41707317073170735, 0.5342857142857143, 0.77, 0.86, 0.8317307692307693]


## LOOCV for Regression

In [2]:
#Data set to be used for this exercise is Housing price 
# https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv
# data description >  https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.names

import pandas as pd

df_h = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv', header=None) 
X = df_h.iloc[:,  0:-1]
y = df_h.iloc[:, -1]


print('df_hshape', df_h.shape)
print('X shape', X.shape)
print('y shape', y.shape)


df_hshape (506, 14)
X shape (506, 13)
y shape (506,)


In [14]:
# Leave One Out CV 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestRegressor
# loocv evaluate random forest on the hosuing dataset

from numpy import mean
from numpy import std

loo = LeaveOneOut()
regressor = RandomForestRegressor(random_state=1)
scores = cross_val_score(regressor, X, y, scoring='neg_mean_absolute_error', cv=loo, n_jobs=-1)

print(scores)
print('Accuracy of classification with LOO mean {} and standard deviation {}'.format(mean(scores), std(scores)) )


[-5.0300e+00 -1.2520e+00 -1.0500e-01 -1.1870e+00 -2.0980e+00 -5.1560e+00
 -2.7600e+00 -9.6870e+00 -2.0060e+00 -2.2500e-01 -5.9110e+00 -1.5930e+00
 -3.0700e-01 -6.3500e-01 -1.5670e+00 -3.7000e-01 -1.5360e+00 -7.8100e-01
 -1.7130e+00 -1.5990e+00 -1.0420e+00 -1.2110e+00 -6.0100e-01 -9.9600e-01
 -1.1810e+00 -1.6510e+00 -8.7300e-01 -3.1700e-01 -1.7870e+00 -2.4750e+00
 -2.0370e+00 -5.1990e+00 -3.0290e+00 -1.4290e+00 -1.1150e+00 -2.1250e+00
 -8.6700e-01 -3.3300e-01 -3.7470e+00 -2.8180e+00 -1.2250e+00 -4.4430e+00
 -9.6100e-01 -1.6700e-01 -6.6900e-01 -4.3400e-01 -1.1500e-01 -3.0290e+00
 -4.3040e+00 -3.4800e-01 -1.1840e+00 -1.5510e+00 -4.2900e-01 -2.9960e+00
 -3.8500e-01 -1.4720e+00 -1.8240e+00 -7.4200e-01 -3.9500e-01 -7.8900e-01
 -2.2000e-01 -2.8700e+00 -1.9280e+00 -8.1800e-01 -6.3000e-01 -1.4700e+00
 -8.1600e-01 -1.7110e+00 -1.7730e+00 -4.5000e-02 -3.9900e-01 -3.5600e-01
 -2.8600e-01 -5.7300e-01 -9.0000e-02 -1.7050e+00 -9.5400e-01 -7.6400e-01
 -4.4700e-01 -9.9500e-01 -1.0170e+00 -2.2170e+00 -1

In [3]:
# K Fold with K =10

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
# loocv evaluate random forest on the hosuing dataset

from numpy import mean
from numpy import std

kfold = KFold(10)
regressor = RandomForestRegressor(random_state=1)
scores = cross_val_score(regressor, X, y, scoring='neg_mean_absolute_error', cv=kfold, n_jobs=-1)

print(scores)
print('Accuracy of classification with KFold mean {} and standard deviation {}'.format(mean(scores), std(scores)) )


[-2.61635294 -1.49268627 -1.63845098 -3.62576471 -2.72854902 -3.088
 -2.06854    -6.23714    -3.71356    -2.78664   ]
Accuracy of classification with KFold mean -2.9995683921568626 and standard deviation 1.2902475800799147
