# Cross Validation - K-Fold


- `cross_val_score()` does three things in one call:
  - splits the data into folds,
  - trains the model on each training split, and
  - returns a score (accuracy) for each fold.

In [7]:
import pandas as pd 

from sklearn.datasets import load_digits
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix 
import seaborn as sn 


from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

import numpy as np

# Load Data

In [4]:
# Load scikit-learn's bundled digits dataset (8x8 pixel images).
# `digits.data` and `digits.target` are the inputs and labels used for
# cross-validation in the cells below.
digits = load_digits()

# Build The model

In [5]:
# Number of folds passed as `cv` to cross_val_score in the cells below.
NUM_OF_CROSS_VALIDATION: int = 10

In [6]:
# Baseline model: a random forest with 20 trees.
# random_state pins the forest's internal randomness so that re-running this
# cell on a fresh kernel prints the same fold scores.
model = RandomForestClassifier(n_estimators=20, random_state=42)

# cross_val_score splits the data into NUM_OF_CROSS_VALIDATION folds, fits the
# model on each training split and returns one score per fold.
print(cross_val_score(model, digits.data, digits.target, cv=NUM_OF_CROSS_VALIDATION))

[0.90555556 0.96666667 0.94444444 0.92222222 0.95555556 0.97222222
 0.97777778 0.96648045 0.93296089 0.9273743 ]


# Try Several Model Hyperparameters with 10 Folds

In [8]:
# Cross-validate forests of increasing size with the same fold count.
# One generator replaces four copy-pasted calls; the tuple unpacking keeps the
# original variable names (score20 ... score200) that later cells read.
# Every forest gets the same random_state so the comparison is repeatable
# across re-runs instead of changing with each execution.
score20, score50, score100, score200 = (
    cross_val_score(
        RandomForestClassifier(n_estimators=n_trees, random_state=42),
        digits.data,
        digits.target,
        cv=NUM_OF_CROSS_VALIDATION,
    )
    for n_trees in (20, 50, 100, 200)
)

# Get the Average Score for Each Training Run

In [9]:
# Print the mean cross-validated score for each forest size, in the order
# 20, 50, 100 and 200 trees (one value per line).
# In the output recorded below, 50 trees has the highest mean, but the gaps
# between configurations are small compared with the fold-to-fold variation
# seen in the per-fold scores earlier.
for fold_scores in (score20, score50, score100, score200):
    print(np.average(fold_scores))

0.9337740533829919
0.95268156424581
0.9482371198013656
0.9471322160148976


# K-Fold Splitter

- Only splits the data into folds (it does not train or score the model)

In [11]:
from sklearn.model_selection import KFold

In [12]:
# KFold only defines how the rows are divided into train/test folds.
# shuffle=True randomises the row order before splitting; random_state pins
# that shuffle so the folds are identical on every run. The fold count reuses
# the notebook-wide constant instead of a second hard-coded 10.
kfold = KFold(n_splits=NUM_OF_CROSS_VALIDATION, shuffle=True, random_state=42)

# A 20-tree forest, seeded so its scores are repeatable.
model = RandomForestClassifier(n_estimators=20, random_state=42)

# Pass the splitter object as `cv` instead of an integer fold count;
# cross_val_score fits and scores the model once per fold it yields.
scores = cross_val_score(model, digits.data, digits.target, cv=kfold)


In [13]:
# Summarise the per-fold scores: their average and their spread
# (standard deviation) across folds.
print(np.mean(scores), np.std(scores))

0.9437957790192426 0.02270820768895843


# Stratified K-Fold

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
# StratifiedKFold splits like KFold but keeps the class proportions of the
# target roughly the same in every fold.
# random_state pins the shuffle so the folds are identical on every run, and
# the fold count reuses the notebook-wide constant instead of a hard-coded 10.
stratisfiedkfold = StratifiedKFold(
    n_splits=NUM_OF_CROSS_VALIDATION, shuffle=True, random_state=42
)

# A 20-tree forest, seeded so its scores are repeatable.
model = RandomForestClassifier(n_estimators=20, random_state=42)

# Pass the splitter object as `cv` instead of an integer fold count;
# cross_val_score fits and scores the model once per fold it yields.
scores = cross_val_score(model, digits.data, digits.target, cv=stratisfiedkfold)


In [16]:
# Summarise the stratified-fold scores: their average and their spread
# (standard deviation) across folds.
print(np.mean(scores), np.std(scores))

0.9671570453134699 0.010120221911398978


# RepeatedStratifiedKFold

Repeats stratified K-fold cross-validation n times, with a different randomization in each repetition.

Required: 
- 10 folds × 5 repeats → 50 training runs

In [18]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [20]:
# RepeatedStratifiedKFold runs stratified K-fold n_repeats times with a
# different randomization per repetition: 10 folds x 5 repeats = 50 fits.
# random_state pins those randomizations so the 50 splits are identical on
# every run; the fold count reuses the notebook-wide constant.
repeatedstratisfiedkfold = RepeatedStratifiedKFold(
    n_splits=NUM_OF_CROSS_VALIDATION, n_repeats=5, random_state=42
)

# A 20-tree forest, seeded so its scores are repeatable.
model = RandomForestClassifier(n_estimators=20, random_state=42)

# Pass the splitter object as `cv` instead of an integer fold count;
# cross_val_score returns one score per (repeat, fold) split.
scores = cross_val_score(model, digits.data, digits.target, cv=repeatedstratisfiedkfold)

In [21]:
# Summarise the scores from all repeated splits: their average and their
# spread (standard deviation).
print(np.mean(scores), np.std(scores))

0.9662793296089386 0.013071665148810923
