Cross-validation is required in machine learning because it provides a more accurate estimate of how well a model will perform on unseen data, preventing overfitting and ensuring the model generalizes well to new situations by evaluating its performance across multiple subsets of the data, rather than just relying on a single train-test split; this allows for better model selection and hyperparameter tuning before deploying the model in real-world scenarios.

In [1]:
# Testing out cross validation with the help of various models. - Logistic regression, SVC and Random Forest

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [3]:
#Using the digits dataset from the sklearn library

from sklearn.datasets import load_digits

digits = load_digits()

In [4]:
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [6]:
#Splitting the dataset into training and testing

from sklearn.model_selection import train_test_split

#Defining X (independent variables or features) and y (dependent variable or target)

X = digits['data']
y = digits['target']

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.3)

## Using Log Reg as the first classifier

In [37]:
log_reg_model = LogisticRegression(max_iter=5000)
log_reg_model.fit(X_train, y_train)

In [38]:
log_reg_model.score(X_test, y_test)

0.9165275459098498

## Using SVC

In [39]:
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [40]:
svc_model.score(X_test, y_test)

0.9549248747913188

## Using Random Forest classifier 



In [41]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

In [42]:
random_forest.score(X_test, y_test)

0.9332220367278798

## If you run these again, the accuracies will change as the distribution of sample in X_train, X_test, y_train and y_test will change

# We can avoid such a situation by using k-fold cross validation where the data will use n possible subsets to avoid overfitting 

In [43]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 3) #creating 3 splits
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [44]:
#Using these folds (visualizing it in a way)

for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [45]:
# We can see that three folds have been created for the data that we supplied. Data has been divided into 3 
# folds of training and testing sets with each fold having different numbers 


# 1st fold is used for testing [0 1 2] and the remaining two folds for training [3 4 5] and [6 7 8]

# In the second iteration, [3 4 5] fold is used for testing and [0 1 2] and
# [6 7 8] for training

# And in the final iteration, [6 7 8] is used for testing and [0 1 2] and [3 4 5] for training

# Using K fold cross validation for the digits dataset

In [46]:
#Defining a function

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

#Using this method to train the model and return an accuracy score

In [49]:
#example of using the function/method

get_score(LogisticRegression(max_iter=5000),X_train, X_test, y_train, y_test)

0.9165275459098498

In [50]:
#Stratified k fold cross val

from sklearn.model_selection import StratifiedKFold #it's lil better when you are seperating out the folds, it will divide all the classification categories in a unifrom way

#eliminates bias to some extent 

In [51]:
#Specifiying the number of splits/folds we need
folds = StratifiedKFold(n_splits = 3)

In [58]:
#Initializing the arrays fro different models

scores_LR = []
scores_SVM = []
scores_RF = []

for train_index, test_index in kf.split(digits['data']):
    X_train, X_test, y_train, y_test = digits['data'][train_index], digits['data'][test_index], \
                                       digits['target'][train_index], digits['target'][test_index]
    
    scores_LR.append(get_score(LogisticRegression(max_iter=5000), X_train, X_test, y_train, y_test))
    scores_SVM.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    scores_RF.append(get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))


In [59]:
scores_LR

[0.9282136894824707, 0.9415692821368948, 0.9165275459098498]

In [60]:
scores_SVM

[0.9666110183639399, 0.9816360601001669, 0.9549248747913188]

In [61]:
scores_RF

[0.9382303839732888, 0.9515859766277128, 0.9232053422370617]

# Cross val score method is a much better/efficient way to use instead of the above code (this was for demonstration purposes only xD)

In [62]:
from sklearn.model_selection import cross_val_score

In [64]:
cross_val_score(LogisticRegression(max_iter=5000), X,y)

array([0.925     , 0.875     , 0.93871866, 0.93314763, 0.89693593])

In [65]:
cross_val_score(SVC(), X,y)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [66]:
cross_val_score(RandomForestClassifier(), X,y)

array([0.93888889, 0.90833333, 0.96100279, 0.98050139, 0.92479109])