### K-fold Cross-Validation Algorithm: 
    
Datasets: cars for regression and IRIS for classification.


ML Model : kNN(3 nearest neighbours)

In [1]:
# importing libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import random 
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score
from statistics import mean

In [2]:
## Function for K fold Cross Validation Algorithm.

# The function takes the data to be split into training and validation folds and
# the parameter k, which gives the number of folds (validation cycles) to run.
# Depending on the data type of the target variable, a regression or a classification score is computed.
# The function returns the aggregated K-fold cross-validated score (accuracy or RMSE).

def K_Fold_CV(k,X_train_valid,y_train_valid,n_neighbors=3):
    """Return the mean k-fold cross-validation score of a kNN model.

    The rows are partitioned, in the order given, into ``k`` contiguous folds
    whose sizes differ by at most one, so every row is validated exactly once.
    Each fold serves once as the validation set while the remaining rows are
    used to fit a kNN model. No shuffling happens here; shuffle the data
    beforehand if the row order is not already random.

    Parameters
    ----------
    k : int
        Number of folds. Must satisfy ``2 <= k <= len(X_train_valid)``.
    X_train_valid : pandas.DataFrame
        Feature rows to split into training and validation folds.
    y_train_valid : pandas.DataFrame or pandas.Series
        Targets, aligned row-by-row (positionally) with ``X_train_valid``.
        If the (first) target column is non-numeric the problem is treated as
        classification, otherwise as regression.
    n_neighbors : int, default 3
        Number of neighbours used by the kNN model.

    Returns
    -------
    float
        Mean accuracy over the folds for classification, mean RMSE over the
        folds for regression.

    Raises
    ------
    ValueError
        If ``X_train_valid`` and ``y_train_valid`` have different lengths, or
        if ``k`` is smaller than 2 or larger than the number of rows.
    """
    n_rows = len(X_train_valid)
    if len(y_train_valid) != n_rows:
        raise ValueError(
            "X_train_valid and y_train_valid must have the same number of rows "
            "(got %d and %d)" % (n_rows, len(y_train_valid))
        )
    if k < 2 or k > n_rows:
        raise ValueError(
            "k must be between 2 and the number of rows (%d), got %r" % (n_rows, k)
        )

    # The task type depends only on the target dtype, so decide it once.
    # `.iloc[0]` is used because positional `[0]` on the label-indexed dtypes
    # Series is deprecated; a Series target exposes its dtype directly.
    if isinstance(y_train_valid, pd.DataFrame):
        target_dtype = y_train_valid.dtypes.iloc[0]
    else:
        target_dtype = y_train_valid.dtype
    # Non-numeric targets (object, string, categorical) are class labels.
    is_classification = not pd.api.types.is_numeric_dtype(target_dtype)

    # The first `remainder` folds get one extra row so the fold sizes add up
    # to n_rows exactly and no row is left out of validation.
    base_size, remainder = divmod(n_rows, k)

    # one score (accuracy or RMSE) per fold
    fold_scores = []
    fold_start = 0
    for fold in range(k):
        fold_stop = fold_start + base_size + (1 if fold < remainder else 0)

        # validation fold: the current contiguous chunk
        X_valid = X_train_valid.iloc[fold_start:fold_stop]
        y_valid = y_train_valid.iloc[fold_start:fold_stop]

        # training data: everything after the chunk followed by everything
        # before it (pd.concat replaces the removed DataFrame.append)
        X_train = pd.concat([X_train_valid.iloc[fold_stop:], X_train_valid.iloc[:fold_start]])
        y_train = pd.concat([y_train_valid.iloc[fold_stop:], y_train_valid.iloc[:fold_start]])

        # move on to the next chunk
        fold_start = fold_stop

        if is_classification:
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_valid)

            # classification folds are scored by accuracy
            fold_scores.append(float(accuracy_score(y_valid, y_pred)))
        else:
            knn = KNeighborsRegressor(n_neighbors=n_neighbors)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_valid)

            # regression folds are scored by RMSE; flattening both sides keeps
            # the subtraction element-wise whether predictions are 1-D or (n, 1)
            residuals = (
                np.asarray(y_valid, dtype=float).ravel()
                - np.asarray(y_pred, dtype=float).ravel()
            )
            fold_scores.append(float(np.sqrt(np.mean(residuals ** 2))))

    # aggregated score: plain mean over the folds
    return mean(fold_scores)
#--------------------------------------------------------------------------------------------#

# main program

# Regression demo: 3-fold cross-validated RMSE of kNN on the cars data

# load the raw data
cars = pd.read_csv("cars.csv")

# shuffle every row of a copy (fixed seed) so the loaded frame stays untouched
data = cars.copy().sample(n=len(cars), random_state=100)

# predictors (independent variables) and target (dependent variable)
X = data.loc[:, ["Weight","Horsepower", "Displacement","Acceleration"]]
y = data.loc[:, ["MPG"]]

# 80:20 train/test split by row position
n_train_rows = int(len(data)*0.8)
X_train, X_test = X.iloc[:n_train_rows], X.iloc[n_train_rows:]
y_train, y_test = y.iloc[:n_train_rows], y.iloc[n_train_rows:]

# cross-validate on the training portion and print the aggregated score
print('\nRegression- cars dataset:')
print('\nAggreated K-fold CV score- RMSE:',K_Fold_CV(3,X_train,y_train))

#----------------------------------------------------------------------------------------------------#

# Classification demo: 3-fold cross-validated accuracy of kNN on the IRIS data

# load the raw data
iris = pd.read_csv("IRIS.csv")

# shuffle every row of a copy (fixed seed) so the loaded frame stays untouched
data = iris.copy().sample(n=len(iris), random_state=100)

# predictors (independent variables) and target (dependent variable)
X = data.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = data.loc[:, ["species"]]

# 80:20 train/test split by row position
n_train_rows = int(len(data)*0.8)
X_train, X_test = X.iloc[:n_train_rows], X.iloc[n_train_rows:]
y_train, y_test = y.iloc[:n_train_rows], y.iloc[n_train_rows:]

# cross-validate on the training portion and print the aggregated score
print('\n\nClassification- IRIS dataset:')
print('\nAggreated K-fold CV score- Accuracy:',K_Fold_CV(3,X_train,y_train))



Regression- cars dataset:

Aggreated K-fold CV score- RMSE: 5.139255093137315


Classification- IRIS dataset:

Aggreated K-fold CV score- Accuracy: 0.95
