In [None]:
import numpy as np
import pandas as pd

# K-Fold Cross-Validation

In [1]:
class CrossValidation:
    '''
        K-fold cross-validation used to pick a learning rate.

        The data is shuffled and cut into ``k`` folds; each fold is used once
        as the validation set while the remaining folds form the training set.

        Parameters
        ----------
        k : number of folds.
        max_iteration : forwarded as ``max_iter`` to every model built in
            ``cross_validation``.
    '''

    def __init__(self, k=10, max_iteration=10):
        self.k = k
        self.max_iteration = max_iteration

    def split_train_test(self, data, step):
        '''
            Build the training and validation sets for one round.

            ``data`` is the list of ``(X_fold, y_fold)`` pairs produced by
            ``cross_validation_split``. The fold at position ``step`` becomes
            the validation set; all the other folds are concatenated (in
            their original order) into the training set. ``data`` itself is
            not modified.

            Returns ``X_train, y_train, X_test, y_test`` where the training
            parts are numpy arrays and the validation parts are the held-out
            fold as stored in ``data``.

            Raises ``IndexError`` if ``step`` is out of range and
            ``ValueError`` if there is no fold left to train on.
        '''
        # Indexing a range normalises negative positions and raises
        # IndexError for out-of-range ones, exactly like list indexing.
        held_out = range(len(data))[step]

        X_test, y_test = data[held_out]

        # The training set is every fold EXCEPT the held-out one.
        train_folds = [
            fold for position, fold in enumerate(data) if position != held_out
        ]
        if not train_folds:
            raise ValueError(
                'at least two folds are required to build a training set'
            )

        X_train = np.concatenate([fold[0] for fold in train_folds])
        y_train = np.concatenate([fold[1] for fold in train_folds])
        return X_train, y_train, X_test, y_test

    def cross_validation_split(self, X, y):
        '''
            Randomly split the data into ``k`` folds.

            ``X`` and ``y`` are shuffled with the same random permutation so
            that samples and targets stay aligned, then cut into ``k``
            consecutive pieces. Every sample ends up in exactly one fold:
            when ``len(X)`` is not a multiple of ``k`` the first
            ``len(X) % k`` folds receive one extra sample.

            Returns a list of ``k`` tuples ``(X_fold, y_fold)``.

            Raises ``ValueError`` if ``X`` and ``y`` differ in length or if
            ``k`` is not between 1 and the number of samples.
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)

        if len(y) != n_samples:
            raise ValueError(
                'X and y must contain the same number of samples, got '
                '%d and %d' % (n_samples, len(y))
            )
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                'k must be between 1 and the number of samples (%d), got %r'
                % (n_samples, self.k)
            )

        permut_index = np.random.permutation(n_samples)
        # array_split (unlike a fixed-size slice) keeps the remainder samples.
        X_folds = np.array_split(X[permut_index], self.k)
        y_folds = np.array_split(y[permut_index], self.k)
        return list(zip(X_folds, y_folds))

    def cross_validation(self, learning_rate, X, y):
        '''
            Select the learning rate with the lowest mean validation error.

            ``learning_rate`` is an iterable of candidate values. For each
            candidate a ``LinearRegression`` model is trained with
            ``fit_SGD`` on every training split and scored with
            ``error_prediction`` on the matching validation fold; the scores
            are averaged over the ``k`` folds.

            Returns a dict with the keys ``'Best_hyperparameter'`` (the
            winning learning rate) and ``'RMSE'`` (its mean validation error;
            presumably an RMSE, as the key says -- this depends on what
            ``error_prediction`` computes).

            Raises ``ValueError`` if ``learning_rate`` is empty.
        '''
        learning_rate = list(learning_rate)
        if not learning_rate:
            raise ValueError('learning_rate must contain at least one value')

        # Shuffle and split once so every candidate is scored on the same folds.
        data_split = self.cross_validation_split(X, y)

        generalization_error = []
        for alpha in learning_rate:
            errors = []
            for i in range(self.k):
                X_train, y_train, X_test, y_test = self.split_train_test(
                    data_split, i
                )
                # A fresh model per fold, so that no fitted state can carry
                # over from one fold to the next.
                linearReg = LinearRegression(
                    lr=alpha, max_iter=self.max_iteration
                )
                linearReg.fit_SGD(np.array(X_train), np.array(y_train))
                errors.append(linearReg.error_prediction(X_test, y_test))
            generalization_error.append(np.mean(errors))

        best_index = int(np.argmin(generalization_error))
        return {
            'Best_hyperparameter': learning_rate[best_index],
            'RMSE': generalization_error[best_index],
        }