Cross-Validation Data Split Implementation
Write a Python function that performs k-fold cross-validation data splitting from scratch. The function should take a dataset (as a 2D NumPy array where each row represents a data sample and each column represents a feature) and an integer k representing the number of folds. The function should split the dataset into k parts, systematically use one part as the test set and the remaining as the training set, and return a list where each element is a tuple containing the training set and test set for each fold.


In [6]:
import numpy as np
def cross_validation(data, k):
    """Split ``data`` into ``k`` contiguous folds for k-fold cross-validation.

    The samples are kept in their original order.  Fold sizes differ by at
    most one: the first ``n_samples % k`` folds receive one extra sample, so
    every sample appears in exactly one test set.

    Args:
        data: 2D array-like; each row is a sample, each column a feature.
        k: Number of folds, with ``1 <= k <= n_samples``.

    Returns:
        A list of ``k`` tuples ``(train, test)``.  ``test`` holds the rows of
        the current fold and ``train`` holds all remaining rows, both as
        nested Python lists.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples (some test folds would be empty).
    """
    data = np.asarray(data)
    n_samples = data.shape[0]
    if k < 1 or k > n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # Floor division alone would leave the last n_samples % k rows out of
    # every test fold; hand that remainder out one row per leading fold.
    base_size, extra = divmod(n_samples, k)

    splits = []
    begin = 0
    for fold in range(k):
        end = begin + base_size + (1 if fold < extra else 0)
        test = data[begin:end].tolist()
        train = np.concatenate((data[:begin], data[end:])).tolist()
        splits.append((train, test))
        begin = end
    return splits

In [7]:
# Alternative solution: shuffle the samples (seeded for reproducibility) before splitting into folds.
import numpy as np
def cross_validation2(data, k, seed):
    """Shuffle the samples reproducibly, then split them into ``k`` folds.

    The caller's ``data`` is left untouched: shuffling is done on a copy, and
    a private random generator is used so NumPy's global random state is not
    reseeded.  Fold sizes differ by at most one (the first ``n_samples % k``
    folds receive one extra sample), so every sample appears in exactly one
    test set.

    Args:
        data: 2D array-like; each row is a sample, each column a feature.
        k: Number of folds, with ``1 <= k <= n_samples``.
        seed: Seed for the shuffle; the same seed yields the same folds.

    Returns:
        A list of ``k`` tuples ``(train, test)`` of nested Python lists, where
        ``test`` is the current fold of the shuffled data and ``train`` is
        everything else.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples.
    """
    # Copy first: shuffling in place would silently reorder the caller's array.
    shuffled = np.array(data, copy=True)
    n_samples = shuffled.shape[0]
    if k < 1 or k > n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # A dedicated RandomState draws the same permutation that
    # np.random.seed(seed) + np.random.shuffle would, without touching the
    # process-wide generator.
    np.random.RandomState(seed).shuffle(shuffled)

    # Balanced boundaries always produce exactly k folds; stepping by
    # ceil(n / k) can run out of start indices (e.g. n=5, k=4).
    base_size, extra = divmod(n_samples, k)
    bounds = []
    start = 0
    for fold in range(k):
        stop = start + base_size + (1 if fold < extra else 0)
        bounds.append((start, stop))
        start = stop

    return [
        (
            np.concatenate((shuffled[:lo], shuffled[hi:]), axis=0).tolist(),
            shuffled[lo:hi].tolist(),
        )
        for lo, hi in bounds
    ]


In [8]:
# Demo: five two-feature samples [[1, 2], [3, 4], ..., [9, 10]] with one
# fold per sample, so each test set holds exactly one row.
data = np.arange(1, 11).reshape(5, 2)
k = 5

# Folds taken in the original sample order.
ordered_folds = cross_validation(data, k)
print(ordered_folds)

# Folds taken after a seeded shuffle of the samples.
shuffled_folds = cross_validation2(data, k, seed=42)
print(shuffled_folds)

[([[3, 4], [5, 6], [7, 8], [9, 10]], [[1, 2]]), ([[1, 2], [5, 6], [7, 8], [9, 10]], [[3, 4]]), ([[1, 2], [3, 4], [7, 8], [9, 10]], [[5, 6]]), ([[1, 2], [3, 4], [5, 6], [9, 10]], [[7, 8]]), ([[1, 2], [3, 4], [5, 6], [7, 8]], [[9, 10]])]
[([[9, 10], [5, 6], [1, 2], [7, 8]], [[3, 4]]), ([[3, 4], [5, 6], [1, 2], [7, 8]], [[9, 10]]), ([[3, 4], [9, 10], [1, 2], [7, 8]], [[5, 6]]), ([[3, 4], [9, 10], [5, 6], [7, 8]], [[1, 2]]), ([[3, 4], [9, 10], [5, 6], [1, 2]], [[7, 8]])]


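The dataset is divided into 5 parts, each used exactly once as the test set while the remaining 4 parts serve as the training set. The first output line comes from `cross_validation`, which keeps the samples in their original order; the second comes from `cross_validation2`, which shuffles the samples with `seed=42` before splitting, so its folds appear in a different order.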
