# Imports

In [1]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.tree import DecisionTreeClassifier


# Topics

## How Does KFold Split Work? (Hands-On Example)

In [10]:
# Create a toy dataset: 10 samples with 2 features each, plus binary labels.
# np.sort puts the five 0-labels before the five 1-labels.
X = np.arange(20).reshape((10, 2))
y = np.sort(np.array([0, 1] * 5))
# Show the labels so the per-fold scores below can be read against them
print(y)

# Initialize KFold: 5 folds over 10 samples -> 2 validation samples per fold.
# shuffle=True with a fixed random_state keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Retrieve indices and perform training
for train_index, val_index in kf.split(X):
    # kf.split yields integer index arrays; use them to slice features and labels
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Print the indices for illustration
    print("TRAIN indices:", train_index, "VALIDATION indices:", val_index)

    # Train a fresh model per fold; random_state pins the tree's internal
    # feature permutation so repeated runs fit the same tree
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Validate the model on the held-out fold
    score = model.score(X_val, y_val)
    print("Validation Score:", score)


[0 0 0 0 0 1 1 1 1 1]
TRAIN indices: [0 2 3 4 5 6 7 9] VALIDATION indices: [1 8]
Validation Score: 1.0
TRAIN indices: [1 2 3 4 6 7 8 9] VALIDATION indices: [0 5]
Validation Score: 0.5
TRAIN indices: [0 1 3 4 5 6 8 9] VALIDATION indices: [2 7]
Validation Score: 1.0
TRAIN indices: [0 1 2 3 5 6 7 8] VALIDATION indices: [4 9]
Validation Score: 1.0
TRAIN indices: [0 1 2 4 5 7 8 9] VALIDATION indices: [3 6]
Validation Score: 1.0


One can simply use `cross_val_score` to do the same thing. Here is how it works:

In [14]:
# Use cross_val_score to run the K-fold fit/score loop in a single call
from sklearn.model_selection import cross_val_score

# random_state pins the tree's internal randomness so the scores are reproducible
model = DecisionTreeClassifier(random_state=42)

# cross_val_score fits a clone of `model` on each training split produced by
# the `cv` splitter and returns one score per fold
scores = cross_val_score(
    model,
    X,
    y,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)
print("Validation Scores:", scores)
print("Mean Validation Score:", np.mean(scores))




Validation Scores: [1.  0.5 1.  1.  1. ]
Mean Validation Score: 0.9
