# 📘 Cross-Validation Techniques — Full Tutorial Notebook
### Enhanced & Structured Version

## 1️⃣ Import Libraries & Load Dataset

In [1]:

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import (
    KFold, StratifiedKFold, LeaveOneOut, LeavePOut,
    cross_val_score, cross_validate, TimeSeriesSplit,
    GroupKFold, ShuffleSplit, StratifiedShuffleSplit,
    RepeatedKFold, RepeatedStratifiedKFold
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
# Suppress every Python warning for the rest of the session so the
# tutorial output stays uncluttered.
warnings.filterwarnings('ignore')

# Load iris once and expose the feature matrix / label vector under the
# short names (X, y) that the later cells rely on.
iris = load_iris()
X, y = iris.data, iris.target

# Tabular preview: one column per feature plus a trailing `target` column.
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


## 2️⃣ K-Fold Cross Validation

In [2]:

# Shuffled 5-fold splitter and a 100-tree forest, both seeded with 42 so the
# scores below are repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# One accuracy value per fold, followed by their average.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=kfold)
print("Scores:", scores)
print("Mean:", scores.mean())


Scores: [1.         0.96666667 0.93333333 0.93333333 0.96666667]
Mean: 0.9600000000000002


## 3️⃣ Stratified K-Fold (Maintains Class Balance)

In [3]:

# Seeded, shuffled 5-fold splitter of the stratified kind.
# The estimator is the `model` created in the K-Fold cell above.
stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=stratified,
)
print("Stratified K-Fold Scores:", scores)


Stratified K-Fold Scores: [0.96666667 0.96666667 0.93333333 0.96666667 0.9       ]


## 4️⃣ Leave-One-Out Cross Validation (LOOCV)

In [4]:

# Work on the first 100 rows only (presumably to keep the number of fits
# small -- TODO confirm intent).
# NOTE(review): if the iris rows are ordered by class, this slice may not
# contain every class -- verify before interpreting the score.
small_X, small_y = X[:100], y[:100]

# Rebinds `model` to a logistic regression for this cell.
model = LogisticRegression(max_iter=200)
loo = LeaveOneOut()

scores = cross_val_score(estimator=model, X=small_X, y=small_y, cv=loo)
print("Mean Accuracy:", scores.mean())


Mean Accuracy: 1.0


## 5️⃣ Leave-P-Out (LPO)

In [5]:

from math import comb

# Size of each held-out set, and the 100-row subset it is drawn from.
p = 2
X_small, y_small = X[:100], y[:100]

# Report C(len(X_small), p) up front so the reader sees how many
# combinations the splitter will enumerate.
print("Total combinations:", comb(len(X_small), p))

lpo = LeavePOut(p=p)
model = LogisticRegression(max_iter=200)

scores = cross_val_score(estimator=model, X=X_small, y=y_small, cv=lpo)
print("Mean Accuracy:", scores.mean())


Total combinations: 4950
Mean Accuracy: 1.0


## 6️⃣ Time Series Split (For Sequential Data)

In [6]:

# Splitter for sequential data with 5 splits.
ts = TimeSeriesSplit(n_splits=5)

# Seed the forest. This `model` object is reused by the Shuffle Split,
# Stratified Shuffle Split, Repeated K-Fold and Repeated Stratified K-Fold
# cells further down; without a fixed random_state none of their scores can
# be reproduced on a fresh "Restart & Run All". Same settings as the forest
# in the K-Fold cell.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# cross_val_score returns the *negated* MSE for this scorer, hence the sign
# flip when printing.
# NOTE(review): the target here is a class label, so MSE treats label ids as
# numbers; 'accuracy' would be consistent with the other cells -- confirm
# which metric is intended.
ts_scores = cross_val_score(model, X, y, cv=ts, scoring='neg_mean_squared_error')
print("MSE:", -ts_scores.mean())


MSE: 0.44000000000000006


## 7️⃣ Group K-Fold

In [7]:

# Synthetic dataset: 150 samples, 10 features (5 informative), fixed seed.
Xg, yg = make_classification(n_samples=150, n_features=10,
                             n_informative=5, random_state=42)

# Group ids 0..29, each repeated 5 times in a row -> 150 labels, one per sample.
groups = np.arange(30).repeat(5)

# Show how many samples land in the train / test side of every fold.
gkf = GroupKFold(n_splits=5)
for i, (train, test) in enumerate(gkf.split(X=Xg, y=yg, groups=groups)):
    print(f"Fold {i}: Train={len(train)}, Test={len(test)}")


Fold 0: Train=120, Test=30
Fold 1: Train=120, Test=30
Fold 2: Train=120, Test=30
Fold 3: Train=120, Test=30
Fold 4: Train=120, Test=30


## 8️⃣ Shuffle Split

In [8]:

# 10 seeded random splits, each holding out 20% of the rows for testing.
# The estimator is whatever `model` was bound to by an earlier cell.
shuffle = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, cv=shuffle)
print("Shuffle Split Mean:", scores.mean())


Shuffle Split Mean: 0.9566666666666667


## 9️⃣ Stratified Shuffle Split

In [9]:

# Stratified counterpart of the shuffle splitter: 10 seeded splits with a
# 20% test share. Reuses the `model` bound by an earlier cell.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, cv=sss)
print("Stratified Shuffle Split Mean:", scores.mean())


Stratified Shuffle Split Mean: 0.9466666666666667


## 🔟 Repeated K-Fold

In [10]:

# 5 folds repeated 10 times (seeded); the printed value averages all of the
# resulting scores. Reuses the `model` bound by an earlier cell.
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, cv=rkf)
print("Repeated K-Fold Mean:", scores.mean())


Repeated K-Fold Mean: 0.9540000000000001


## 1️⃣1️⃣ Repeated Stratified K-Fold

In [11]:

# Stratified 5-fold repeated 10 times (seeded); the printed value averages
# all of the resulting scores. Reuses the `model` bound by an earlier cell.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, cv=rskf)
print("Repeated Stratified K-Fold Mean:", scores.mean())


Repeated Stratified K-Fold Mean: 0.9493333333333334
