
## **KFold**

**What it is:**  
- Splits the dataset into *k* consecutive folds (without shuffling by default).  
- Each fold is used exactly once as the validation set, while the remaining *k–1* folds form the training set.

**When to use it:**  
- For general cross-validation when no special grouping or stratification is needed.

**Key points:**  
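- The splits are deterministic by default (`shuffle=False`). With `shuffle=True` the fold assignment is randomized; pass a fixed `random_state` to make the shuffled splits reproducible.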
- Does not preserve the distribution of labels (i.e., not stratified).


In [3]:
from sklearn.model_selection import KFold
import numpy as np

# Toy dataset: 7 samples with 5 features each, plus one integer label per sample.
X = np.arange(35).reshape(7, -1)
y = np.arange(X.shape[0])

# Show the feature matrix followed by the labels, one per print line.
print(X, y, sep="\n")

# Shuffled 5-fold split; the fixed seed keeps the fold assignment reproducible.
# 7 samples do not divide evenly into 5 folds, so the test folds differ in size.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)


[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]]
[0 1 2 3 4 5 6]
TRAIN: [2 3 4 5 6] TEST: [0 1]
TRAIN: [0 1 3 4 6] TEST: [2 5]
TRAIN: [0 1 2 3 5 6] TEST: [4]
TRAIN: [0 1 2 4 5 6] TEST: [3]
TRAIN: [0 1 2 3 4 5] TEST: [6]


In [4]:
import seaborn as sns 
import pandas as pd
# from sklearn.model_selection import KFold

# Bin edges for the tip amount and the integer label assigned to each bin.
# pd.cut closes intervals on the right by default: (0, 1], (1, 2], ..., (5, inf).
tip_bin_edges = [0., 1, 2., 3, 4., 5., np.inf]
tip_bin_labels = [1, 2, 3, 4, 5, 6]

df = sns.load_dataset('tips')
df["tip-range"] = pd.cut(df['tip'], bins=tip_bin_edges, labels=tip_bin_labels)

# Share of all rows that falls into each tip bin (reference distribution for the folds).
df['tip-range'].value_counts().div(len(df))


tip-range
2    0.303279
3    0.278689
4    0.233607
5    0.094262
6    0.073770
1    0.016393
Name: count, dtype: float64

In [5]:

# Compare the 'tip-range' class balance of the train and test parts of every fold.
# Plain KFold ignores the labels, so the proportions drift from fold to fold.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
for fold, (train_ids, test_ids) in enumerate(kf.split(df)):
    print(f"################### Fold {fold + 1} ###################")

    # kf.split yields *positional* row indices, so select with .iloc.
    # Label-based .loc only gives the same rows while df keeps its default
    # 0..n-1 index and would pick wrong rows (or raise) after filtering/sorting.
    train_distribution = df['tip-range'].iloc[train_ids].value_counts(normalize=True)
    test_distribution = df['tip-range'].iloc[test_ids].value_counts(normalize=True)

    print("Train Distribution:\n", train_distribution)
    print("Test Distribution:\n", test_distribution)

    print("############################################################\n")


################### Fold 1 ###################
Train Distribution:
 tip-range
2    0.302469
3    0.259259
4    0.246914
5    0.092593
6    0.086420
1    0.012346
Name: proportion, dtype: float64
Test Distribution:
 tip-range
3    0.317073
2    0.304878
4    0.207317
5    0.097561
6    0.048780
1    0.024390
Name: proportion, dtype: float64
############################################################

################### Fold 2 ###################
Train Distribution:
 tip-range
2    0.319018
3    0.276074
4    0.233129
5    0.085890
6    0.061350
1    0.024540
Name: proportion, dtype: float64
Test Distribution:
 tip-range
3    0.283951
2    0.271605
4    0.234568
5    0.111111
6    0.098765
1    0.000000
Name: proportion, dtype: float64
############################################################

################### Fold 3 ###################
Train Distribution:
 tip-range
3    0.300613
2    0.288344
4    0.220859
5    0.104294
6    0.073620
1    0.012270
Name: proportion, dtype: float