
## **StratifiedKFold**

**What it is:**  
- Similar to KFold but ensures that each fold maintains roughly the same percentage of samples for each target class (i.e., preserves class distribution).

**When to use it:**  
- When dealing with classification problems, especially if classes are imbalanced.

**Key points:**  
- Only applicable when `y` holds discrete class labels; a continuous target has to be binned into categories first (as the `tips` example below does with `pd.cut`).


In [13]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Toy binary-classification data: 7 samples with 5 features each,
# labelled with 4 negatives (0) and 3 positives (1).
X = np.arange(35).reshape(-1, 5)
y = np.array([0, 1, 0, 0, 0, 1, 1])
print(X, y, sep="\n")

# Shuffled 3-way stratified splitter; the fixed seed keeps the folds repeatable.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Show which row positions land in the train and test side of every fold.
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)


[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]]
[0 1 0 0 0 1 1]
TRAIN: [2 4 5 6] TEST: [0 1 3]
TRAIN: [0 1 2 3 6] TEST: [4 5]
TRAIN: [0 1 3 4 5] TEST: [2 6]


In [14]:
import seaborn as sns
import pandas as pd
# `np` and `StratifiedKFold` are imported by the earlier StratifiedKFold demo cell.

df = sns.load_dataset('tips')

# Bucket the continuous `tip` column into six ordered bands (the last one is
# open-ended) so it can act as a class label for stratification.
df["tip-range"] = pd.cut(
    df['tip'],
    bins=[0., 1., 2., 3., 4., 5., np.inf],
    labels=[1, 2, 3, 4, 5, 6],
)

# Share of rows per band. normalize=True returns the fractions directly and
# names the result "proportion"; dividing the raw counts by len(df) left the
# Series labelled "count" even though it held fractions.
df['tip-range'].value_counts(normalize=True)

tip-range
2    0.303279
3    0.278689
4    0.233607
5    0.094262
6    0.073770
1    0.016393
Name: count, dtype: float64

In [18]:
# Stratify on the binned tip and print, for every fold, the class mix of the
# train and test parts so they can be compared with the overall distribution.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
for fold, (train_ids, test_ids) in enumerate(
    skf.split(df.drop(columns=['tip', 'tip-range']), df['tip-range'])
):
    print(f"################### Fold {fold + 1} ###################")

    # split() yields positional row numbers, so select with .iloc. Using .loc
    # would treat them as index labels, which only happens to match while the
    # frame still has its default 0..n-1 index.
    train_distribution = df['tip-range'].iloc[train_ids].value_counts(normalize=True)
    test_distribution = df['tip-range'].iloc[test_ids].value_counts(normalize=True)

    print("Train Distribution:\n", train_distribution)
    print("Test Distribution:\n", test_distribution)

    print("############################################################\n")


################### Fold 1 ###################
Train Distribution:
 tip-range
2    0.302469
3    0.283951
4    0.234568
5    0.092593
6    0.074074
1    0.012346
Name: proportion, dtype: float64
Test Distribution:
 tip-range
2    0.304878
3    0.268293
4    0.231707
5    0.097561
6    0.073171
1    0.024390
Name: proportion, dtype: float64
############################################################

################### Fold 2 ###################
Train Distribution:
 tip-range
2    0.300613
3    0.276074
4    0.233129
5    0.098160
6    0.073620
1    0.018405
Name: proportion, dtype: float64
Test Distribution:
 tip-range
2    0.308642
3    0.283951
4    0.234568
5    0.086420
6    0.074074
1    0.012346
Name: proportion, dtype: float64
############################################################

################### Fold 3 ###################
Train Distribution:
 tip-range
2    0.306748
3    0.276074
4    0.233129
5    0.092025
6    0.073620
1    0.018405
Name: proportion, dtype: float