

## **StratifiedShuffleSplit**

**What it is:**  
- A combination of stratification and shuffling.
- It ensures that each random split maintains the same class distribution as the full dataset.

**When to use it:**  
- When you need random splits for classification tasks while preserving class balance in each split.

**Key points:**  
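- Provides multiple randomized train/test splits with stratification.
- Each split is drawn independently, so (unlike `StratifiedKFold`) the test sets of different splits may overlap and are not guaranteed to cover every sample.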


In [8]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Toy dataset: 10 samples with 2 features each, and a binary label vector
# containing five 0s and five 1s.
X = np.arange(20).reshape(-1, 2)
y = np.array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1])

# Five independent random splits; 30% of the 10 samples (3 rows) go to the
# test side each time. random_state pins the shuffling so reruns match.
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=42,
)

# split() yields one (train indices, test indices) pair per split.
splits = sss.split(X, y)
for train_index, test_index in splits:
    print("TRAIN:", train_index, "TEST:", test_index)


TRAIN: [4 7 1 8 5 3 2] TEST: [0 6 9]
TRAIN: [3 7 0 8 5 2 9] TEST: [1 6 4]
TRAIN: [7 8 3 9 4 2 0] TEST: [1 5 6]
TRAIN: [7 9 5 1 8 0 2] TEST: [4 3 6]
TRAIN: [0 7 5 3 6 8 9] TEST: [1 2 4]


In [9]:
import seaborn as sns 
import pandas as pd
# from sklearn.model_selection import StratifiedShuffleSplit

# Load seaborn's "tips" example dataset into a DataFrame.
df = sns.load_dataset('tips')

# Discretize the continuous tip amount into six ordered buckets so it can be
# used as a stratification label. With pd.cut's default right-closed
# intervals the buckets are (0, 1], (1, 2], ..., (5, inf), labelled 1..6.
tip_bin_edges = [0., 1, 2., 3, 4., 5., np.inf]
tip_bin_labels = [1, 2, 3, 4, 5, 6]
df["tip-range"] = pd.cut(df['tip'], bins=tip_bin_edges, labels=tip_bin_labels)

# Share of rows falling in each bucket (displayed as the cell's output).
df['tip-range'].value_counts() / len(df)


tip-range
2    0.303279
3    0.278689
4    0.233607
5    0.094262
6    0.073770
1    0.016393
Name: count, dtype: float64

In [10]:
# Verify that stratified shuffle splits preserve the tip-range distribution:
# draw 5 random 70/30 splits stratified on 'tip-range' and print the class
# proportions of the train and test side of each split.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

# The feature frame only supplies the number of samples to split();
# the stratification itself is driven by the label column.
features = df.drop(columns=['tip', 'tip-range'])
strata = df['tip-range']

for fold, (train_ids, test_ids) in enumerate(sss.split(features, strata)):
    print(f"################### Fold {fold + 1} ###################")

    # split() returns *positional* row indices, so select with .iloc.
    # Label-based .loc only happens to work while df has a default
    # RangeIndex and would pick wrong rows (or raise KeyError) otherwise.
    train_distribution = strata.iloc[train_ids].value_counts(normalize=True)
    test_distribution = strata.iloc[test_ids].value_counts(normalize=True)

    print("Train Distribution:\n", train_distribution)
    print("Test Distribution:\n", test_distribution)

    print("############################################################\n")


################### Fold 1 ###################
Train Distribution:
 tip-range
2    0.305882
3    0.276471
4    0.235294
5    0.094118
6    0.070588
1    0.017647
Name: proportion, dtype: float64
Test Distribution:
 tip-range
2    0.297297
3    0.283784
4    0.229730
5    0.094595
6    0.081081
1    0.013514
Name: proportion, dtype: float64
############################################################

################### Fold 2 ###################
Train Distribution:
 tip-range
2    0.305882
3    0.276471
4    0.235294
5    0.094118
6    0.070588
1    0.017647
Name: proportion, dtype: float64
Test Distribution:
 tip-range
2    0.297297
3    0.283784
4    0.229730
5    0.094595
6    0.081081
1    0.013514
Name: proportion, dtype: float64
############################################################

################### Fold 3 ###################
Train Distribution:
 tip-range
2    0.305882
3    0.276471
4    0.235294
5    0.094118
6    0.070588
1    0.017647
Name: proportion, dtype: float