In [2]:
import pandas as pd
from pathlib import Path

In [18]:
import numpy as np

In [3]:
folder = Path("data_set")
df = pd.read_csv(folder / "titanic.csv")

In [4]:
df.head()

Unnamed: 0,PassengerId,Age,Fare,Survived
0,1,22.0,7.25,0
1,2,38.0,71.2833,1
2,3,26.0,7.925,1
3,4,35.0,53.1,1
4,5,35.0,8.05,0


In [5]:
df['Survived'].value_counts()

Survived
0    424
1    290
Name: count, dtype: int64

In [8]:
X = df.drop("Survived",axis=1)
y = df["Survived"]

### train_test_split

In [6]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
y_train.value_counts(normalize=True)*100

Survived
0    60.041841
1    39.958159
Name: proportion, dtype: float64

In [14]:
y_test.value_counts(normalize=True)*100

Survived
0    58.050847
1    41.949153
Name: proportion, dtype: float64

In [15]:
# stratify = y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [16]:
y_train.value_counts(normalize=True)*100

Survived
0    59.414226
1    40.585774
Name: proportion, dtype: float64

In [17]:
y_test.value_counts(normalize=True)*100

Survived
0    59.322034
1    40.677966
Name: proportion, dtype: float64

In [19]:
np.bincount(y_train)

array([284, 194])

In [20]:
y_train.value_counts()

Survived
0    284
1    194
Name: count, dtype: int64

### ShuffleSplit

Random Sampling, No Stratification \
No guarantee of class balance \
Test sets may overlap across splits \
Best For Regression \
Original Data: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Target:        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  (5 class-0, 5 class-1)

Split 1:
  Train: [1, 3, 5, 7, 9, 2]  → Target: [0, 0, 0, 1, 1, 0]  (3 class-0, 3 class-1)\
  Test:  [4, 6, 8, 10]       → Target: [0, 1, 1, 1]        (1 class-0, 3 class-1) \
  
Split 2:
  Train: [2, 4, 6, 8, 10, 1] → Target: [0, 0, 1, 1, 1, 0]  (3 class-0, 3 class-1)\
  Test:  [3, 5, 7, 9]        → Target: [0, 0, 1, 1]        (2 class-0, 2 class-1)\
just randomly splits with shuffle and splits

In [21]:
from sklearn.model_selection import ShuffleSplit

In [22]:
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [28]:
splits_X = []
splits_y = []
for fold, (train_idx, test_idx) in enumerate(ss.split(X)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    splits_X.append([X_train, X_test])
    splits_y.append([y_train, y_test])
    
    print(f"fold {fold}: train = {len(train_idx)}, test = {len(test_idx)} ")
    print(f"   Train class distribution {np.array(y_train.value_counts(normalize=True)*100)} ")
    print(f"   Test class distribution {np.array(y_test.value_counts(normalize=True)*100)} ")
    print()

fold 0: train = 571, test = 143 
   Train class distribution [59.01926445 40.98073555] 
   Test class distribution [60.83916084 39.16083916] 

fold 1: train = 571, test = 143 
   Train class distribution [59.71978984 40.28021016] 
   Test class distribution [58.04195804 41.95804196] 

fold 2: train = 571, test = 143 
   Train class distribution [58.8441331 41.1558669] 
   Test class distribution [61.53846154 38.46153846] 

fold 3: train = 571, test = 143 
   Train class distribution [58.8441331 41.1558669] 
   Test class distribution [61.53846154 38.46153846] 

fold 4: train = 571, test = 143 
   Train class distribution [59.89492119 40.10507881] 
   Test class distribution [57.34265734 42.65734266] 



In [31]:
# take first split
X_train, X_test = splits_X[0]
y_train, y_test = splits_y[0]
print(np.array(y_train.value_counts(normalize=True)*100))
print(np.array(y_test.value_counts(normalize=True)*100))

[59.01926445 40.98073555]
[60.83916084 39.16083916]


## StratifiedShuffleSplit

- Randomly selects samples for train/test
- **Maintains class proportions** in each split
- Each split is independent
- Test sets may overlap across splits


In [34]:
from sklearn.model_selection import StratifiedShuffleSplit

In [43]:
sss = StratifiedShuffleSplit(n_splits=5,test_size=0.3,random_state=42)

In [44]:
splits_X = []
splits_y = []
for fold, (train_idx, test_idx) in enumerate(sss.split(X,y)): # note: needs y
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    splits_X.append([X_train, X_test])
    splits_y.append([y_train, y_test])
    
    print(f"fold {fold}: train = {len(train_idx)}, test = {len(test_idx)} ")
    print(f"   Train class distribution {np.array(y_train.value_counts(normalize=True)*100)} ")
    print(f"   Test class distribution {np.array(y_test.value_counts(normalize=True)*100)} ")
    print()

fold 0: train = 499, test = 215 
   Train class distribution [59.31863727 40.68136273] 
   Test class distribution [59.53488372 40.46511628] 

fold 1: train = 499, test = 215 
   Train class distribution [59.31863727 40.68136273] 
   Test class distribution [59.53488372 40.46511628] 

fold 2: train = 499, test = 215 
   Train class distribution [59.31863727 40.68136273] 
   Test class distribution [59.53488372 40.46511628] 

fold 3: train = 499, test = 215 
   Train class distribution [59.31863727 40.68136273] 
   Test class distribution [59.53488372 40.46511628] 

fold 4: train = 499, test = 215 
   Train class distribution [59.31863727 40.68136273] 
   Test class distribution [59.53488372 40.46511628] 



In [37]:
# take first split
X_train, X_test = splits_X[0]
y_train, y_test = splits_y[0]
print(np.array(y_train.value_counts(normalize=True)*100))
print(np.array(y_test.value_counts(normalize=True)*100))

[59.36952715 40.63047285]
[59.44055944 40.55944056]


In [39]:
# if we observe above, same ratio in train and test in classes 0 and 1

## StratifiedKFold

- Splits data into K equal folds **systematically**
- Each sample appears in test set **exactly once**
- **Maintains class proportions** in each fold
- **All data is used** for both training and testing

In [40]:
from sklearn.model_selection import StratifiedKFold

In [41]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [42]:
splits_X = []
splits_y = []
for fold, (train_idx, test_idx) in enumerate(skf.split(X,y)): # note: needs y
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    splits_X.append([X_train, X_test])
    splits_y.append([y_train, y_test])
    
    print(f"fold {fold}: train = {len(train_idx)}, test = {len(test_idx)} ")
    print(f"   Train class distribution {np.array(y_train.value_counts(normalize=True)*100)} ")
    print(f"   Test class distribution {np.array(y_test.value_counts(normalize=True)*100)} ")
    print()

fold 0: train = 571, test = 143 
   Train class distribution [59.36952715 40.63047285] 
   Test class distribution [59.44055944 40.55944056] 

fold 1: train = 571, test = 143 
   Train class distribution [59.36952715 40.63047285] 
   Test class distribution [59.44055944 40.55944056] 

fold 2: train = 571, test = 143 
   Train class distribution [59.36952715 40.63047285] 
   Test class distribution [59.44055944 40.55944056] 

fold 3: train = 571, test = 143 
   Train class distribution [59.36952715 40.63047285] 
   Test class distribution [59.44055944 40.55944056] 

fold 4: train = 572, test = 142 
   Train class distribution [59.44055944 40.55944056] 
   Test class distribution [59.15492958 40.84507042] 



In [45]:
from sklearn.linear_model import LogisticRegression

In [49]:
scores = []
for fold, (train_idx, test_idx) in enumerate(skf.split(X,y)): # note: needs y
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model = LogisticRegression()
    model.fit(X_train,y_train)
    score = model.score(X_test,y_test)
    scores.append(score)
    print(f"Fold {fold+1}: Accuracy = {score:.2f}")

Fold 1: Accuracy = 0.67
Fold 2: Accuracy = 0.69
Fold 3: Accuracy = 0.62
Fold 4: Accuracy = 0.70
Fold 5: Accuracy = 0.63


In [54]:
print(f"Mean CV accuracy: {np.mean(scores):.4f}")

Mean CV accuracy: 0.6624


### KFold

generate randomly k folds without class balances

In [55]:
from sklearn.model_selection import KFold

In [56]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)

In [57]:
splits_X = []
splits_y = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X)): # note: no needs y
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    splits_X.append([X_train, X_test])
    splits_y.append([y_train, y_test])
    
    print(f"fold {fold}: train = {len(train_idx)}, test = {len(test_idx)} ")
    print(f"   Train class distribution {np.array(y_train.value_counts(normalize=True)*100)} ")
    print(f"   Test class distribution {np.array(y_test.value_counts(normalize=True)*100)} ")
    print()

fold 0: train = 571, test = 143 
   Train class distribution [59.01926445 40.98073555] 
   Test class distribution [60.83916084 39.16083916] 

fold 1: train = 571, test = 143 
   Train class distribution [60.59544658 39.40455342] 
   Test class distribution [54.54545455 45.45454545] 

fold 2: train = 571, test = 143 
   Train class distribution [59.01926445 40.98073555] 
   Test class distribution [60.83916084 39.16083916] 

fold 3: train = 571, test = 143 
   Train class distribution [58.4938704 41.5061296] 
   Test class distribution [62.93706294 37.06293706] 

fold 4: train = 572, test = 142 
   Train class distribution [59.79020979 40.20979021] 
   Test class distribution [57.74647887 42.25352113] 



**Decision Tree: Which to Use?**
```
Is it a Classification problem?
├─ YES → Are classes imbalanced?
│   ├─ YES → Want reliable estimates?
│   │   ├─ YES → Use StratifiedKFold ⭐
│   │   └─ NO  → Use StratifiedShuffleSplit (faster)
│   └─ NO → Use StratifiedKFold or regular KFold
└─ NO (Regression) → Use KFold or ShuffleSplit