In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import (
    KFold, StratifiedKFold, GroupKFold, 
    LeaveOneOut, LeavePOut, ShuffleSplit, 
    StratifiedShuffleSplit, cross_val_score
)

In [2]:
# Load dataset
# The CSV location can be overridden through the TITANIC_CSV environment
# variable so the notebook also runs on machines where the author's Downloads
# folder does not exist. The original absolute path is kept as the fallback,
# so behaviour is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    "TITANIC_CSV", r"C:\Users\KIIT\Downloads\cleaned_titanic_data.csv"
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Split the frame into the label column and the remaining feature columns
y = df['Survived']
X = df.drop(columns=['Survived'])

In [4]:
# Replace every object- or category-typed feature column with integer codes.
# Values are cast to str first, and a fresh LabelEncoder is fitted per column.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
for name in categorical_cols:
    encoder = LabelEncoder()
    as_text = X[name].astype(str)
    X[name] = encoder.fit_transform(as_text)

In [5]:
# Estimator passed to every cross_val_score call in the cells below.
# The random_state is pinned so repeated runs build the same forest.
model = RandomForestClassifier(
    random_state=42,
)

In [6]:
# 1. K-Fold Cross Validation
# Five folds, shuffled with a fixed seed; one score is produced per fold.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
kf_scores = cross_val_score(estimator=model, X=X, y=y, cv=kf)
print("K-Fold CV scores:", kf_scores)

K-Fold CV scores: [0.82122905 0.81460674 0.81460674 0.81460674 0.83146067]


In [7]:
# 2. Stratified K-Fold Cross Validation
# Same five shuffled, seeded folds as above, but using the stratified splitter.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
skf_scores = cross_val_score(estimator=model, X=X, y=y, cv=skf)
print("Stratified K-Fold CV scores:", skf_scores)

Stratified K-Fold CV scores: [0.83240223 0.82022472 0.79775281 0.80898876 0.84831461]


In [8]:
# 3. Leave-One-Out Cross Validation
# LeaveOneOut holds out a single sample per split, so the model is refit once
# for every row of X. That is only practical on small datasets, so the run is
# opt-in: set the flag below to True instead of (un)commenting code.
RUN_LEAVE_ONE_OUT = False

loo = LeaveOneOut()
loo_n_splits = loo.get_n_splits(X)  # number of model fits a full run requires

if RUN_LEAVE_ONE_OUT:
    loo_scores = cross_val_score(model, X, y, cv=loo)
    # Each split scores exactly one sample, so with the classifier's default
    # accuracy scoring every entry is 0.0 or 1.0. The mean is the useful
    # summary; printing one value per row would flood the output.
    print(f"Leave-One-Out CV mean accuracy over {loo_n_splits} splits:", loo_scores.mean())
else:
    loo_scores = None
    print(
        f"Leave-One-Out CV not performed ({loo_n_splits} model fits required; "
        "set RUN_LEAVE_ONE_OUT = True to run)."
    )

In [9]:
# 4. Leave-P-Out Cross Validation (p=2 for demonstration)
# LeavePOut enumerates every possible test set of size p, i.e. C(n, p) splits.
# With p=2 that is n * (n - 1) / 2 model fits, which is only feasible for very
# small datasets, so the run is opt-in: set the flag below to True instead of
# (un)commenting code.
RUN_LEAVE_P_OUT = False

lpo = LeavePOut(p=2)
lpo_n_splits = lpo.get_n_splits(X)  # number of model fits a full run requires

if RUN_LEAVE_P_OUT:
    lpo_scores = cross_val_score(model, X, y, cv=lpo)
    # One score per split is far too many values to print; report the mean.
    print(f"Leave-P-Out CV mean accuracy over {lpo_n_splits} splits:", lpo_scores.mean())
else:
    lpo_scores = None
    print(
        f"Leave-P-Out CV not performed ({lpo_n_splits} model fits required; "
        "set RUN_LEAVE_P_OUT = True to run)."
    )

In [10]:
# 5. Shuffle Split
# Five seeded random splits, each holding out 20% of the rows for testing.
ss = ShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=42,
)
ss_scores = cross_val_score(estimator=model, X=X, y=y, cv=ss)
print("Shuffle Split CV scores:", ss_scores)

Shuffle Split CV scores: [0.81005587 0.82681564 0.80446927 0.82122905 0.82122905]


In [11]:
# 6. Stratified Shuffle Split
# Five seeded random 80/20 splits drawn with the stratified splitter.
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=42,
)
sss_scores = cross_val_score(estimator=model, X=X, y=y, cv=sss)
print("Stratified Shuffle Split CV scores:", sss_scores)

Stratified Shuffle Split CV scores: [0.81564246 0.78212291 0.77094972 0.81564246 0.78212291]


In [12]:
# 7. Group K-Fold (if 'group' column exists)
# GroupKFold keeps all rows that share a group label on the same side of each
# split. It only applies when the data carries a 'group' column; otherwise
# gkf_scores is set to None and a notice is printed.
if 'group' in df.columns:
    groups = df['group']
    gkf = GroupKFold(n_splits=5)
    # X was built from df minus the target, so it still contains 'group'.
    # The group identifier is a splitting key, not a predictor, so it is left
    # out of the feature matrix. errors='ignore' keeps this safe if X no
    # longer has the column.
    X_without_group = X.drop(columns=['group'], errors='ignore')
    gkf_scores = cross_val_score(model, X_without_group, y, groups=groups, cv=gkf)
    print("Group K-Fold CV scores:", gkf_scores)
else:
    gkf_scores = None
    print("Group K-Fold CV not performed (no 'group' column).")

Group K-Fold CV not performed (no 'group' column).
