In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Single seed reused by the split, the CV folds and the forests so that
# reruns give the same partitions and models.
RANDOM_STATE = 42

# ============================================================
# 1. DATA SPLITTING — 80/20, Test Set Locked Away
# ============================================================
# stratify=y asks for the class proportions of y to be kept in both parts.
# X and y are expected to be defined before this point (not shown here).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)
n_train, n_test = len(X_train), len(X_test)
print(f"Data Split: Train={n_train} | Test={n_test}")

# ============================================================
# 2. FORWARD FEATURE SELECTION (CV on Train only)
# Uses 5-fold stratified CV instead of a single val set
# — much more stable estimates, no data wasted
# ============================================================
# Greedy search: each step scores every not-yet-selected column together with
# the columns chosen so far, keeps the candidate with the highest mean CV
# ROC-AUC, and stops once the gain over the previous best is <= 1e-4.
# MAX_FEATURES and FEATURES are expected to be defined earlier (not shown here).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# The pipeline does not depend on the candidate, so it is built once;
# cross_val_score fits clones of it and leaves this instance unfitted.
# The scaler lives inside the pipeline so it is re-fit on each training
# fold — no leakage from the held-out fold.
# NOTE(review): both the forest and cross_val_score request n_jobs=-1, which
# may oversubscribe cores — confirm this is intended.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        max_depth=5,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

selected_idx = []
# Column count is taken from the training matrix, the only data used here.
remaining_idx = list(range(X_train.shape[1]))
best_auc = 0.0

print("\n--- Running Forward Selection (5-Fold CV) ---")

for step in range(MAX_FEATURES):
    # Every column has already been selected (MAX_FEATURES exceeds the column
    # count): stop here, otherwise max() below would raise ValueError on an
    # empty candidate list.
    if not remaining_idx:
        print("All features selected — stopping.")
        break

    step_results = []

    for i in remaining_idx:
        trial_idx = selected_idx + [i]

        # assumes X_train supports NumPy-style column indexing — TODO confirm
        scores = cross_val_score(
            pipe,
            X_train[:, trial_idx],
            y_train,
            cv=cv,
            scoring="roc_auc",
            n_jobs=-1
        )
        step_results.append((i, scores.mean()))

    # Candidate with the highest mean CV AUC for this step.
    best_i, best_step_auc = max(step_results, key=lambda x: x[1])

    # Require a gain of more than 1e-4 so noise-level changes do not keep
    # adding features.
    if best_step_auc <= best_auc + 1e-4:
        print("No further improvement — stopping.")
        break

    selected_idx.append(best_i)
    remaining_idx.remove(best_i)
    best_auc = best_step_auc
    print(f"Step {step+1}: Added '{FEATURES[best_i]}' | CV AUC: {best_auc:.4f}")

# ============================================================
# 3. FINAL MODEL — Retrain on ALL train data, evaluate on test
# ============================================================
# Fits the same scaler + forest configuration on the full training split,
# restricted to the columns chosen by forward selection, then reports
# AUC, accuracy and F1 on the held-out test split.
if not selected_idx:
    # Fail early with a clear message instead of an opaque scikit-learn error
    # about fitting on an array with zero features.
    raise ValueError(
        "Forward selection did not select any feature; cannot fit the final model."
    )

pipe_final = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        max_depth=5,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

pipe_final.fit(X_train[:, selected_idx], y_train)

# Slice the test columns once and reuse the result for both prediction calls.
X_test_selected = X_test[:, selected_idx]

# Column 1 of predict_proba is used as the positive-class score
# (presumably a binary target — verify against how y is built).
final_probs = pipe_final.predict_proba(X_test_selected)[:, 1]
final_preds = pipe_final.predict(X_test_selected)

auc = roc_auc_score(y_test, final_probs)
acc = accuracy_score(y_test, final_preds)
f1  = f1_score(y_test, final_preds)

# np.asarray lets FEATURES be a plain list as well as an array: a list cannot
# be indexed with a list of positions, while the per-step print in the
# selection loop only needs scalar indexing.
selected_names = np.asarray(FEATURES)[selected_idx].tolist()

print("\n" + "=" * 45)
print(f"SELECTED FEATURES : {selected_names}")
print(f"TEST AUC          : {auc:.4f}")
print(f"TEST ACCURACY     : {acc:.4f}")
print(f"TEST F1           : {f1:.4f}")
print("=" * 45)

NameError: name 'X' is not defined