In [None]:
import sys
from pathlib import Path

# Locate the project root: walk upward from the current working directory
# until a directory containing "src" is found (or the filesystem root is hit).
cwd = Path.cwd()
root = cwd
while root != root.parent and not (root / "src").exists():
    root = root.parent

# The loop also stops at the filesystem root, so re-check before trusting it.
if not (root / "src").exists():
    raise RuntimeError(f"Could not find 'src' directory starting from {cwd}")

# Add project root to sys.path (NOT src itself) so `src.<module>` imports resolve.
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

print("CWD:", cwd)
print("PROJECT_ROOT:", root)
print("Has src?:", (root / "src").exists())
# The root is inserted at position 0, so the head of sys.path is what confirms
# the insertion; the label now matches the slice that is actually printed.
print("First sys.path entries:", sys.path[:5])

# Seed passed as random_state to the split, the model and the CV splitters below.
RANDOM_STATE = 42


In [None]:
import pandas as pd
# Name of the label column to predict.
TARGET = "Survived"

# Read the raw Titanic table (the path is relative to the working directory).
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')

# Separate the label from the features; df_raw itself is left untouched.
y = df_raw[TARGET]
X = df_raw.drop(columns=TARGET)

In [None]:
# Split the dataset into training and testing sets with stratification
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # hold out 20% of the rows for testing
    random_state=RANDOM_STATE,  # reproducible split
    stratify=y,  # keep the class ratio of y in both parts
)

In [None]:
# Feature groups: columns treated as numeric vs. categorical by the preprocessing.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]


In [None]:
from src.preprocessing import build_preprocessing_hgb_native

# Build the preprocessing and run it once on the training features.
# The builder returns the transformer together with cat_idx, which is later
# passed to the model as `categorical_features`.
preprocessing, cat_idx = build_preprocessing_hgb_native(
    num_cols=num_cols,
    cat_cols=cat_cols,
)
Xt = preprocessing.fit_transform(X_train)


In [None]:
from scipy.stats import randint, uniform  # Use scipy.stats for distributions
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

# Rebuild the preprocessing for the final pipeline (the instance created in the
# earlier cell has already had fit_transform called on it).
preprocessing, cat_idx = build_preprocessing_hgb_native(
    num_cols=num_cols,
    cat_cols=cat_cols,
)

# Final HGB model with fixed hyperparameters; cat_idx is handed to the model as
# its categorical_features.
hgb_final = HistGradientBoostingClassifier(
    categorical_features=cat_idx,
    learning_rate=0.05,
    max_iter=150,
    max_leaf_nodes=30,
    min_samples_leaf=21,
    random_state=RANDOM_STATE,
)

# Chain preprocessing and model so both are fitted together on the same data.
pipe_final = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", hgb_final),
    ]
)

# Fit on the training split only.
pipe_final.fit(X_train, y_train)

Base (RandomizedSearch best):
    learning_rate ≈ 0.0678,
    max_iter = 121,
    max_leaf_nodes = 39,
    min_samples_leaf = 21,
    CV AP ≈ 0.855

Local grid around best (for GridSearch):
    learning_rate: [0.05, 0.07, 0.09],
    max_leaf_nodes: [30, 39, 45],
    min_samples_leaf: [15, 21, 27],
    max_iter: fixed to 150

Other HGB params: left at their defaults, or set to the best values from RandomizedSearch

In [None]:
from pathlib import Path
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# RANDOM_STATE is reused from the setup cell so the seed lives in one place.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# All report artifacts of this cell live in one directory.
reports_dir = Path("../reports")
threshold_path = reports_dir / "threshold_hgbn.npy"

# Fail fast: the saved threshold is an input of this cell, so check for it
# before spending time on cross-validation. (Creating the directory here would
# not help -- a read needs the file itself to exist.)
if not threshold_path.is_file():
    raise FileNotFoundError(
        f"Expected a saved decision threshold at {threshold_path}"
    )

# --- 1.1. Compute OOF probabilities for the leader model ---

oof_proba = cross_val_predict(
    pipe_final,
    X_train,
    y_train,
    cv=cv,
    method="predict_proba",
    n_jobs=-1,
)[:, 1]  # take probability for class 1

# --- 1.2. Load threshold  ---
# ravel() accepts both a 0-d scalar file and a 1-element array file; plain
# `[0]` indexing raises IndexError on the 0-d case.
threshold = float(np.ravel(np.load(threshold_path))[0])

# --- 1.3. Convert probabilities to class predictions using the chosen threshold ---
y_pred = (oof_proba >= threshold).astype(int)

# --- 1.4. Build DataFrame with raw features + predictions ---
# reset_index(drop=False) returns a new frame and keeps the original X_train
# index as an "index" column, so each row can be traced back to the raw data.
oof_df = X_train.reset_index(drop=False)

oof_df["y_true"] = np.asarray(y_train)
oof_df["p_pred"] = oof_proba
oof_df["y_pred"] = y_pred

# --- 1.5. Save to CSV for manual inspection ---
# The directory is only created here, right before something is written to it.
reports_dir.mkdir(parents=True, exist_ok=True)

oof_path = reports_dir / "train_oof_leader.csv"
oof_df.to_csv(oof_path, index=False)

print(f"Saved OOF predictions to: {oof_path}")
print(oof_df.head())


In [None]:
# --- 2.1. Partition the misclassified OOF rows ---
actual = oof_df["y_true"]
predicted = oof_df["y_pred"]

# False Positives: the model said 1, the label is 0
fp_df = oof_df[(actual == 0) & (predicted == 1)].copy()

# False Negatives: the model said 0, the label is 1
fn_df = oof_df[(actual == 1) & (predicted == 0)].copy()

print(f"FP count: {len(fp_df)}, FN count: {len(fn_df)}")

# --- 2.2. Keep the five most confidently wrong rows of each kind ---

# FP: the highest predicted probability is the most confident mistake
fp_top5 = (
    fp_df.sort_values("p_pred", ascending=False)
    .head(5)
    .assign(error_type="FP")
)

# FN: the lowest predicted probability is the most confident mistake
fn_top5 = (
    fn_df.sort_values("p_pred", ascending=True)
    .head(5)
    .assign(error_type="FN")
)

# --- 2.3. Stack both groups and write them out ---
worst_cases = pd.concat([fp_top5, fn_top5], ignore_index=True)

worst_path = reports_dir / "worst_cases_top10.csv"
worst_cases.to_csv(worst_path, index=False)

print(f"Saved 10 worst cases to: {worst_path}")

# "index" is the original X_train index kept by reset_index(drop=False).
summary_cols = ["index", "y_true", "p_pred", "y_pred", "error_type"]
print(worst_cases[summary_cols])


In [None]:
from src.preprocessing import add_family_features


# Run add_family_features on the first ten training rows (on a copy, so
# X_train is not touched) and eyeball the inputs next to the family columns.
X_sample = X_train.iloc[:10].copy()
X_with_family = add_family_features(X_sample)

family_view_cols = ["Age", "SibSp", "Parch", "family_size", "is_alone", "is_child"]
print(X_with_family[family_view_cols])


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate

from src.preprocessing import build_preprocessing_hgb_native_with_family

# One seeded 5-fold stratified splitter, passed to both cross_validate calls.
RANDOM_STATE = 42
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=5)

# NOTE: these rebind the notebook-level column lists. In this comparison Pclass
# is numeric, whereas the earlier column-definition cell lists it as categorical.
num_cols = ["Age", "SibSp", "Parch", "Fare", "Pclass"]
cat_cols = ["Sex", "Embarked"]

# 1) Baseline preprocessing (no family features)
preproc_old, cat_idx_old = build_preprocessing_hgb_native(num_cols=num_cols, cat_cols=cat_cols)

# 2) Candidate preprocessing (with family features)
preproc_new, cat_idx_new = build_preprocessing_hgb_native_with_family(num_cols=num_cols, cat_cols=cat_cols)

# Hyperparameters shared by both models, so only the preprocessing differs.
# Best params from your tuning step (replace with your values).
# NOTE(review): these differ from the values used for hgb_final earlier in the
# notebook (max_iter=150, max_leaf_nodes=30, min_samples_leaf=21) -- confirm
# which set is intended.
best_params = dict(
    learning_rate=0.05,
    max_depth=None,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_iter=300,
    random_state=RANDOM_STATE,
)

# 3) Two pipelines, OLD and NEW: same hyperparameters, each model receiving the
#    categorical indices returned by its own preprocessing builder.
hgb_old = HistGradientBoostingClassifier(categorical_features=cat_idx_old, **best_params)
hgb_new = HistGradientBoostingClassifier(categorical_features=cat_idx_new, **best_params)

pipe_old = Pipeline(steps=[("preprocess", preproc_old), ("model", hgb_old)])
pipe_new = Pipeline(steps=[("preprocess", preproc_new), ("model", hgb_new)])

# Metric name -> scikit-learn scorer string; cross_validate reports each one
# under the key "test_<name>".
scoring = dict(
    roc_auc="roc_auc",
    pr_auc="average_precision",
)

# Both pipelines are scored with exactly the same settings, so any difference
# in the results comes from the pipelines themselves.
shared_cv_args = dict(
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False,
)

cv_old = cross_validate(pipe_old, X_train, y_train, **shared_cv_args)
cv_new = cross_validate(pipe_new, X_train, y_train, **shared_cv_args)

def summarize(name, res):
    """Print the mean and standard deviation of the ROC-AUC and PR-AUC scores.

    Args:
        name: Label printed as the header line of the report.
        res: Mapping with "test_roc_auc" and "test_pr_auc" entries whose values
            provide .mean() and .std() (e.g. the arrays in a cross_validate
            result).

    Prints a header, one line per metric formatted to three decimals, and a
    trailing blank line. Returns None.
    """
    roc_scores = res["test_roc_auc"]
    pr_scores = res["test_pr_auc"]
    report = (
        f"{name}:",
        f"  ROC-AUC: {roc_scores.mean():.3f} ± {roc_scores.std():.3f}",
        f"  PR-AUC : {pr_scores.mean():.3f} ± {pr_scores.std():.3f}",
        "",  # the empty entry produces the blank separator line
    )
    print(*report, sep="\n")

# Report the baseline first, then the candidate, in the same format.
for label, scores in (
    ("HGB OLD (no family)", cv_old),
    ("HGB NEW (with family)", cv_new),
):
    summarize(label, scores)


In [None]:
from sklearn.metrics import confusion_matrix


# y_pred holds the thresholded out-of-fold predictions for X_train built in the
# OOF cell, so this matrix describes cross-validated training rows, not the
# held-out test set.
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print("Confusion Matrix:\n", cm)