In [None]:
import sys
from pathlib import Path

# Find project root = nearest directory, walking upwards from the current
# working directory, that contains "src"
cwd = Path.cwd()
root = cwd
# Climb one level at a time; a path equal to its own parent is the filesystem
# root, so the loop stops there even when nothing was found.
while root != root.parent and not (root / "src").exists():
    root = root.parent

# The loop can also end at the filesystem root without a match, so re-check.
if not (root / "src").exists():
    raise RuntimeError(f"Could not find 'src' directory starting from {cwd}")

# Add project root to sys.path (NOT src itself) so `from src....` imports resolve
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

# Diagnostics for confirming the notebook resolved the expected locations.
print("CWD:", cwd)
print("PROJECT_ROOT:", root)
print("Has src?:", (root / "src").exists())
# A newly added root is inserted at index 0, so the head of sys.path is the
# part worth showing; the label now matches the slice that is printed.
print("First sys.path entries:", sys.path[:5])

# Seed reused by the randomized steps of this notebook (split, model, CV folds).
RANDOM_STATE = 42


In [None]:
import pandas as pd
# Name of the label column.
TARGET = "Survived"

# Read the raw Titanic CSV, located relative to the project root.
raw_csv_path = root / "data/raw/Titanic-Dataset.csv"
df_raw = pd.read_csv(raw_csv_path)

# Separate the label (y) from every other column (X); df_raw itself is left untouched.
y = df_raw[TARGET]
X = df_raw.drop(columns=[TARGET])

In [None]:
# Split the dataset into training and testing sets with stratification
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, average_precision_score, roc_auc_score


# Hold out 20% of the rows as the test set. Stratifying on y and fixing the
# seed are the two options passed to keep the split class-balanced and repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)


In [None]:
# Feature columns handed to the preprocessing builder, split into the
# numerical group and the categorical group.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Pclass",
]
cat_cols = [
    "Sex",
    "Embarked",
]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from src.preprocessing import build_preprocessing_hgb_native_with_family


# The project helper returns the preprocessing step together with cat_idx,
# which is forwarded to the model below as its categorical_features.
preprocessing, cat_idx = build_preprocessing_hgb_native_with_family(num_cols, cat_cols)

# Fixed hyperparameters of the final HistGradientBoosting model.
hgb_params = {
    "learning_rate": 0.05,
    "max_iter": 150,
    "max_leaf_nodes": 30,
    "min_samples_leaf": 21,
}
hgb_final = HistGradientBoostingClassifier(
    **hgb_params,
    categorical_features=cat_idx,
    random_state=RANDOM_STATE,
)

# Preprocessing followed by the classifier, bundled as a single estimator.
pipe_final = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", hgb_final),
    ]
)


In [None]:
# Shuffled 5-fold stratified splitter, seeded so the folds are repeatable.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Out-of-fold class probabilities of the uncalibrated pipeline on the training
# data, computed on all available cores.
oof_proba_baseline = cross_val_predict(
    estimator=pipe_final,
    X=X_train,
    y=y_train,
    cv=cv,
    n_jobs=-1,
    method="predict_proba",
)


In [None]:
# Keep column 1 of the out-of-fold probabilities (used as the positive class).
positive_class_proba_baseline = oof_proba_baseline[:, 1]

# Brier score, PR-AUC (average precision) and ROC-AUC of the uncalibrated
# model, evaluated in that order against y_train and rounded to 4 decimals.
brier_score, pr_auc, roc_auc = (
    round(score_fn(y_train, positive_class_proba_baseline), 4)
    for score_fn in (brier_score_loss, average_precision_score, roc_auc_score)
)

print(f"Brier Score (Uncalibrated): {brier_score}")
print(f"PR-AUC (AP): {pr_auc}")
print(f"ROC-AUC: {roc_auc}")


In [None]:
# Platt scaling: wrap the pipeline in a sigmoid calibrator that is fitted
# with its own 5-fold internal cross-validation.
calibrated_pipe_platt = CalibratedClassifierCV(
    pipe_final,
    cv=5,
    method="sigmoid",
)


In [None]:
# Out-of-fold class probabilities of the Platt-calibrated pipeline, using the
# same outer splitter (cv) as the baseline run.
oof_proba_platt = cross_val_predict(
    estimator=calibrated_pipe_platt,
    X=X_train,
    y=y_train,
    cv=cv,
    n_jobs=-1,
    method="predict_proba",
)


In [None]:
# Keep column 1 of the out-of-fold probabilities (used as the positive class).
positive_class_proba_platt = oof_proba_platt[:, 1]

# Brier score, PR-AUC (average precision) and ROC-AUC of the Platt-calibrated
# model, evaluated in that order against y_train and rounded to 4 decimals.
# NOTE: this reassigns brier_score / pr_auc / roc_auc from the baseline cell.
brier_score, pr_auc, roc_auc = (
    round(score_fn(y_train, positive_class_proba_platt), 4)
    for score_fn in (brier_score_loss, average_precision_score, roc_auc_score)
)

print(f"Brier Score (Platt): {brier_score}")
print(f"PR-AUC (AP): {pr_auc}")
print(f"ROC-AUC: {roc_auc}")


In [None]:
# Isotonic calibration: wrap the pipeline in an isotonic-regression calibrator
# that is fitted with its own 5-fold internal cross-validation.
calibrated_pipe_isotonic = CalibratedClassifierCV(
    pipe_final,
    cv=5,
    method="isotonic",
)

# Out-of-fold class probabilities of the isotonic-calibrated pipeline, using
# the same outer splitter (cv) as the other runs.
oof_proba_isotonic = cross_val_predict(
    estimator=calibrated_pipe_isotonic,
    X=X_train,
    y=y_train,
    cv=cv,
    n_jobs=-1,
    method="predict_proba",
)


In [None]:
# Keep column 1 of the out-of-fold probabilities (used as the positive class).
positive_class_proba_isotonic = oof_proba_isotonic[:, 1]

# NOTE: brier_score / pr_auc / roc_auc are reassigned here, replacing the
# values left by the earlier metrics cells.

# Brier Score (Isotonic calibrated)
brier_score = round(brier_score_loss(y_train, positive_class_proba_isotonic), 4)

# PR-AUC (AP)
pr_auc = round(average_precision_score(y_train, positive_class_proba_isotonic), 4)

# ROC-AUC
roc_auc = round(roc_auc_score(y_train, positive_class_proba_isotonic), 4)

# Label the Brier line with the calibration method, matching the
# "(Uncalibrated)" and "(Platt)" outputs so the three runs can be told apart.
print(f"Brier Score (Isotonic): {brier_score}")
print(f"PR-AUC (AP): {pr_auc}")
print(f"ROC-AUC: {roc_auc}")
