In [None]:
import sys
from pathlib import Path

# Locate the project root: the closest directory, starting at the current
# working directory and walking upward, that contains a "src" directory.
cwd = Path.cwd()
root = next(
    # is_dir() rather than exists(): a plain file named "src" is not a package
    # directory and must not be mistaken for the project marker.
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "src").is_dir()),
    None,
)

if root is None:
    raise RuntimeError(f"Could not find 'src' directory starting from {cwd}")

# Add project root to sys.path (NOT src itself) so that `src` is importable
# as a top-level package from this notebook.
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

print("CWD:", cwd)
print("PROJECT_ROOT:", root)
print("Has src?:", (root / "src").is_dir())
# The root is inserted at index 0, so the head of sys.path is the part worth
# inspecting; the slice below prints the first five entries.
print("First sys.path entries:", sys.path[:5])

# Seed reused by the stochastic steps of this notebook.
RANDOM_STATE = 42


In [None]:
import pandas as pd
# Label column; everything else in the raw frame is kept as a feature.
TARGET = "Survived"

# The CSV path is resolved against the project root, not the kernel's CWD.
df_raw = pd.read_csv(root.joinpath('data/raw/Titanic-Dataset.csv'))

# Separate the label series from the feature frame (df_raw itself is untouched).
y = df_raw[TARGET]
X = df_raw.drop(TARGET, axis="columns")

In [None]:
# Split the dataset into training and testing sets with stratification
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; stratify=y is passed so the split is stratified on the
# label, and the shared seed makes it repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Feature groups used when building the preprocessing step.
# Note that Pclass is listed with the numeric columns here.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Pclass",
]
cat_cols = [
    "Sex",
    "Embarked",
]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from src.preprocessing import build_preprocessing_hgb_native

# The project helper returns the preprocessing step plus cat_idx, which is
# forwarded to the classifier's `categorical_features` below.
# NOTE(review): cat_idx presumably identifies the categorical columns in the
# preprocessed output -- confirm against src.preprocessing.
preprocessing, cat_idx = build_preprocessing_hgb_native(num_cols, cat_cols)

# Final gradient-boosting classifier with the selected hyperparameters.
hgb_final = HistGradientBoostingClassifier(
    random_state=RANDOM_STATE,
    categorical_features=cat_idx,
    learning_rate=0.05,
    max_iter=150,
    max_leaf_nodes=30,
    min_samples_leaf=21,
)

# Chain preprocessing and model into a single estimator so both are fitted
# together on whatever data the pipeline is given.
pipe_final = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", hgb_final),
    ]
)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

# Shuffled, seeded 5-fold stratified splitter used for the training-set CV.
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=5)

# Cross-validated predict_proba output for the training rows; [:, 1] keeps
# only the second probability column.
# NOTE(review): presumably that column is P(Survived == 1) -- confirm the
# label encoding.
p_final = cross_val_predict(
    pipe_final,
    X_train,
    y_train,
    method="predict_proba",
    cv=cv,
)[:, 1]

In [None]:
from sklearn.metrics import brier_score_loss, average_precision_score, roc_auc_score

# Score the cross-validated probabilities against the training labels.
# Both metrics are rounded to four decimals before being reported.

# ROC-AUC
roc_auc = round(roc_auc_score(y_train, p_final), 4)

# PR-AUC (average precision)
pr_auc = round(average_precision_score(y_train, p_final), 4)

print(f"PR-AUC (AP): {pr_auc}")
print(f"ROC-AUC: {roc_auc}")