In [None]:
import sys, os
from pathlib import Path

# A notebook has no __file__, so the project root is taken to be the parent
# of the current working directory.
PROJECT_ROOT = Path.cwd().parent

# Make both the project root and its "src" folder importable
# (appended at the end of sys.path, in that order).
sys.path.extend([str(PROJECT_ROOT), str(PROJECT_ROOT / "src")])

# Single seed reused by the data split and the model below.
RANDOM_STATE = 42

print("PYTHONPATH patched:", sys.path[-2:])

In [None]:
import pandas as pd
# Name of the label column in the raw table.
TARGET = "Survived"

# Read the raw CSV (path is relative to the notebook's working directory),
# then separate the label (y) from the remaining columns (X).
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')
y = df_raw[TARGET]
X = df_raw.drop(columns=[TARGET])

In [None]:
# Split the dataset into training and testing sets with stratification
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as the test set. The split is stratified on y and
# seeded with RANDOM_STATE so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Feature columns handed to the preprocessing builder, grouped by how they
# are treated: numerical vs. categorical.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from src.preprocessing import build_preprocessing_hgb_native

# The project helper returns the preprocessing step together with `cat_idx`,
# which is forwarded to the model as `categorical_features`.
# NOTE(review): presumably `cat_idx` holds the positions of `cat_cols` in the
# preprocessed output -- confirm in src/preprocessing.py.
preprocessing, cat_idx = build_preprocessing_hgb_native(num_cols, cat_cols)

# Final classifier with fixed hyperparameters, seeded for reproducibility.
hgb_final = HistGradientBoostingClassifier(
    categorical_features=cat_idx,
    learning_rate=0.05,
    max_iter=150,
    max_leaf_nodes=30,
    min_samples_leaf=21,
    random_state=RANDOM_STATE,
)

# Chain preprocessing and model so both are fitted and applied as one estimator.
final_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", hgb_final),
    ]
)

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)

# -------------------------------------------------------------------
# 0. Paths and folders
# -------------------------------------------------------------------
# Output locations, relative to the notebook's working directory.
REPORTS_DIR = Path("../reports")
FIGURES_DIR = REPORTS_DIR / "figures"

# Create both folders up front; existing folders are left untouched.
for output_dir in (REPORTS_DIR, FIGURES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# -------------------------------------------------------------------
# 1-2. Fit the final pipeline on the training split, then score the test set
# -------------------------------------------------------------------
# `fit` returns the fitted pipeline itself, so the prediction can be chained.
# Column 1 of predict_proba is kept as the positive-class probability
# (assumes the labels are 0/1, so the second class is Survived == 1).
y_proba_test = final_pipeline.fit(X_train, y_train).predict_proba(X_test)[:, 1]


# -------------------------------------------------------------------
# 3. Load final threshold and get class predictions
# -------------------------------------------------------------------
threshold_path = REPORTS_DIR / "threshold_final.npy"

# Flatten before indexing so the load works whether the threshold was saved as
# a 0-d array (np.save(path, value)) or as a one-element array
# (np.save(path, [value])); plain `[0]` raises IndexError on a 0-d array.
threshold = float(np.ravel(np.load(threshold_path))[0])

# A sample is predicted positive when its probability reaches the threshold.
y_pred_test = (y_proba_test >= threshold).astype(int)

# -------------------------------------------------------------------
# 4. Compute metrics on test
# -------------------------------------------------------------------
# Probability-based metrics (do not depend on threshold).
# ROC-AUC and average precision score the ranking of the probabilities;
# the Brier score is the mean squared error of the probabilities themselves.
roc_auc_test = roc_auc_score(y_test, y_proba_test)
pr_auc_test = average_precision_score(y_test, y_proba_test)
brier_test = brier_score_loss(y_test, y_proba_test)

# Threshold-based metrics
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking cannot fail
# when only one class appears in both the truth and the predictions.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
# zero_division=0 reports 0 instead of warning when a denominator is empty
# (e.g. no positive predictions at this threshold).
precision_test = precision_score(y_test, y_pred_test, zero_division=0)
recall_test = recall_score(y_test, y_pred_test, zero_division=0)
f1_test = f1_score(y_test, y_pred_test, zero_division=0)
accuracy_test = accuracy_score(y_test, y_pred_test)

# -------------------------------------------------------------------
# 5. Save ROC and PR curves for test
# -------------------------------------------------------------------
def _save_test_curve(display_cls, y_true, y_score, title, out_path):
    """Draw one curve display on its own figure, save it as PNG, and close it.

    The scikit-learn display helpers open a new figure when no axes is
    supplied, so calling plt.figure() beforehand leaves an empty figure open.
    Creating the axes here and passing it via ``ax=`` keeps exactly one figure
    per curve, and closing that figure explicitly releases it.

    Parameters
    ----------
    display_cls : display class exposing ``from_predictions``
        (RocCurveDisplay or PrecisionRecallDisplay).
    y_true : true labels of the test set.
    y_score : predicted positive-class probabilities.
    title : title written on the axes.
    out_path : destination path of the PNG file.
    """
    fig, ax = plt.subplots()
    display_cls.from_predictions(y_true, y_score, ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)


_save_test_curve(
    RocCurveDisplay, y_test, y_proba_test,
    "ROC curve (test)", FIGURES_DIR / "roc_test.png",
)
_save_test_curve(
    PrecisionRecallDisplay, y_test, y_proba_test,
    "Precision-Recall curve (test)", FIGURES_DIR / "pr_test.png",
)

# -------------------------------------------------------------------
# 6. Save per-sample predictions for test
# -------------------------------------------------------------------
# Reuse the test-set row index (when X_test has one) so every prediction can
# be traced back to its row in the raw data.
pred_index = getattr(X_test, "index", None)

test_pred_df = pd.DataFrame(
    dict(y_true=y_test, y_proba=y_proba_test, y_pred=y_pred_test),
    index=pred_index,
)

# Keep the index in the CSV as the first column.
test_pred_df.to_csv(REPORTS_DIR / "test_predictions.csv", index=True)

# -------------------------------------------------------------------
# 7. Save metrics to markdown file
# -------------------------------------------------------------------
test_md_path = REPORTS_DIR / "test_metrics.md"

# Assemble the report as three sections first, then write it in one go.
ranking_section = [
    "# Test set evaluation\n\n",
    "## Ranking metrics (probabilities)\n\n",
    f"- ROC-AUC (test): {roc_auc_test:.4f}\n",
    f"- PR-AUC / Average Precision (test): {pr_auc_test:.4f}\n",
    f"- Brier score (test): {brier_test:.4f}\n\n",
]

classification_section = [
    f"## Classification metrics at threshold = {threshold:.3f}\n\n",
    f"- Precision: {precision_test:.4f}\n",
    f"- Recall: {recall_test:.4f}\n",
    f"- F1-score: {f1_test:.4f}\n",
    f"- Accuracy: {accuracy_test:.4f}\n\n",
]

# Rows are the true class, columns the predicted class.
confusion_section = [
    "### Confusion matrix (test)\n\n",
    "|        | Pred 0 | Pred 1 |\n",
    "|--------|--------|--------|\n",
    f"| True 0 | {tn:6d} | {fp:6d} |\n",
    f"| True 1 | {fn:6d} | {tp:6d} |\n",
]

with open(test_md_path, "w", encoding="utf-8") as f:
    f.writelines(ranking_section + classification_section + confusion_section)

print("Done. Test metrics and figures saved to 'reports/'.")
