In [None]:
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains "src".
cwd = Path.cwd()
root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "src").exists()),
    None,
)

if root is None:
    raise RuntimeError(f"Could not find 'src' directory starting from {cwd}")

# Add the project root to sys.path (NOT src itself) so that `src.*` imports
# resolve; the membership check keeps re-running this cell idempotent.
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

print("CWD:", cwd)
print("PROJECT_ROOT:", root)
print("Has src?:", (root / "src").exists())
# The root is inserted at index 0, so the head of sys.path is what is shown.
print("First sys.path entries:", sys.path[:5])

# Shared seed passed to the train/test split, the model and the CV splitter.
RANDOM_STATE = 42


In [None]:
import pandas as pd
# Name of the label column.
TARGET = "Survived"

# Read the raw dataset, addressed relative to the detected project root.
raw_csv = root / 'data/raw/Titanic-Dataset.csv'
df_raw = pd.read_csv(raw_csv)

# The label is kept as its own Series; the features are every other column.
y = df_raw[TARGET]
X = df_raw.drop(columns=TARGET)

In [None]:
# Split the dataset into training and testing sets with stratification
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on y, with the shared seed.
split_options = {"test_size": 0.2, "stratify": y, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Column groups handed to the preprocessing builder.
# Columns treated as numerical:
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Pclass",
]
# Columns treated as categorical:
cat_cols = [
    "Sex",
    "Embarked",
]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from src.preprocessing import build_preprocessing_hgb_native

# The project helper returns the preprocessing step together with the
# categorical feature indices that are passed on to the model below.
preprocessing, cat_idx = build_preprocessing_hgb_native(num_cols, cat_cols)

# Hyperparameters of the final model, gathered in one place.
hgb_params = {
    "learning_rate": 0.05,
    "max_iter": 150,
    "max_leaf_nodes": 30,
    "min_samples_leaf": 21,
    "categorical_features": cat_idx,
    "random_state": RANDOM_STATE,
}
hgb_final = HistGradientBoostingClassifier(**hgb_params)

# Final pipeline: preprocessing followed by the classifier.
pipe_final = Pipeline(steps=[("preprocess", preprocessing), ("model", hgb_final)])

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

# Shuffled, stratified 5-fold splitter seeded with RANDOM_STATE.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold predicted probabilities on the training set. Only the second
# probability column is kept (presumably the Survived=1 class -- confirm
# against the fitted model's classes_).
oof_proba = cross_val_predict(
    pipe_final, X_train, y_train, cv=cv, method="predict_proba"
)
p_final = oof_proba[:, 1]

In [None]:
from sklearn.metrics import average_precision_score, roc_auc_score

# Threshold-free ranking metrics of the scores in p_final against y_train.
ap_raw = average_precision_score(y_train, p_final)
auc_raw = roc_auc_score(y_train, p_final)

# Both are reported rounded to four decimals.
pr_auc, roc_auc = round(ap_raw, 4), round(auc_raw, 4)

print(f"PR-AUC (AP): {pr_auc}")
print(f"ROC-AUC: {roc_auc}")

In [None]:
# Load stored out-of-fold results from the reports folder.
# NOTE: this rebinds y_train and p_final, so from here on they hold the
# columns of this CSV (as NumPy arrays) rather than the split labels and
# cross-validated probabilities computed earlier in the notebook.
df_oof = pd.read_csv("../reports/train_oof_leader.csv")

y_train, p_final = (df_oof[col].to_numpy() for col in ("y_true", "p_pred"))

# Quick shape check of the two loaded arrays.
print(y_train.shape, p_final.shape)


In [None]:
from pathlib import Path
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Sweep a grid of decision thresholds over the scores in p_final and record
# precision, recall and F1 against y_train at each one.

# Threshold grid 0.00, 0.01, ..., 1.00 (101 points). linspace fixes the number
# of points explicitly instead of relying on float-step arange rounding.
thresholds = np.linspace(0.0, 1.0, 101)

rows = []

for t in thresholds:
    # Binary predictions at threshold t (a score equal to t counts as positive).
    y_pred = (p_final >= t).astype(int)

    # zero_division=0 makes a metric 0 (without a warning) when it is undefined,
    # e.g. precision when nothing is predicted positive.
    precision = precision_score(y_train, y_pred, zero_division=0)
    recall = recall_score(y_train, y_pred, zero_division=0)
    f1 = f1_score(y_train, y_pred, zero_division=0)

    rows.append({
        "threshold": t,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

# One row per threshold.
df_thr = pd.DataFrame(rows)

# Ensure the folder that is actually written to exists. Previously "reports"
# was created under the current directory while the CSV went to "../reports".
reports_dir = Path("../reports")
reports_dir.mkdir(parents=True, exist_ok=True)

# Save the metrics table.
df_thr.to_csv(reports_dir / "threshold_metrics_oof.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric: the df_thr column plotted against the threshold and
# saved as a PNG. The output folder is created first because savefig does not
# create missing directories.
figures_dir = Path("../reports/figures")
figures_dir.mkdir(parents=True, exist_ok=True)

# (df_thr column, y-axis label, figure title, output file name)
metric_plots = [
    ("precision", "Precision", "Precision vs Threshold (OOF)",
     "precision_vs_threshold_oof_leader.png"),
    ("recall", "Recall", "Recall vs Threshold (OOF)",
     "recall_vs_threshold_oof_leader.png"),
    ("f1", "F1 score", "F1 vs Threshold (OOF)",
     "f1_vs_threshold_oof_leader.png"),
]

for column, ylabel, title, filename in metric_plots:
    fig, ax = plt.subplots()
    ax.plot(df_thr["threshold"], df_thr[column])
    ax.set_xlabel("Threshold")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(figures_dir / filename, dpi=150)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics import precision_recall_curve

# Choose the operating threshold from the precision-recall curve of p_final.
#
# precision_recall_curve returns precision/recall arrays one element longer
# than thresholds: precision[i] and recall[i] describe predictions with
# score >= thresholds[i], and the final (precision=1, recall=0) point has no
# threshold. Only the first len(thresholds) points are therefore selectable,
# and index i maps directly to thresholds[i].
precision, recall, thresholds = precision_recall_curve(y_train, p_final)
thr_ext = np.r_[0.0, thresholds]  # not used below; kept so the name stays defined
TARGET_PRECISION = 0.85

# Curve points that have a threshold (drops the trailing threshold-less point).
n_thr = len(thresholds)
prec_at_thr = precision[:n_thr]
rec_at_thr = recall[:n_thr]

# Indices of all thresholds whose precision meets the target.
mask = (prec_at_thr >= TARGET_PRECISION)
cand_idx = np.flatnonzero(mask)

if cand_idx.size > 0:
    # Among thresholds meeting the precision target, take the one with the
    # highest recall (argmax returns the first, i.e. lowest, threshold on ties).
    chosen_idx = cand_idx[np.argmax(rec_at_thr[cand_idx])]
    strategy = f"precision≥{TARGET_PRECISION:.2f} → max recall"
else:
    # Fallback: threshold that maximizes F1; the epsilon avoids 0/0 where
    # precision and recall are both zero.
    f1_curve = 2 * (prec_at_thr * rec_at_thr) / (prec_at_thr + rec_at_thr + 1e-12)
    chosen_idx = np.nanargmax(f1_curve)
    strategy = f"max F1 (target precision {TARGET_PRECISION:.2f} unattainable on OOF)"

# The same index addresses precision, recall and thresholds.
chosen_thr = thresholds[chosen_idx]

# Print strategy and chosen threshold
print("Strategy:", strategy)
print("Chosen index:", chosen_idx)
print("Chosen threshold:", round(chosen_thr, 3))
print("Point on PR: precision=", round(precision[chosen_idx], 3),
      "recall=", round(recall[chosen_idx], 3))

In [None]:
# Persist the selected threshold as a one-element float array, then read it
# back and print it as a round-trip check.
reports_dir = Path("../reports")
reports_dir.mkdir(parents=True, exist_ok=True)

thr_file = "../reports/threshold_final.npy"
np.save(thr_file, np.asarray([chosen_thr], dtype=float))

thr_loaded = float(np.load(thr_file)[0])
print(f"Saved Final threshold: {thr_loaded:.3f}")