In [None]:
import sys
from pathlib import Path

# Resolve the project root: one level above this file when run as a script,
# otherwise (in a notebook, where __file__ is not defined) the parent of the
# current working directory.
PROJECT_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parent

# Make both the project root and its src/ folder importable.  A path is only
# appended when it is not already present, so re-running this cell does not
# keep growing sys.path with duplicate entries.
_import_paths = [str(PROJECT_ROOT), str(PROJECT_ROOT / "src")]
for _import_path in _import_paths:
    if _import_path not in sys.path:
        sys.path.append(_import_path)

# Single seed reused wherever the notebook needs a random_state.
RANDOM_STATE = 42

# Report the two project paths themselves (not sys.path[-2:], which would show
# unrelated entries whenever the paths were already registered).
print("PYTHONPATH patched:", _import_paths)

In [None]:
import pandas as pd
# Name of the label column; every other column of the raw table is a feature.
TARGET = "Survived"

# Read the raw CSV once, then separate the feature table from the label series.
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')
X, y = df_raw.drop(columns=[TARGET]), df_raw[TARGET]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.  Stratifying on y keeps the class
# ratio equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Column names handed to build_preprocessing as the numeric feature list.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Column names handed to build_preprocessing as the categorical feature list
# (note that Pclass is treated as a category here, not as a number).
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]

In [None]:
from src.preprocessing import build_preprocessing

# Build the preprocessing step with the project helper, passing the numeric and
# categorical column lists defined above.
# NOTE(review): remainder="drop" presumably discards every column that is not
# listed (as sklearn's ColumnTransformer does) — confirm in src.preprocessing.
preprocessing = build_preprocessing(num_cols, cat_cols, remainder="drop")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


# Chain the preprocessing step and a logistic-regression classifier into one
# estimator, so both are always fitted together on the same rows.
full_pipeline = Pipeline(
    [
        ("preprocess", preprocessing),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Train on the training partition only; as the last expression of the cell the
# fitted pipeline is also rendered as the cell output.
full_pipeline.fit(X=X_train, y=y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation; shuffling with a fixed seed keeps the
# fold assignment reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold class predictions: each training row is predicted by a pipeline
# clone that did not see that row during fitting.
y_pred = cross_val_predict(full_pipeline, X_train, y_train, cv=skf)

# Report which scoring interfaces the pipeline exposes.  A notebook cell only
# renders its final expression, so both checks are gathered into one dict
# instead of two bare statements (the first of which was silently discarded).
{
    "decision_function": hasattr(full_pipeline, "decision_function"),
    "predict_proba": hasattr(full_pipeline, "predict_proba"),
}

In [None]:
# Get cross-validated (out-of-fold) predicted probabilities — this uses
# method="predict_proba", not decision_function scores — with the same `skf`
# splitter as the class predictions.  [:, 1] keeps the second probability
# column; presumably that is the positive class (Survived == 1), assuming the
# labels are 0/1 — verify against the fitted classifier's classes_.
oof_proba = cross_val_predict(full_pipeline, X_train, y_train, cv=skf, method="predict_proba")[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the out-of-fold class predictions against the true labels
# (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the out-of-fold class predictions produced by cross_val_predict.
prec = precision_score(y_train, y_pred)
rec = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

We optimize precision for the "survived" class because false positive cases (FP) lead to a misallocation of resources/priorities.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

# Requires y_train and oof_proba from the cells above.
# Precision/recall pairs over all thresholds (the thresholds are not needed here).
precision, recall, _ = precision_recall_curve(y_train, oof_proba)

# Area under the PR curve, integrated with recall on the x-axis.
pr_auc = auc(recall, precision)

# Draw the curve on an explicit Axes and shade the area beneath it.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f"PR Curve (AUC = {pr_auc:.2f})", color="blue")
ax.fill_between(recall, precision, alpha=0.2, color="blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend(loc="lower left")
ax.grid(alpha=0.3)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Requires y_train and oof_proba from the cells above.
# False/true positive rates over all thresholds of the OOF probabilities.
fpr, tpr, thresholds = roc_curve(y_train, oof_proba)

# Area under the ROC curve.
roc_auc = roc_auc_score(y_train, oof_proba)

# Draw the ROC curve on an explicit Axes, with the chance diagonal as reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color="blue", linewidth=2)
ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Guess")
ax.set_xlabel("False Positive Rate (FPR)")
ax.set_ylabel("True Positive Rate (TPR)")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Build PR curve points from OOF probabilities
precision, recall, thresholds = precision_recall_curve(y_train, oof_proba)

# precision[i] / recall[i] describe predictions with score >= thresholds[i];
# only the final point (precision=1, recall=0) has no threshold.  Pad at the
# END (not the front) so that thr_ext[i] lines up with precision[i] and
# recall[i]; +inf stands for "nothing is predicted positive".
thr_ext = np.r_[thresholds, np.inf]
print(len(precision), len(recall), len(thr_ext))



In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Given: precision, recall, thresholds from precision_recall_curve
TARGET_PRECISION = 0.85

# Alignment used by precision_recall_curve: precision[i] and recall[i] are
# measured on predictions with score >= thresholds[i].  The two arrays are one
# element longer than thresholds; that extra LAST point (precision=1, recall=0)
# has no threshold, so it is excluded from the search.  Index 0 is a regular
# point and must stay eligible.
n_thr = len(thresholds)
cand_idx = np.where(precision[:n_thr] >= TARGET_PRECISION)[0]

if cand_idx.size > 0:
    # Among the points meeting the precision target keep those with maximal
    # recall, and break recall ties in favour of the higher precision.
    top_recall = cand_idx[recall[cand_idx] == recall[cand_idx].max()]
    chosen_idx = top_recall[np.argmax(precision[top_recall])]
    strategy = f"precision≥{TARGET_PRECISION:.2f} → max recall"
else:
    # Fallback: choose the threshold that maximizes F1 (final point excluded);
    # the small epsilon avoids a 0/0 when precision and recall are both zero.
    f1_curve = 2 * (precision * recall) / (precision + recall + 1e-12)
    chosen_idx = np.nanargmax(f1_curve[:n_thr])
    strategy = f"max F1 (target precision {TARGET_PRECISION:.2f} unattainable on OOF)"

# Same index in both arrays: thresholds[i] is the cut-off that produces
# precision[i] / recall[i] (no i-1 shift).
chosen_thr = thresholds[chosen_idx]

# Print strategy and chosen threshold
print("Strategy:", strategy)
print("Chosen index:", chosen_idx)
print("Chosen threshold:", round(chosen_thr, 3))
print("Point on PR: precision=", round(precision[chosen_idx], 3),
      "recall=", round(recall[chosen_idx], 3))

# Verify recomputed metrics on the same OOF scores; with the aligned index
# these should reproduce the PR point printed above.
y_hat = (oof_proba >= chosen_thr).astype(int)
print("Recomputed on OOF: ",
      "precision=", round(precision_score(y_train, y_hat), 6),
      "recall=", round(recall_score(y_train, y_hat), 6),
      "f1=", round(f1_score(y_train, y_hat), 6))

In [None]:
from pathlib import Path
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score

# Persist the chosen threshold as a one-element array under ../reports
# (the directory is created on demand).
Path("../reports").mkdir(parents=True, exist_ok=True)
np.save("../reports/threshold.npy", np.array([chosen_thr]))

# Hard 0/1 out-of-fold predictions at the chosen threshold.
oof_pred = (oof_proba >= chosen_thr).astype(int)

# Threshold-dependent metrics; zero_division=0 reports 0 instead of warning
# when a denominator is empty.
cm = confusion_matrix(y_train, oof_pred)
prec_at, rec_at, f1_at = (
    metric(y_train, oof_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

# Threshold-free metrics, computed on the raw probabilities.
ap_oof = average_precision_score(y_train, oof_proba)  # PR-AUC (AP)
roc_oof = roc_auc_score(y_train, oof_proba)           # ROC-AUC

print("Confusion matrix @thr:\n", cm)
print(f"OOF @thr -> Precision={prec_at:.3f} | Recall={rec_at:.3f} | F1={f1_at:.3f}")
print(f"OOF AUCs -> PR-AUC(AP)={ap_oof:.3f} | ROC-AUC={roc_oof:.3f}")


In [None]:
# === Setup & imports (run once) ===
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import confusion_matrix, precision_recall_curve

# Consistent seaborn style: apply the "notebook" context and the "whitegrid"
# style once here so the plotting cells that follow share the same look.
sns.set(context="notebook", style="whitegrid")


In [None]:
# === 1) Confusion matrices: baseline @0.50 and chosen @t* (raw and normalized) ===

thr_baseline = 0.50


def _show_confusion_heatmap(matrix, title, fmt):
    """Draw one confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    matrix : array-like of shape (2, 2)
        Output of confusion_matrix (rows = true class, columns = predicted).
    title : str
        Title placed above the heatmap.
    fmt : str
        Annotation format: "d" for raw counts, ".2f" for normalized rates.
    """
    fig, ax = plt.subplots(figsize=(4.5, 4))
    sns.heatmap(
        matrix, annot=True, fmt=fmt, cmap="Blues", cbar=False,
        xticklabels=["Pred 0", "Pred 1"], yticklabels=["True 0", "True 1"],
        annot_kws={"fontsize": 11}, ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    fig.tight_layout()
    plt.show()


# Predictions @ thresholds
oof_pred_050 = (oof_proba >= thr_baseline).astype(int)
oof_pred_thr = (oof_proba >= chosen_thr).astype(int)

# Raw confusion matrices
cm_050 = confusion_matrix(y_train, oof_pred_050)
cm_thr = confusion_matrix(y_train, oof_pred_thr)

# Normalized by true class (rows sum to 1)
cm_050_norm = confusion_matrix(y_train, oof_pred_050, normalize="true")
cm_thr_norm = confusion_matrix(y_train, oof_pred_thr, normalize="true")

# Display raw (counts); the title labels are formatted from the threshold
# variables so they cannot drift away from the values actually used.
_show_confusion_heatmap(cm_050, f"Confusion Matrix OOF @{thr_baseline:.2f}", "d")
_show_confusion_heatmap(cm_thr, f"Confusion Matrix OOF @{chosen_thr:.3f}", "d")

# Display normalized (rates)
_show_confusion_heatmap(cm_050_norm, f"Confusion Matrix OOF @{thr_baseline:.2f} (normalized)", ".2f")
_show_confusion_heatmap(cm_thr_norm, f"Confusion Matrix OOF @{chosen_thr:.3f} (normalized)", ".2f")

In [None]:
# === 2) PR curve with markers for 0.50 and chosen threshold ===
# This creates a copy of your PR curve but with points highlighted at both thresholds.

prec, rec, thr = precision_recall_curve(y_train, oof_proba)
# prec[i] / rec[i] belong to thr[i]; only the final point (P=1, R=0) has no
# threshold.  Pad at the END so the arrays align index-for-index:
# len(prec)==len(rec)==len(thr_ext).  +inf means "nothing predicted positive".
thr_ext = np.r_[thr, np.inf]

# Helper: index of the curve point that is in effect at a given threshold value
def nearest_idx_to_threshold(threshold_value, thr_extended):
    """Return the index of the PR point that applies at `threshold_value`.

    Predicting positive when score >= t gives the same labels as using the
    smallest curve threshold that is >= t, so the nearest threshold at or
    above the requested value is looked up (a plain nearest-by-distance match
    could land on a lower threshold and report metrics of a different cut-off).

    threshold_value : float cut-off applied to the scores.
    thr_extended    : increasing thresholds aligned with the PR arrays
                      (thresholds padded with +inf at the end).
    """
    idx = int(np.searchsorted(thr_extended, threshold_value, side="left"))
    # Clamp so the result is always a valid position in thr_extended.
    return min(idx, len(thr_extended) - 1)

idx_050 = nearest_idx_to_threshold(thr_baseline, thr_ext)
idx_thr = nearest_idx_to_threshold(chosen_thr, thr_ext)

plt.figure(figsize=(6, 4.5))
plt.plot(rec, prec, label="PR (OOF)")

# Mark baseline point
plt.scatter(rec[idx_050], prec[idx_050], s=60, marker="o", label=f"@0.50  (P={prec[idx_050]:.2f}, R={rec[idx_050]:.2f})")

# Mark chosen threshold point
plt.scatter(rec[idx_thr], prec[idx_thr], s=70, marker="s", label=f"@{chosen_thr:.3f} (P={prec[idx_thr]:.2f}, R={rec[idx_thr]:.2f})")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("PR curve (OOF) with threshold markers")
plt.legend(loc="lower left")
plt.tight_layout()
# Render explicitly, like the other plotting cells in this notebook.
plt.show()
