In [None]:
import sys
from pathlib import Path

# Resolve the project root: the parent of this file's directory when ``__file__``
# exists (script execution), otherwise the parent of the current working directory
# (``__file__`` is not defined inside a notebook kernel).
PROJECT_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parent

# Expose both the project root (so ``src.<module>`` imports resolve) and ``src`` itself.
# The membership guard keeps this cell idempotent: re-running it does not pile up
# duplicate entries on sys.path.
_extra_paths = [str(PROJECT_ROOT), str(PROJECT_ROOT / "src")]
for _path in _extra_paths:
    if _path not in sys.path:
        sys.path.append(_path)

# Single seed reused by the train/test split and the cross-validation splitter.
RANDOM_STATE = 42

# Report the two entries explicitly; sys.path[-2:] would be misleading on a re-run
# where nothing new was appended.
print("PYTHONPATH patched:", _extra_paths)

In [None]:
import pandas as pd
# Name of the label column; every other column is kept as a candidate feature.
TARGET = "Survived"

# Load the raw table. The path is relative to the notebook's working directory.
df_raw = pd.read_csv("../data/raw/Titanic-Dataset.csv")

# Split label and features into new objects; df_raw itself is left untouched.
y = df_raw[TARGET]
X = df_raw.drop(columns=TARGET)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on y preserves the class
# proportions in both parts, and the shared seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Columns passed to the preprocessing builder as the numeric group.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Columns passed to the preprocessing builder as the categorical group.
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]

In [None]:
from src.preprocessing import build_preprocessing

# Project helper from src.preprocessing; its result is used as the first step of the
# modelling Pipeline. remainder="drop" presumably discards columns that are in neither
# list — TODO confirm against src.preprocessing.
preprocessing = build_preprocessing(
    num_cols,
    cat_cols,
    remainder="drop",
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Chain preprocessing and the classifier so that both are fitted together, and
# re-fitted per fold when the pipeline is handed to cross-validation.
pipeline_steps = [
    ("preprocess", preprocessing),
    ("model", LogisticRegression(max_iter=1000)),
]
full_pipeline = Pipeline(pipeline_steps)

# Fit on the training split only; the test split stays untouched.
full_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the fold assignment
# reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold class predictions: each training row is predicted by a pipeline
# fitted on the other folds.
y_pred = cross_val_predict(full_pipeline, X_train, y_train, cv=skf)

# Show which scoring interfaces the pipeline exposes. A notebook only echoes the
# last expression of a cell, so both checks are gathered into one dict instead of
# two bare hasattr(...) lines (the first of which would be silently discarded).
{
    name: hasattr(full_pipeline, name)
    for name in ("decision_function", "predict_proba")
}

In [None]:
# Out-of-fold probabilities from predict_proba (not decision_function scores).
# Only the second column is kept, i.e. the probability of the class listed second
# in classes_ — presumably Survived == 1.
oof_proba = cross_val_predict(
    full_pipeline,
    X_train,
    y_train,
    cv=skf,
    method="predict_proba",
)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the out-of-fold class predictions against the training labels.
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Summary scores for the out-of-fold class predictions.
prec = precision_score(y_train, y_pred)
rec = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

# Precision and recall at every threshold derived from the OOF probabilities;
# the thresholds themselves are not needed for this figure.
precision, recall, _ = precision_recall_curve(y_train, oof_proba)

# Area under the precision-recall curve.
pr_auc = auc(recall, precision)

# Draw the curve on an explicit Axes and shade the area beneath it.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, color="blue", label=f"PR Curve (AUC = {pr_auc:.2f})")
ax.fill_between(recall, precision, color="blue", alpha=0.2)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend(loc="lower left")
ax.grid(alpha=0.3)
plt.show()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# False/true positive rates over the thresholds derived from the OOF probabilities.
fpr, tpr, thresholds = roc_curve(y_train, oof_proba)

# Area under the ROC curve, computed directly from labels and scores.
roc_auc = roc_auc_score(y_train, oof_proba)

# Draw the ROC curve together with the chance-level diagonal for reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="blue", linewidth=2, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Guess")
ax.set_xlabel("False Positive Rate (FPR)")
ax.set_ylabel("True Positive Rate (TPR)")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Build PR curve points from OOF probabilities
precision, recall, thresholds = precision_recall_curve(y_train, oof_proba)

# precision/recall have one more entry than thresholds: precision[i]/recall[i] belong
# to thresholds[i], and the extra LAST point (precision=1, recall=0) has no threshold.
# Pad at the end so thr_ext[i] stays aligned with precision[i]/recall[i]; a leading
# pad would shift every threshold by one position. np.inf marks the threshold-less point.
thr_ext = np.r_[thresholds, np.inf]
print(len(precision), len(recall), len(thr_ext))



In [None]:
from src.choose_threshold import choose_threshold

# Select the operating threshold with the project helper, asking for a precision
# target of 0.85. Only the call is visible here: the exact meaning of `strategy`
# and `metrics` is defined in src.choose_threshold — TODO confirm there.
chosen_thr, strategy, metrics = choose_threshold(
    y_train=y_train,
    oof_proba=oof_proba,
    thresholds=thresholds,
    precision=precision,
    recall=recall,
    target_precision=0.85,
)


In [None]:
from sklearn.metrics import average_precision_score
from src.evaluate_metrics import evaluate_metrics

# Ensure the reports directory exists, then persist the chosen threshold as a
# one-element array.
reports_dir = Path("../reports")
reports_dir.mkdir(exist_ok=True, parents=True)
threshold_record = np.array([chosen_thr])
np.save("../reports/threshold.npy", threshold_record)

# Evaluate the OOF probabilities at the chosen threshold with the project helper.
# This rebinds `metrics`, replacing the value returned by choose_threshold.
metrics = evaluate_metrics(oof_proba, chosen_thr, y_train)



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Consistent seaborn style for the figures drawn after this cell.
# set_theme is the preferred entry point; sns.set is only a legacy alias for it
# that the seaborn documentation says may be removed.
sns.set_theme(context="notebook", style="whitegrid")


In [None]:
# Reference operating point to compare the chosen threshold against.
thr_baseline = 0.50

# Baseline: hard labels and confusion matrix at the 0.50 cut-off.
oof_pred_050 = (oof_proba >= thr_baseline).astype(int)
cm_050 = confusion_matrix(y_train, oof_pred_050)

# Same computation at the chosen threshold.
oof_pred_thr = (oof_proba >= chosen_thr).astype(int)
cm_thr = confusion_matrix(y_train, oof_pred_thr)

# Print the raw matrices, each under its own heading.
print("Confusion Matrix OOF @0.50 :", cm_050, sep="\n")
print("\nConfusion Matrix OOF @chosen_thr :", cm_thr, sep="\n")

In [None]:
# Recompute the PR curve so that points can be highlighted at both thresholds.
prec, rec, thr = precision_recall_curve(y_train, oof_proba)

# prec/rec have one more entry than thr: prec[i]/rec[i] belong to thr[i], and the
# extra LAST point (precision=1, recall=0) has no threshold. Pad at the end to keep
# that alignment; a leading 0.0 shifts every lookup by one and marks the wrong point.
# np.inf can never be the nearest value to a finite threshold, so the threshold-less
# point is never selected as a marker.
thr_ext = np.r_[thr, np.inf]

def nearest_idx_to_threshold(threshold_value, thr_extended):
    """Return the position in ``thr_extended`` closest to ``threshold_value``.

    Args:
        threshold_value: Threshold to look up.
        thr_extended: Array of thresholds supporting element-wise subtraction.

    Returns:
        Index (plain ``int``) of the entry with the smallest absolute difference;
        the first such index when several entries tie.
    """
    gaps = np.abs(thr_extended - threshold_value)
    return int(gaps.argmin())

# Locate the curve points closest to the baseline and to the chosen threshold.
idx_050 = nearest_idx_to_threshold(thr_baseline, thr_ext)
idx_thr = nearest_idx_to_threshold(chosen_thr, thr_ext)

# Legend entries report precision and recall at each marked point.
label_050 = f"@0.50  (P={prec[idx_050]:.2f}, R={rec[idx_050]:.2f})"
label_thr = f"@{chosen_thr:.3f} (P={prec[idx_thr]:.2f}, R={rec[idx_thr]:.2f})"

fig, ax = plt.subplots(figsize=(6, 4.5))
ax.plot(rec, prec, label="PR (OOF)")

# Circle marks the baseline point, square marks the chosen-threshold point.
ax.scatter(rec[idx_050], prec[idx_050], s=60, marker="o", label=label_050)
ax.scatter(rec[idx_thr], prec[idx_thr], s=70, marker="s", label=label_thr)

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("PR curve (OOF) with threshold markers")
ax.legend(loc="lower left")
fig.tight_layout()
