In [None]:
import sys, os
from pathlib import Path


def ensure_on_sys_path(path):
    """Append ``path`` to ``sys.path`` unless it is already listed.

    Keeps this cell idempotent: re-running it in a live kernel does not pile
    up duplicate entries.

    Args:
        path: Directory to make importable (``str`` or ``Path``).

    Returns:
        True if the entry was appended, False if it was already present.
    """
    entry = str(path)
    if entry in sys.path:
        return False
    sys.path.append(entry)
    return True


# Project root: two levels above this file when executed as a script,
# otherwise the parent of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parent

# Make both the project root and its "src" directory importable.
PATCHED_PATHS = [str(PROJECT_ROOT), str(PROJECT_ROOT / "src")]
for _entry in PATCHED_PATHS:
    ensure_on_sys_path(_entry)

# Single seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42

# Print the intended entries rather than sys.path[-2:], which is only correct
# the first time the cell runs.
print("PYTHONPATH patched:", PATCHED_PATHS)

In [None]:
import pandas as pd
TARGET = "Survived" 

# Resolve the dataset from PROJECT_ROOT (defined in the setup cell) instead of
# a cwd-relative string, so the load uses the same root as the import paths.
RAW_DATA_PATH = PROJECT_ROOT / "data" / "raw" / "Titanic-Dataset.csv"
df_raw = pd.read_csv(RAW_DATA_PATH)

# Separate the features from the label column; df_raw itself is left untouched.
X = df_raw.drop(columns=[TARGET])
y = df_raw[TARGET]

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed makes the split repeatable.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Feature groups handed to the preprocessing builder: columns treated as
# numeric versus columns treated as categorical.
NUMERIC_FEATURES = ("Age", "SibSp", "Parch", "Fare")
CATEGORICAL_FEATURES = ("Sex", "Pclass", "Embarked")

num_cols = list(NUMERIC_FEATURES)
cat_cols = list(CATEGORICAL_FEATURES)


In [None]:
from src.preprocessing import build_preprocessing
# Build the preprocessing pipeline with the project helper and fit it on the
# training split only, so nothing learned here comes from X_test.
# remainder="drop" is forwarded to the helper — presumably columns outside
# num_cols/cat_cols are discarded; confirm in src/preprocessing.py.
preprocessing = build_preprocessing(num_cols, cat_cols, remainder="drop")
Xt = preprocessing.fit_transform(X_train) 
# NOTE(review): .head() only exists on pandas objects — this assumes
# build_preprocessing configures pandas output; verify, otherwise this line
# raises AttributeError on a NumPy array or sparse matrix.
Xt.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


# Chain the preprocessing object with a logistic-regression classifier so that
# both are fitted together on whatever data the pipeline receives.
classifier = LogisticRegression(max_iter=1000)
pipeline_steps = [
    ("preprocess", preprocessing),
    ("model", classifier),
]
full_pipeline = Pipeline(steps=pipeline_steps)

# Train on the training split; the fitted pipeline is the cell's output.
full_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Shuffled, seeded, stratified 5-fold splitter; the same object is reused by
# the later cell that collects out-of-fold probabilities.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold class predictions: each training row is predicted by a model
# fitted on the folds that do not contain it.
y_pred = cross_val_predict(full_pipeline, X_train, y_train, cv=skf)

# Show which scoring interfaces the pipeline exposes. A notebook cell only
# displays its last expression, so both checks are gathered into one dict
# instead of two bare hasattr() calls (the first result would be discarded).
{
    method: hasattr(full_pipeline, method)
    for method in ("decision_function", "predict_proba")
}

In [None]:
# Get out-of-fold probability scores via predict_proba (not decision_function),
# using the same splitter `skf` as the class predictions. Column index 1 is
# kept — the positive class, assuming the labels sort as [0, 1].
y_scores_cv = cross_val_predict(full_pipeline, X_train, y_train, cv=skf, method="predict_proba")[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate the out-of-fold predictions against the true training labels.
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the out-of-fold class predictions against the training labels.
prec = precision_score(y_train, y_pred)
rec = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)

print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

We optimize for precision on the "Survived" (positive) class because false positives (FP) — cases predicted as "Survived" that are actually not — lead to a misallocation of resources/priorities.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

# Precision/recall pairs computed from the training labels and the
# out-of-fold scores (y_train and y_scores_cv come from earlier cells).
precision, recall, _ = precision_recall_curve(y_train, y_scores_cv)

# Area under the PR curve, integrated over recall.
pr_auc = auc(recall, precision)

# Draw the curve on an explicit Axes and shade the region beneath it.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f"PR Curve (AUC = {pr_auc:.2f})", color="blue")
ax.fill_between(recall, precision, alpha=0.2, color="blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend(loc="lower left")
ax.grid(alpha=0.3)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# ROC points and ROC-AUC from the training labels and out-of-fold scores
# (y_train and y_scores_cv come from earlier cells).
# NOTE: `thresholds` holds ROC thresholds here; a later cell reassigns the same
# name from precision_recall_curve, so run cells in order.
fpr, tpr, thresholds = roc_curve(y_train, y_scores_cv)
roc_auc = roc_auc_score(y_train, y_scores_cv)

# Model curve plus the dashed diagonal used as the chance-level reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color="blue", linewidth=2)
ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Guess")
ax.set_xlabel("False Positive Rate (FPR)")
ax.set_ylabel("True Positive Rate (TPR)")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Build PR curve points from OOF probabilities
precision, recall, thresholds = precision_recall_curve(y_train, y_scores_cv)

# Per scikit-learn's documented convention, precision[i] / recall[i] describe
# predictions with score >= thresholds[i], and one extra final point
# (precision=1, recall=0) has no threshold. The padding therefore goes at the
# END so thr_ext[i] lines up with precision[i]; prepending would shift every
# threshold by one position. np.inf stands for "predict no positives".
thr_ext = np.r_[thresholds, np.inf]

# All three lengths should now be equal.
print(len(precision), len(recall), len(thr_ext))



In [None]:
# Minimum precision the operating point must reach on the out-of-fold PR curve.
TARGET_PRECISION = 0.85

# precision_recall_curve returns one more precision/recall point than
# thresholds: precision[i] / recall[i] belong to thresholds[i], and the final
# point (precision=1, recall=0) has no threshold. The check also catches a
# stale `thresholds` left over from another cell (e.g. the ROC cell).
assert len(thresholds) == len(precision) - 1, (
    "thresholds is not aligned with precision/recall; "
    "re-run the precision_recall_curve cell"
)

mask = precision >= TARGET_PRECISION
# Exclude the threshold-less final point: its precision is always 1, so it
# would always satisfy the target and make the F1 fallback unreachable.
mask[-1] = False
print("How many points meet the target precision:", mask.sum())

if mask.any():
    # Among the points meeting the precision target, keep the highest recall.
    candidate_idx = np.where(mask)[0]
    chosen_idx = int(candidate_idx[np.argmax(recall[candidate_idx])])
    strategy = f"precision≥{TARGET_PRECISION:.2f} → max recall"
else:
    # Target unattainable: fall back to the point with the best F1. Stored as
    # f1_curve so the scalar `f1` from the metrics cell is not overwritten;
    # the epsilon avoids 0/0 where precision and recall are both zero.
    f1_curve = 2 * (precision * recall) / (precision + recall + 1e-12)
    chosen_idx = int(np.nanargmax(f1_curve[:-1]))
    strategy = f"max F1 (target precision {TARGET_PRECISION:.2f} unattainable on OOF)"

# chosen_idx is always a valid index into thresholds because the final point
# was excluded in both branches. Both names are kept: the original cell
# assigned `chosen_threshold` in one branch and read `chosen_thr` when printing.
chosen_threshold = float(thresholds[chosen_idx])
chosen_thr = chosen_threshold

print("Strategy:", strategy)
print("Chosen index:", chosen_idx)
print("Chosen threshold:", round(chosen_thr, 3))
print("Point on PR: precision=", round(precision[chosen_idx],3), "recall=", round(recall[chosen_idx],3))
