In [None]:
import sys, os
from pathlib import Path

# Notebooks do not define __file__, so the project root is derived from the
# working directory instead: it is taken to be the parent of the current folder.
PROJECT_ROOT = Path.cwd().parent

# Make the project root and its src/ folder importable (appended in that order).
sys.path.extend([str(PROJECT_ROOT), str(PROJECT_ROOT / "src")])

# Single seed reused by the stochastic steps further down.
RANDOM_STATE = 42

print("PYTHONPATH patched:", sys.path[-2:])

In [None]:
import pandas as pd
# Label column; every other column of the raw file is kept as a feature.
TARGET = "Survived"
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')

# Separate labels from features; df_raw itself is left untouched.
y = df_raw[TARGET]
X = df_raw.drop(columns=[TARGET])

In [None]:
# Split the dataset into training and testing sets with stratification
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [None]:
# Feature groups handed to the preprocessing builder.
# Pclass is listed with the categorical columns, not the numerical ones.
cat_cols = ["Sex", "Pclass", "Embarked"]
num_cols = ["Age", "SibSp", "Parch", "Fare"]


In [None]:
from src.preprocessing import build_preprocessing
# Build the preprocessing pipeline from the project helper, passing the
# numerical and categorical column lists.
preprocessing = build_preprocessing(num_cols, cat_cols)
# Fit on the training split only and preview the transformed features.
# NOTE(review): .head() requires a pandas object — presumably
# build_preprocessing configures pandas output; confirm, since a plain
# ndarray result would raise AttributeError here.
Xt = preprocessing.fit_transform(X_train) 
Xt.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Chain preprocessing and the classifier so that both are always fitted
# together on the same rows.
# NOTE(review): `preprocessing` is the same object already fitted for the
# preview; fitting the pipeline fits it again on X_train.
RF_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", RandomForestClassifier(random_state=RANDOM_STATE)),
    ]
)

# Train on the full training split (the fitted pipeline is the cell output).
RF_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Five shuffled, stratified folds; the fixed seed keeps fold membership
# identical every time this splitter is used.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Out-of-fold class predictions on the training split (for inspection only).
y_pred = cross_val_predict(estimator=RF_pipeline, X=X_train, y=y_train, cv=skf)

In [None]:
# Out-of-fold predicted probabilities (predict_proba, not decision_function).
# Column index 1 is kept, i.e. the second class in sorted label order
# (presumably Survived == 1 — confirm the label encoding).
oof_proba = cross_val_predict(
    RF_pipeline,
    X_train,
    y_train,
    cv=skf,
    method="predict_proba",
)[:, 1]

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the cross-validated class predictions against the training labels.
prec = precision_score(y_train, y_pred)
rec = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

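We prioritize precision for the "Survived" class because false positives (FP) lead to a misallocation of resources/priorities. The decision threshold is therefore chosen to maximize recall among the operating points that reach a minimum target precision.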

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Build PR curve points from the out-of-fold (OOF) probabilities.
precision, recall, thresholds = precision_recall_curve(y_train, oof_proba)

# precision/recall hold n+1 values for n thresholds: entry i is measured for
# predictions with score >= thresholds[i], and the extra FINAL entry
# (precision=1, recall=0) has no threshold of its own. The padding therefore
# goes at the end so that thr_ext[i] stays aligned with precision[i] and
# recall[i]; prepending a value would shift every threshold by one position.
# np.inf matches the final point's meaning: no sample is predicted positive.
thr_ext = np.r_[thresholds, np.inf]
print(len(precision), len(recall), len(thr_ext))

In [None]:
# Minimum precision the operating point must reach on the OOF predictions.
TARGET_PRECISION = 0.85

# PR-curve points whose precision reaches the target.
mask = precision >= TARGET_PRECISION
print("How many points meet the target precision:", mask.sum())

# Select the operating point: highest recall subject to the precision target,
# falling back to the best F1 when no point reaches the target.
if mask.any():
    candidate_idx = np.where(mask)[0]  # indices where precision >= TARGET_PRECISION
    # np.argmax returns the first maximum, so ties go to the lowest index.
    best_local = candidate_idx[np.argmax(recall[candidate_idx])]
    chosen_idx = int(best_local)  # native int for later use
    strategy = f"precision≥{TARGET_PRECISION:.2f} → max recall"
else:
    # Kept under its own name so the scalar `f1` computed earlier in the
    # notebook is not silently replaced by an array. The small epsilon avoids
    # division by zero where precision + recall == 0.
    f1_curve = 2 * (precision * recall) / (precision + recall + 1e-12)
    chosen_idx = int(np.nanargmax(f1_curve))  # index of max F1 score
    strategy = f"max F1 (target precision {TARGET_PRECISION:.2f} unattainable on OOF)"

# Threshold at the chosen PR point, as a native float for later use.
chosen_thr = float(thr_ext[chosen_idx])

print("Strategy:", strategy)
print("Chosen index:", chosen_idx)
print("Chosen threshold:", round(chosen_thr, 3))
print("Point on PR: precision=", round(precision[chosen_idx],3), "recall=", round(recall[chosen_idx],3))


In [None]:
from pathlib import Path
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score

# Turn the OOF probabilities into hard labels using the selected cutoff.
oof_pred = (oof_proba >= chosen_thr).astype(int)

# Threshold-dependent metrics. zero_division=0 makes an undefined score
# (e.g. no positive predictions) come back as 0.
cm = confusion_matrix(y_train, oof_pred)
prec_at, rec_at, f1_at = (
    metric(y_train, oof_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

# Threshold-free ranking metrics, computed on the raw probabilities.
roc_oof = roc_auc_score(y_train, oof_proba)           # ROC-AUC
ap_oof = average_precision_score(y_train, oof_proba)  # PR-AUC (AP)

print("Confusion matrix @thr:\n", cm)
print(f"OOF @thr -> Precision={prec_at:.3f} | Recall={rec_at:.3f} | F1={f1_at:.3f}")
print(f"OOF AUCs -> PR-AUC(AP)={ap_oof:.3f} | ROC-AUC={roc_oof:.3f}")


In [None]:
# Imports
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate

# Metrics collected per fold; each key becomes "test_<key>" in the results.
scoring = dict(
    roc_auc="roc_auc",
    ap="average_precision",  # AP = area under PR curve
)

# Score the RF pipeline on every fold of `skf` for both metrics at once.
cv_res_rf = cross_validate(
    estimator=RF_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
    return_train_score=False,
)

# Summarize each metric across folds as mean and standard deviation.
roc_mean, roc_std = np.mean(cv_res_rf["test_roc_auc"]), np.std(cv_res_rf["test_roc_auc"])
ap_mean, ap_std = np.mean(cv_res_rf["test_ap"]), np.std(cv_res_rf["test_ap"])

print(f"RF | ROC-AUC: {roc_mean:.3f} ± {roc_std:.3f} | AP: {ap_mean:.3f} ± {ap_std:.3f}")


In [None]:
import numpy as np
from pathlib import Path

# Persist the chosen RF decision threshold so later steps can reload it.
# The directory that is created must be the one that is written to; creating
# ./reports while saving under ../reports fails when ../reports is missing.
reports_dir = Path("../reports")
reports_dir.mkdir(parents=True, exist_ok=True)
threshold_path = reports_dir / "threshold_rf.npy"

# Stored as a one-element float array.
np.save(threshold_path, np.array([chosen_thr], dtype=float))

# Read the file back to confirm the value round-trips.
thr_loaded = float(np.load(threshold_path)[0])
print(f"Saved RF threshold: {thr_loaded:.3f}")
