In [None]:
import sys, os
from pathlib import Path

# Notebooks do not define __file__, so the project root is taken to be the
# parent of the current working directory (the notebook's folder).
PROJECT_ROOT = Path.cwd().parent

# Make both the repository root and its "src" directory importable.
sys.path.extend(str(entry) for entry in (PROJECT_ROOT, PROJECT_ROOT / "src"))

# Single seed shared by every stochastic step in this notebook.
RANDOM_STATE = 42

print("PYTHONPATH patched:", sys.path[-2:])

In [None]:
import pandas as pd
# Name of the label column in the raw CSV.
TARGET = "Survived"

# Read the raw dataset; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')

# Separate the label from the features; df_raw itself is left untouched.
X, y = df_raw.drop(columns=[TARGET]), df_raw[TARGET]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is stratified on the label
# and seeded with RANDOM_STATE so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Columns handed to the preprocessing builder as numeric features.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Columns handed to the preprocessing builder as categorical features.
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]

In [None]:
from src.preprocessing import build_preprocessing
# Build the preprocessing pipeline from the numeric and categorical column lists.
# build_preprocessing is a project helper (src.preprocessing); what it returns is
# not visible here.
preprocessing = build_preprocessing(num_cols, cat_cols)
# Fit on the training split only and transform it, purely for visual inspection.
Xt = preprocessing.fit_transform(X_train) 
# NOTE(review): .head() assumes the transformer returns a pandas DataFrame
# (e.g. configured with set_output(transform="pandas")) — confirm in src.preprocessing.
Xt.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# End-to-end estimator: the preprocessing step followed by a seeded random forest.
rf_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", RandomForestClassifier(random_state=RANDOM_STATE)),
    ]
)

# Fit on the training split; as the last expression, the fitted pipeline is
# also displayed as the cell output.
rf_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Seeded, shuffled 5-fold stratified splitter; later cells reuse it so every
# cross-validated quantity is computed on the same folds.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Out-of-fold class predictions for every training row.
y_pred = cross_val_predict(rf_pipeline, X_train, y=y_train, cv=skf)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the out-of-fold class predictions against the training labels.
prec = precision_score(y_train, y_pred)
rec = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)

print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Out-of-fold probabilities: every training row is scored by a pipeline that was
# fitted without it. Column 1 of predict_proba is kept, i.e. the second class in
# the classifier's sorted classes_ (the positive class for 0/1 labels).
oof_proba = cross_val_predict(rf_pipeline, X_train, y_train, cv=skf, method="predict_proba")[:, 1]

# Build PR curve points from OOF probabilities
precision, recall, thresholds = precision_recall_curve(y_train, oof_proba)

# precision/recall have one more entry than thresholds: precision[i]/recall[i]
# belong to thresholds[i], and the extra FINAL point (precision=1, recall=0) has
# no threshold. The padding therefore goes at the end, not the front; prepending
# would shift every threshold by one position relative to its precision/recall.
# +inf is used because no score satisfies `score >= inf`, which matches that
# "predict nothing positive" end point.
thr_ext = np.r_[thresholds, np.inf]
print(len(precision), len(recall), len(thr_ext))

In [None]:
from src.choose_threshold import choose_threshold

# Pick the decision threshold from the out-of-fold PR curve.
# choose_threshold is a project helper (src.choose_threshold); its selection rule
# is not visible here. Presumably it looks for a threshold reaching the requested
# precision of 0.85 and reports which strategy it used — TODO confirm in the helper.
# Returns a 3-tuple unpacked as (chosen threshold, strategy, metrics).
chosen_thr, strategy, metrics = choose_threshold(
    oof_proba=oof_proba,
    y_train=y_train,
    precision=precision,
    recall=recall,
    thresholds=thresholds,
    target_precision=0.85
)

In [None]:
from pathlib import Path
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score

# Binarise the out-of-fold probabilities at the selected threshold (inclusive).
oof_pred = (oof_proba >= chosen_thr).astype(int)

# Threshold-dependent metrics. zero_division=0 makes an undefined score come
# back as 0 instead of emitting a warning.
cm = confusion_matrix(y_train, oof_pred)
prec_at, rec_at, f1_at = (
    scorer(y_train, oof_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

# Threshold-free ranking metrics, computed from the raw probabilities.
roc_oof = roc_auc_score(y_train, oof_proba)           # ROC-AUC
ap_oof = average_precision_score(y_train, oof_proba)  # PR-AUC (AP)

print("Confusion matrix @thr:\n", cm)
print(f"OOF @thr -> Precision={prec_at:.3f} | Recall={rec_at:.3f} | F1={f1_at:.3f}")
print(f"OOF AUCs -> PR-AUC(AP)={ap_oof:.3f} | ROC-AUC={roc_oof:.3f}")

In [None]:
# Imports
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate

# Metrics collected per fold; each key shows up as "test_<key>" in the results.
scoring = dict(
    roc_auc="roc_auc",
    ap="average_precision",  # AP = area under PR curve
)

# Cross-validate the RF pipeline on the training split, reusing the `skf`
# splitter defined earlier. n_jobs=-1 asks for all available cores.
cv_res_rf = cross_validate(
    rf_pipeline,
    X_train,
    y_train,
    scoring=scoring,
    cv=skf,
    return_train_score=False,
    n_jobs=-1,
)

# Summarise each metric across folds as mean and standard deviation.
roc_mean, roc_std = cv_res_rf["test_roc_auc"].mean(), cv_res_rf["test_roc_auc"].std()
ap_mean, ap_std = cv_res_rf["test_ap"].mean(), cv_res_rf["test_ap"].std()

print(f"RF | ROC-AUC: {roc_mean:.3f} ± {roc_std:.3f} | AP: {ap_mean:.3f} ± {ap_std:.3f}")


In [None]:
import numpy as np
from pathlib import Path

# Persist the chosen decision threshold so it can be reloaded outside this notebook.
# The directory that gets created must be the one the file is written into:
# np.save does not create missing parent directories. The previous code created
# "./reports" (next to the notebook) while saving into "../reports".
threshold_path = Path("../reports/threshold_rf.npy")
threshold_path.parent.mkdir(parents=True, exist_ok=True)

# Stored as a one-element float array.
np.save(threshold_path, np.array([chosen_thr], dtype=float))

# Read the file back as a round-trip check and report the stored value.
thr_loaded = float(np.load(threshold_path)[0])
print(f"Saved RF threshold: {thr_loaded:.3f}")
