In [None]:
import sys, os
from pathlib import Path

# Resolve the project root. When run as a script `__file__` exists and the root
# is the parent of the file's directory; in a notebook kernel `__file__` is not
# defined, so fall back to the parent of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parent

# Make the project root and its `src/` folder importable. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate entries.
_patched_paths = [str(PROJECT_ROOT), str(PROJECT_ROOT / "src")]
for _entry in _patched_paths:
    if _entry not in sys.path:
        sys.path.append(_entry)

# Seed reused by the train/test split and the cross-validation splitter.
RANDOM_STATE = 42

# Report the entries this cell is responsible for (not merely the tail of
# sys.path, which may hold unrelated entries on a re-run).
print("PYTHONPATH patched:", _patched_paths)

In [None]:
import pandas as pd
# Name of the label column; every other column is kept as a candidate feature.
TARGET = "Survived"

# Read the raw CSV (path is relative to the notebook's working directory),
# then separate the label series from the feature frame.
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')
y = df_raw[TARGET]
X = df_raw.drop(TARGET, axis="columns")

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions of the target in both splits; the shared seed makes the split
# reproducible across kernel restarts.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Feature columns handed to build_preprocessing, grouped by how they are treated.
# Columns handled as numeric features:
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
# Columns handled as categorical features:
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]


In [None]:
from src.preprocessing import build_preprocessing
# Assemble the project's preprocessing transformer for the selected columns.
# remainder="drop" is forwarded to the helper (presumably discarding every
# column not listed above — confirm in src.preprocessing).
preprocessing = build_preprocessing(
    num_cols,
    cat_cols,
    remainder="drop",
)

# Fit on the training split only, then preview the transformed features.
# NOTE(review): .head() assumes the transformer returns a DataFrame — confirm.
Xt = preprocessing.fit_transform(X_train)
Xt.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


# Chain the preprocessing transformer with a logistic-regression classifier so
# that both are fitted together on whatever data the pipeline receives.
full_pipeline = Pipeline(
    [
        ("preprocess", preprocessing),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Train on the training split; fit() returns the pipeline itself, which is the
# value this cell displays.
full_pipeline.fit(X=X_train, y=y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter; shuffling with the shared seed keeps the folds
# reproducible between runs.
skf = StratifiedKFold(
    shuffle=True,
    n_splits=5,
    random_state=RANDOM_STATE,
)

# Out-of-fold class predictions for every training row.
y_pred = cross_val_predict(estimator=full_pipeline, X=X_train, y=y_train, cv=skf)

# Check which scoring interfaces the pipeline exposes. Only the last expression
# of a cell is displayed, so two bare hasattr(...) lines would silently discard
# the first result; collecting both into one dict shows them together.
# Both values are expected to be True for this pipeline.
{
    method: hasattr(full_pipeline, method)
    for method in ("decision_function", "predict_proba")
}

In [None]:
# Out-of-fold scores for threshold analysis. These are predict_proba outputs
# (not decision_function values); column 1 is the probability of the second
# class (presumably Survived == 1 — confirm against the fitted classes_).
y_scores_cv = cross_val_predict(
    full_pipeline,
    X_train,
    y_train,
    cv=skf,
    method="predict_proba",
)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the out-of-fold predictions against the true training
# labels (scikit-learn convention: rows are true classes, columns predicted).
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Summary metrics of the out-of-fold predictions, one per line for readability.
prec = precision_score(y_train, y_pred)
rec = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

We optimize for precision on the positive ("Survived") class, because false positives (FP) lead to a misallocation of resources and priorities.

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall at every candidate threshold, computed from the
# cross-validated scores on the training split.
precision, recall, thresholds = precision_recall_curve(y_train, y_scores_cv)

# Shade the area under the curve on the current axes, then label the axes and
# title in a single call; the trailing semicolon suppresses the repr output.
plt.fill_between(recall, precision)
plt.gca().set(ylabel="Precision", xlabel="Recall", title="Train Precision-Recall curve");