In [None]:
import sys, os
from pathlib import Path

# Resolve the project root: the parent of the directory containing this file
# when ``__file__`` is defined (script execution), otherwise the parent of the
# current working directory (the notebook case, where ``__file__`` is absent).
PROJECT_ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parent

# Make the project root and its ``src`` directory importable. Entries are only
# appended when missing, so re-running this cell does not keep growing
# ``sys.path`` with duplicate entries.
_patched_paths = [str(PROJECT_ROOT), str(PROJECT_ROOT / "src")]
for _entry in _patched_paths:
    if _entry not in sys.path:
        sys.path.append(_entry)

# Seed shared by the stochastic steps of the notebook.
RANDOM_STATE = 42
# NOTE: despite its name, this value is passed as ``n_splits`` (the number of
# cross-validation folds) further down; it is not a random seed.
SPLIT_SEED = 5

# Report the two entries this cell guarantees to be on ``sys.path``.
print("PYTHONPATH patched:", _patched_paths)

In [None]:
import pandas as pd
# Name of the label column; every other column is treated as a feature.
TARGET = "Survived"

# Read the raw Titanic CSV (the path is relative to the working directory).
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')

# Separate the label vector from the feature frame.
y = df_raw[TARGET]
X = df_raw.drop(TARGET, axis="columns")

In [None]:
# Feature groups: numeric columns first, then categorical ones.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]


In [None]:
from src.preprocessing import build_preprocessing
# Build the project's preprocessing transformer from the numeric and
# categorical column groups. ``remainder="drop"`` is forwarded to the helper
# (presumably discarding columns that appear in neither list — TODO confirm
# against ``src.preprocessing.build_preprocessing``).
preprocessing = build_preprocessing(num_cols, cat_cols, remainder="drop")
# Fit the transformer on the full feature frame and transform it in one call.
Xt = preprocessing.fit_transform(X) 
# NOTE(review): ``.head()`` assumes ``fit_transform`` returns a pandas
# DataFrame here; a plain NumPy array has no ``.head`` — confirm the helper
# configures pandas output.
Xt.head()

In [None]:
# StandardScaler lives in ``sklearn.preprocessing``; import it from its public
# module rather than through an incidental re-export of another module.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


# Chain the preprocessing transformer with a logistic-regression classifier so
# both are fitted and applied together as a single estimator.
# ``max_iter=1000`` sets the solver's iteration limit.
full_pipeline = Pipeline(steps=[
    ("preprocess", preprocessing),
    ("model", LogisticRegression(max_iter=1000))
])

# Fit the whole pipeline on the full dataset. This refits the ``preprocessing``
# object passed in above, since the pipeline holds that same instance.
full_pipeline.fit(X, y)

In [None]:
# Cross-validation splitter: stratified folds, shuffled with a fixed seed.
# ``SPLIT_SEED`` supplies the fold count here.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(
    n_splits=SPLIT_SEED,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions for every row. Pass the ``skf`` splitter so the
# folds are the shuffled, seeded stratified folds configured for this
# notebook; a bare integer would ignore that splitter and its random state.
y_pred = cross_val_predict(full_pipeline, X, y, cv=skf)

# Check whether the pipeline exposes ``decision_function``; as the cell's last
# expression, the resulting bool is displayed.
hasattr(full_pipeline, "decision_function")

In [None]:
# Check whether the fitted pipeline exposes ``predict_proba``; as the cell's
# last expression, the resulting bool is displayed.
hasattr(full_pipeline, "predict_proba")