In [None]:
import sys, os
from pathlib import Path

# In Jupyter notebooks, __file__ is not defined, so the project root is derived
# from the working directory instead. This assumes the kernel was started in a
# first-level sub-folder of the project (so that cwd's parent is the root).
PROJECT_ROOT = Path.cwd().parent

# Make the project root and its "src" folder importable. Only missing entries are
# appended, so re-running this cell does not pile up duplicates in sys.path.
patched_paths = [str(PROJECT_ROOT), str(PROJECT_ROOT / "src")]
for _path in patched_paths:
    if _path not in sys.path:
        sys.path.append(_path)

# Seed reused as `random_state` by the later split / model / search cells.
RANDOM_STATE = 42

# Report the entries this cell is responsible for (on a first run these are
# exactly the last two items of sys.path).
print("PYTHONPATH patched:", patched_paths)

In [None]:
import pandas as pd
# Read the raw CSV, then separate it into the label series (y) and the
# remaining feature columns (X).
TARGET = "Survived"
df_raw = pd.read_csv('../data/raw/Titanic-Dataset.csv')
y = df_raw[TARGET]
X = df_raw.drop(TARGET, axis="columns")

In [None]:
# Stratified hold-out split: 20% of the rows go to the test partition, the split
# is stratified on y, and the shared seed keeps the partition reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Feature groups handed to the preprocessing builder: columns treated as
# numeric first, then the columns treated as categorical.
num_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
cat_cols = [
    "Sex",
    "Pclass",
    "Embarked",
]


In [None]:
from src.preprocessing import build_preprocessing_hgb_native
# Build the preprocessing pipeline
# build_preprocessing_hgb_native is a project helper from src/preprocessing; its
# internals are not visible here. It is unpacked into two values: a transformer
# object and cat_idx, which is later passed as `categorical_features` to
# HistGradientBoostingClassifier -- presumably the positions of the categorical
# columns in the transformed output (TODO confirm against the helper).
preprocessing, cat_idx = build_preprocessing_hgb_native(num_cols, cat_cols)
# Fit the transformer on the training partition and keep the transformed features.
# NOTE(review): Xt is not referenced by any later cell shown here; it looks like a
# sanity check of the transformer -- confirm before relying on it.
Xt = preprocessing.fit_transform(X_train)


In [None]:
from scipy.stats import randint, uniform  # Use scipy.stats for distributions
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV


# Estimator under tuning: the preprocessing step followed by a histogram-based
# gradient boosting classifier. cat_idx is forwarded as `categorical_features`.
hgb = HistGradientBoostingClassifier(
    random_state=RANDOM_STATE,
    categorical_features=cat_idx,
)
pipe = Pipeline(steps=[("preprocess", preprocessing), ("model", hgb)])

# Search space, keyed as "<pipeline step>__<parameter>".
# scipy's uniform(loc, scale) spans [loc, loc + scale]; randint(low, high)
# draws integers from low up to, but not including, high.
param_distributions = {
    "model__learning_rate": uniform(loc=0.01, scale=0.19),  # 0.01 .. 0.20
    "model__max_leaf_nodes": randint(low=15, high=50),      # 15 .. 49
    "model__min_samples_leaf": randint(low=5, high=30),     # 5 .. 29
    "model__max_iter": randint(low=100, high=600),          # 100 .. 599
}

# Five shuffled, stratified folds; seeded so fold membership is repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Randomised search over the whole pipeline: 40 sampled candidates, scored by
# average precision on the folds above, using all available cores. refit=True
# asks the search to refit the best candidate once the search is done.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions,
    n_iter=40,
    scoring="average_precision",
    cv=skf,
    refit=True,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Run the search on the training partition only.
random_search.fit(X_train, y_train)

# Winning configuration and the score the search reports for it.
best_params, best_score = random_search.best_params_, random_search.best_score_

# Mean test score of the best candidate, read directly from cv_results_ at
# best_index_. With a single scoring metric this is expected to be the same
# number as best_score_, so the last two printed lines should agree.
mean_test_scores = random_search.cv_results_['mean_test_score']
best_mean_cv_ap = mean_test_scores[random_search.best_index_]

print("Best Parameters:", best_params)
print("Best Score:", best_score)
print("Best Mean CV Average Precision:", best_mean_cv_ap)


In [None]:
import pandas as pd

# Summarise the random search: persist the full ranking to disk, then show the
# ten best candidates as this cell's output.

# 1) Convert cv_results_ dict to a DataFrame
cv_results_df = pd.DataFrame(random_search.cv_results_)

# 2) Sort rows by mean_test_score in descending order
cv_results_sorted = cv_results_df.sort_values(
    by="mean_test_score",
    ascending=False
)

# 3) Save full cv_results_ to CSV for later analysis.
#    to_csv does not create missing folders, so make sure the target directory
#    exists first (no-op when it is already there).
cv_results_path = Path("../reports/tuning/cv_results_hgb_random_search.csv")
cv_results_path.parent.mkdir(parents=True, exist_ok=True)
cv_results_sorted.to_csv(cv_results_path, index=False)

# 4) Select top 10 rows for inspection
top10 = cv_results_sorted.head(10)

# 5) Display only the most relevant columns.
#    This must stay the LAST expression of the cell: Jupyter renders only the
#    final expression, so placing it before the to_csv call hid the table.
cols_to_show = [
    "mean_test_score",
    "std_test_score",
    "mean_fit_time",
    "param_model__learning_rate",
    "param_model__max_leaf_nodes",
    "param_model__min_samples_leaf",
    "param_model__max_iter",
]
top10[cols_to_show]

