# Hyperparameter optimization `no dia` vs (`pre`, `dia`)

## Init config

In [None]:
# Category order (lowest to highest position) for every ordinal feature.
# The lists are handed to the OrdinalEncoder, so their order defines the
# integer codes.
ordinal_orders = {
    "GenHlth": ["excellent", "very good", "good", "fair", "poor"],
    # Five-year brackets from 25-29 up to 75-79, framed by the two
    # irregular brackets at either end.
    "Age": [
        "18-24",
        *(f"{lower}-{lower + 4}" for lower in range(25, 80, 5)),
        "80+",
    ],
    "Education": [
        "no school",
        "elementary",
        "some high school",
        "high school graduate",
        "college",
        "college graduate",
    ],
    "Income": [
        "<$10k",
        "<$15k",
        "<$20k",
        "<$25k",
        "<$35k",
        "<$50k",
        "<$75k",
        ">$75k",
    ],
}

In [None]:
import os
from datetime import datetime
import time
import pandas as pd
import numpy as np
import pickle
import json

from sklearn.model_selection import cross_val_score, StratifiedKFold

from src.config import (
    DATA_SPLIT_DIR,
    TRAIN_RAW_FILENAME,
    VALIDATION_RAW_FILENAME,
    MODELS_DIR,
    STUDY_DIR,
    MODEL_ALIASES
)
from src.model_evaluation import evaluate_classifier

# Make sure the directory that holds the Optuna study databases exists
# (the sqlite storage path of each study is built from STUDY_DIR).
os.makedirs(STUDY_DIR, exist_ok=True)

# For Bayesian Optimization
import optuna
from optuna.samplers import TPESampler

## Bayesian Optimization


### Pipeline

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from src.transformers import MissingFlagTransformer, CategoryFromThresholdTransformer

# --- Nominal (binary / unordered) features: impute, then one-hot encode. ---
nominal_cols = [
    "HighBP",
    "HighChol",
    "CholCheck",
    "Smoker",
    "Stroke",
    "HeartDiseaseorAttack",
    "PhysActivity",
    "Fruits",
    "Veggies",
    "HvyAlcoholConsump",
    "AnyHealthcare",
    "NoDocbcCost",
    "DiffWalk",
    "Sex",
]
nominal_pipe = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        (
            "ohe",
            OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False),
        ),
    ]
)

# --- Ordinal features: impute, then encode using the explicit category order. ---
ordinal_cols = ["GenHlth", "Age", "Education", "Income"]

# OrdinalEncoder expects one category list per column, in column order.
ordinal_categories = [ordinal_orders[col] for col in ordinal_cols]

ordinal_pipe = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OrdinalEncoder(categories=ordinal_categories)),
    ]
)

# --- Numeric features: currently none. BMI, MentHlth and PhysHlth are fed
# through the threshold-based category pipeline below instead. ---
numeric_cols = []  # ["MentHlth", "PhysHlth"]  # "BMI",
num_pipe = Pipeline(
    [("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())]
)

# --- Missing-value flags for all numeric, ordinal and nominal columns. ---
# NOTE(review): MissingFlagTransformer is project code; presumably it emits
# one missing-indicator per input column - confirm in src.transformers.
missing_val_cols = numeric_cols + ordinal_cols + nominal_cols
missing_val_pipe = Pipeline(
    [
        (
            "flags",
            MissingFlagTransformer(),
        ),
    ]
)

# --- Threshold-based categories for BMI, MentHlth and PhysHlth. ---
# The thresholds given here are only the initial values; the hyperparameter
# search overrides them via `preprocessor__cg__cat_gen__thresholds`.
cat_gens = {"BMI": [20, 25, 30], "MentHlth": [0, 5, 10], "PhysHlth": [0, 5, 10]}
cat_gen_cols = ["BMI", "MentHlth", "PhysHlth"]
cat_gen_pipe = Pipeline(
    [
        ("impute", SimpleImputer(strategy="median")),
        (
            "cat_gen",
            CategoryFromThresholdTransformer([cat_gens[c] for c in cat_gen_cols]),
        ),
        (
            "enc",
            OrdinalEncoder(),
        ),
    ]
)

# Columns not listed in any transformer are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_cols),
        ("ord", ordinal_pipe, ordinal_cols),
        ("nom", nominal_pipe, nominal_cols),
        ("miss", missing_val_pipe, missing_val_cols),
        ("cg", cat_gen_pipe, cat_gen_cols),
    ],
    remainder="drop",
)

# Placeholder estimator for the final pipeline step; the hyperparameter search
# is meant to swap in the model under study.
classifier = RandomForestClassifier(class_weight="balanced", random_state=0)

# Preprocess -> randomly oversample the minority class -> classify.
# The imblearn Pipeline is required because of the resampling step.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "sample",
            RandomOverSampler(
                random_state=5,
            ),
        ),
        # Use the configured (seeded) classifier defined above; previously a
        # second, unseeded default RandomForestClassifier() was created here
        # and the configured one was never used.
        ("clf", classifier),
    ]
)


# Let the preprocessor return DataFrames instead of NumPy arrays.
preprocessor.set_output(transform="pandas")

## Load and transform train and validation data

In [None]:
from sklearn.preprocessing import LabelEncoder


# Label column shared by both raw CSV splits.
TARGET_COLUMN = "Diabetes_012"
# "pre" is folded into "dia", leaving the classes `dia` and `no dia`.
MERGED_LABELS = {"pre": "dia"}

# Training split: raw features plus the merged target labels.
train_csv_path = os.path.join(DATA_SPLIT_DIR, TRAIN_RAW_FILENAME)
df_train_raw = pd.read_csv(train_csv_path)
features_train_raw = df_train_raw.drop(columns=TARGET_COLUMN)
target_train_raw = df_train_raw[TARGET_COLUMN].replace(MERGED_LABELS)

# Validation split, prepared the same way.
val_csv_path = os.path.join(DATA_SPLIT_DIR, VALIDATION_RAW_FILENAME)
df_val_raw = pd.read_csv(val_csv_path)
features_val_raw = df_val_raw.drop(columns=TARGET_COLUMN)
target_val_raw = df_val_raw[TARGET_COLUMN].replace(MERGED_LABELS)

# The label encoder is fitted on the training labels only and then applied
# to both splits.
labelencoder = LabelEncoder()
labelencoder.fit(target_train_raw)

target_train_enc = labelencoder.transform(target_train_raw)
target_val_enc = labelencoder.transform(target_val_raw)

## Define Study

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
)
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Estimator classes (not instances) to tune; one study run is performed per
# entry. Commented-out entries have a search space defined in
# `get_objective` and can be re-enabled here.
classifiers = [
    # GaussianNB,
    LogisticRegression,
    # RidgeClassifier,
    # GradientBoostingClassifier,
    # HistGradientBoostingClassifier,
]

In [None]:
# Candidate threshold sets per threshold-binned feature, keyed by the name of
# the Optuna parameter that selects among them.
_threshold_candidates = [
    (
        "cg_bmi_th",
        [
            [30],
            [20, 30],
            [30, 45],
            [20, 30, 40],
            [20, 30, 40, 50],
            [20, 30, 40, 50, 60],
        ],
    ),
    ("cg_mh_th", [[0], [10], [0, 5], [0, 10, 20], [10, 20]]),
    ("cg_ph_th", [[0], [10], [0, 5], [0, 10, 20], [10, 20]]),
]

# Optuna categorical choices are given as strings, so every threshold list is
# stored under a string label:
#     {param_name: {"[t1,t2,...]": [t1, t2, ...]}}
# The label is derived from the numerically sorted list so that label and
# value always agree (sorting the stringified numbers would order e.g.
# [5, 10] as "10,5").
cat_gen_thresholds = {
    name: {
        f"[{','.join(str(x) for x in sorted(lst))}]": sorted(lst)
        for lst in thresholds
    }
    for name, thresholds in _threshold_candidates
}

In [None]:
def get_objective(model_class):
    """Build an Optuna objective that tunes ``model_class`` inside the global pipeline.

    Args:
        model_class: Estimator class (not an instance). Its ``__name__``
            selects the search space in ``suggest_params``.

    Returns:
        A callable ``objective(trial)`` that returns the mean macro-F1 score
        of a 5-fold stratified cross-validation on the training data.
    """

    def suggest_params(trial, model_name: str) -> dict:
        """Sample the constructor arguments for the model called ``model_name``.

        Raises:
            ValueError: If no search space is defined for ``model_name``.
        """
        if model_name == "GaussianNB":
            return {
                "var_smoothing": trial.suggest_float("var_smoothing", 1e-11, 1e-6, log=True)
            }

        elif model_name == "LogisticRegression":
            penalty = trial.suggest_categorical("penalty", ["l2", None])
            return {
                "C": trial.suggest_float("C", 1e-4, 1e2, log=True),
                "penalty": penalty,
                # The former check `penalty != "none"` was always true (the
                # choices are "l2" and None), so lbfgs was used in every
                # trial; lbfgs handles both choices. Kept as-is so that new
                # trials stay comparable with already stored ones.
                "solver": "lbfgs",
                "max_iter": 1000,
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
                "random_state": 0,
            }

        elif model_name == "RidgeClassifier":
            return {
                "alpha": trial.suggest_float("alpha", 1e-4, 100.0, log=True),
                "solver": trial.suggest_categorical(
                    "solver", ["auto", "svd", "cholesky", "lsqr", "sag"]
                ),
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
                "tol": trial.suggest_float("tol", 1e-5, 1e-2, log=True),
                "random_state": 0,
            }

        elif model_name == "GradientBoostingClassifier":
            return {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
                "max_depth": trial.suggest_int("max_depth", 2, 10),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "max_features": trial.suggest_categorical(
                    "max_features", ["sqrt", "log2", None]
                ),
                "random_state": 0,
            }

        elif model_name == "HistGradientBoostingClassifier":
            return {
                "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
                "max_iter": trial.suggest_int("max_iter", 100, 300),
                "max_depth": trial.suggest_int("max_depth", 3, 15),
                "l2_regularization": trial.suggest_float("l2_regularization", 1e-4, 10.0, log=True),
                "early_stopping": True,
                "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 15, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 50),
                "random_state": 0,
            }

        else:
            raise ValueError(f"Unsupported model: {model_name}")

    def objective(trial):
        """Return the mean cross-validated macro-F1 score for one trial."""
        # Search space, part 1: which threshold set to use for each
        # threshold-binned feature (chosen by its string label).
        pipe_params = {
            cg_name: trial.suggest_categorical(cg_name, choices=list(thresholds.keys()))
            for cg_name, thresholds in cat_gen_thresholds.items()
        }
        pipeline.set_params(
            preprocessor__cg__cat_gen__thresholds=[
                cat_gen_thresholds[cg][cat] for cg, cat in pipe_params.items()
            ]
        )

        # Search space, part 2: the model's own hyperparameters.
        params = suggest_params(trial, model_class.__name__)
        model = model_class(**params)

        # Replace the final pipeline step with the model under study.
        # `pipeline.named_steps` returns a fresh Bunch on every access, so
        # assigning `pipeline.named_steps["clf"] = model` is silently lost and
        # the placeholder estimator would be scored instead.
        pipeline.set_params(clf=model)

        scores = cross_val_score(
            estimator=pipeline,
            X=features_train_raw,
            y=target_train_enc,
            scoring="f1_macro",
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            n_jobs=-1,
        )

        # Folds that failed are reported as NaN; average the remaining ones.
        valid_scores = [score for score in scores if not np.isnan(score)]
        if not valid_scores:
            # Every fold failed: report NaN explicitly instead of taking the
            # mean of an empty list (same value, without the NumPy warning).
            return float("nan")

        return np.mean(valid_scores)

    return objective

## Run study

In [None]:
# Number of Optuna trials passed to `study.optimize` for each classifier.
N_TRIALS = 150

In [None]:
# For every classifier: resume its Optuna study, run N_TRIALS further trials,
# refit the full pipeline with the best parameters on the training split,
# evaluate on the validation split and persist model, preprocessor, label
# encoder and results.
for classifier_cls in classifiers:
    print()
    print("#####################################")
    print(classifier_cls.__name__)
    print()
    model_purpose = ",".join(labelencoder.classes_)

    # MODEL_ALIASES is indexed by class name, so the membership test has to
    # use the name as well (testing the class object never matched).
    clf_name = classifier_cls.__name__
    clf_alias = MODEL_ALIASES[clf_name] if clf_name in MODEL_ALIASES else clf_name

    # NOTE(review): the label says "smote" although the pipeline resamples
    # with RandomOverSampler; left unchanged because it is part of the stored
    # study name and of the result metadata.
    special_features = "smote"

    # The study name is pinned so that an existing study is resumed.
    # Generic form: f"{clf_alias}_{special_features}_{model_purpose}"
    study_purpose = "LogisticRegression_smote_dia,no dia"
    study_timestamp_str = "20250721160037"
    study_name = f"{study_timestamp_str}_{study_purpose}"

    # Resume the stored study. It was presumably created once with
    # optuna.create_study(sampler=TPESampler(seed=42), direction="maximize",
    # study_name=study_name, storage=<same sqlite URL>).
    study = optuna.load_study(
        study_name=study_name,
        storage=f"sqlite:///{os.path.join(STUDY_DIR, study_name)}.db",
    )
    study.optimize(get_objective(classifier_cls), n_trials=N_TRIALS)

    start_timestamp = datetime.now()

    # best_params mixes pipeline parameters (threshold labels, keyed by the
    # names in cat_gen_thresholds) with the classifier's own parameters.
    best_params = study.best_params
    best_thresholds = [
        cat_gen_thresholds[name][best_params[name]] for name in cat_gen_thresholds
    ]
    model_params = {
        key: value
        for key, value in best_params.items()
        if key not in cat_gen_thresholds
    }
    # NOTE(review): settings that were fixed rather than suggested during the
    # search (e.g. max_iter) are not part of best_params and therefore fall
    # back to the estimator defaults here.

    # `random_state` is a constructor parameter, not a class attribute, so
    # look it up in the parameters of a default instance.
    if "random_state" in classifier_cls().get_params():
        model_params["random_state"] = 0

    classifier = classifier_cls(**model_params)

    # Fit preprocessing, oversampling and classifier in one go on the raw
    # training data, with the best thresholds and the tuned classifier.
    pipeline.set_params(clf=classifier)
    pipeline.set_params(preprocessor__cg__cat_gen__thresholds=best_thresholds)
    pipeline.fit(features_train_raw, target_train_enc)
    classifier = pipeline.named_steps["clf"]
    fitted_preprocessor = pipeline.named_steps["preprocessor"]

    train_end_timestamp = datetime.now()
    training_duration = train_end_timestamp - start_timestamp

    ## Prediction (the resampling step is skipped at prediction time)
    target_val_pred = pipeline.predict(features_val_raw)

    ## Metrics
    target_val_pred_proba = None

    if hasattr(classifier, "predict_proba"):
        target_val_pred_proba = pipeline.predict_proba(features_val_raw)

        # Binary case: keep only the probability of the second class.
        if target_train_raw.nunique() <= 2:
            target_val_pred_proba = target_val_pred_proba[:, 1]

    results = evaluate_classifier(
        classifier=classifier,
        labels=list(labelencoder.classes_),
        target_truth=target_val_raw,
        target_pred=labelencoder.inverse_transform(target_val_pred),
        target_pred_proba=target_val_pred_proba,
        timestamp=train_end_timestamp,
        model_purpose=model_purpose,
        special_features=special_features,
    )
    results["training_duration"] = int(training_duration.total_seconds())

    model_name = results["model_name"]

    ### Save the model and results
    folder = os.path.join(MODELS_DIR, model_name)
    filename = os.path.join(folder, model_name)
    os.makedirs(folder, exist_ok=True)

    with open(f"{filename}.model.pkl", "wb") as f:
        pickle.dump(classifier, f)

    # Store the *fitted* preprocessor that belongs to the saved model.
    with open(f"{filename}.pipeline.pkl", "wb") as f:
        pickle.dump(fitted_preprocessor, f)

    with open(f"{filename}.label_encoder.pkl", "wb") as f:
        pickle.dump(labelencoder, f)

    with open(f"{filename}.model.txt", "w", encoding="utf-8") as f:
        f.write(str(classifier))

    with open(f"{filename}.results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    with open(f"{filename}.pipeline_params.txt", "w", encoding="utf-8") as f:
        f.write(str(fitted_preprocessor.get_params()))

    with open(f"{filename}.model_params.json", "w", encoding="utf-8") as f:
        json.dump(classifier.get_params(), f, indent=2)

    end_timestamp = datetime.now()
    td = end_timestamp - start_timestamp
    print(
        f"training duration {td.days} d {(td.seconds // 3600)} h"
        f" {(td.seconds % 3600) // 60} m {td.seconds % 60} s"
    )