# Hyperparameter optimization with Ray

## Libraries

In [1]:
# essentials
import os
import pathlib

import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB

# others
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb

# ray imports
import joblib
import ray
from ray.util.joblib import register_ray

RANDOM_SEED = 64

2023-12-15 14:20:03,096	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.8.0 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Setup

In [2]:
IN_KAGGLE = False
kaggle_folder = "/kaggle/input/playground-series-s3e26"
local_folder = "./data"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "/test.csv", index_col="id")

target_column = "Status"

In [3]:
def feature_engineering(df):
    #train_df['Status'] = train_df['Status'].map({"D": 0,"C": 1,"CL": 2})
    df['date_of_diagnosis'] = df['Age'] - df['N_Days']
    
    df['no_diseases'] = (df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema']) == 0
    df['diseases'] = df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema']
    df['Drug'] = df['Drug'].map({"D-penicillamine": 1,"placebo": 0})

    # change "Stage" to string
    df["Stage"] = df["Stage"].apply(lambda x: str(x))
    return df

train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "/train.csv", index_col="id")

run_feature_engineering = True

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

if run_feature_engineering:
    train_df = feature_engineering(train_df)

if run_feature_engineering:
    categorical_features += ["no_diseases", "diseases"]
    numerical_features += ["date_of_diagnosis"]

for col in categorical_features:
    train_df[col] = train_df[col].astype("category")

X = train_df.drop(columns=target_column)
y = train_df[target_column]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y, shuffle=True)

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

numeric_transformer = Pipeline(
    [
        ("power_transformer", PowerTransformer()),
        ("scaler", MaxAbsScaler()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

## Parameter space

In [4]:
models = {
    "xgboost": xgb.XGBClassifier(objective="multi:softprob", num_class=len(le.classes_), random_state=RANDOM_SEED, n_jobs=4),
    "catboost": cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False),
    #"lightgbm": lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=-1, num_threads=8, force_row_wise=True, early_stopping_round=5),
    "hist_gradient_boosting": HistGradientBoostingClassifier(random_state=RANDOM_SEED),
    "gradient_boosting": GradientBoostingClassifier(random_state=RANDOM_SEED),
    "knn": KNeighborsClassifier(n_jobs=4),

}

naive_bayes_models = {
    #"naive_bayes": GaussianNB(),
    "complement_naive_bayes": ComplementNB(),
    "complement_naive_bayes_bagging": BaggingClassifier(
        base_estimator=ComplementNB(alpha=0.9), 
        random_state=RANDOM_SEED, 
        n_jobs=4
    )
}

param_space = {
    "xgboost": {
        "classifier__eta": np.arange(0.01, 0.2, 0.01),
        "classifier__min_child_weight": np.arange(1, 10, 1),
        "classifier__max_depth": np.arange(3, 10, 1),
        "classifier__gamma": np.arange(0, 1, 0.1),
        "classifier__subsample": np.arange(0.5, 1, 0.1),
        "classifier__colsample_bytree": np.arange(0.5, 1, 0.1),
        "classifier__lambda": np.arange(0, 1, 0.1),
        "classifier__alpha": np.arange(0, 1, 0.1),
    },
    "catboost": {
        "classifier__learning_rate": np.arange(0.01, 0.2, 0.01),
        "classifier__depth": np.arange(3, 10, 1),
        "classifier__l2_leaf_reg": np.arange(1, 10, 1),
        "classifier__border_count": np.arange(1, 10, 1),
        "classifier__min_data_in_leaf": np.arange(1, 10, 1),
    },
    "lightgbm": {
        "classifier__num_leaves": np.arange(50, 100, 1),
        "classifier__max_depth": np.arange(3, 10, 1),
        "classifier__min_data_in_leaf": np.arange(50, 1000, 10),
    },
    "hist_gradient_boosting": {
        "classifier__learning_rate": np.arange(0.01, 0.2, 0.01),
        "classifier__max_depth": np.arange(3, 10, 1),
        "classifier__min_samples_leaf": np.arange(1, 10, 1),
        "classifier__max_leaf_nodes": np.arange(2, 10, 1),
    },
    "gradient_boosting": {
        "classifier__learning_rate": np.arange(0.01, 0.2, 0.01),
        "classifier__max_depth": np.arange(3, 10, 1),
        "classifier__min_samples_leaf": np.arange(1, 10, 1),
        "classifier__max_leaf_nodes": np.arange(2, 10, 1),
    },
    "naive_bayes": {
        "classifier__var_smoothing": np.arange(1e-9, 1e-8, 1e-9),
    },
    "complement_naive_bayes": {
        "classifier__alpha": np.arange(0.8, 1.0, 0.01),
    },
    "complement_naive_bayes_bagging": {
        "classifier__n_estimators": np.arange(10, 200, 10),
        "classifier__max_samples": np.arange(0.1, 1, 0.1),
    },
    "knn": {
        "classifier__n_neighbors": np.arange(15, 30, 1),
        "classifier__weights": ["uniform", "distance"],
        "classifier__leaf_size": np.arange(1, 10, 1),
        "classifier__p": np.arange(1, 10, 1),
    },
}

output_params = {}
output_scores = {}
output_best_esimators = {}

In [5]:
register_ray()

for model_name, model in models.items():
    clf = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", model),
        ]
    )
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

    model_param_space = param_space[model_name]
    search = RandomizedSearchCV(clf, model_param_space, cv=skf, n_iter=150, verbose=10, scoring="neg_log_loss")


    with joblib.parallel_backend('ray'):
        search.fit(X_train, y_train)

    output_params[model_name] = search.best_params_
    output_scores[model_name] = search.best_score_
    output_best_esimators[model_name] = search.best_estimator_

2023-12-15 14:20:03,322	INFO ray_backend.py:74 -- Starting local ray cluster


2023-12-15 14:20:05,373	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Fitting 3 folds for each of 150 candidates, totalling 450 fits
[36m(PoolActor pid=6323)[0m [CV 3/3; 4/150] START classifier__alpha=0.2, classifier__colsample_bytree=0.5, classifier__eta=0.04, classifier__gamma=0.5, classifier__lambda=0.4, classifier__max_depth=6, classifier__min_child_weight=9, classifier__subsample=0.8999999999999999
[36m(PoolActor pid=6300)[0m [CV 1/3; 2/150] END classifier__alpha=0.9, classifier__colsample_bytree=0.7, classifier__eta=0.09999999999999999, classifier__gamma=0.9, classifier__lambda=0.5, classifier__max_depth=3, classifier__min_child_weight=2, classifier__subsample=0.8999999999999999;, score=-0.441 total time=   2.0s
[36m(PoolActor pid=6309)[0m [CV 2/3; 13/150] START classifier__alpha=0.1, classifier__colsample_bytree=0.8999999999999999, classifier__eta=0.03, classifier__gamma=0.8, classifier__lambda=0.0, classifier__max_depth=4, classifier__min_child_weight=3, classifier__subsample=0.5[32m [repeated 29x across cluster] (Ray deduplicates logs by 

In [6]:
output_log_loss = {}
for model_name, model in output_best_esimators.items():

    y_pred_proba = model.predict_proba(X_val)
    y_pred = model.predict(X_val)

    log_loss_score = log_loss(y_val, y_pred_proba)

    output_log_loss[model_name] = log_loss_score

In [7]:

data = []
for model_name in output_params.keys():
    data.append({
        "model": model_name, 
        "random_search_score": output_scores[model_name], 
        "log_loss_score": output_log_loss[model_name],
        "params": str(output_params[model_name])
    })
output_df = pd.DataFrame(data)
#output_df = output_df.sort_values(by="log_loss_score", ascending=True)
#output_df.to_csv("great_hyperparam_search_result.csv", index=False)
output_df

Unnamed: 0,model,random_search_score,log_loss_score,params
0,xgboost,-0.446814,0.428005,"{'classifier__subsample': 0.8999999999999999, ..."
1,catboost,-0.465396,0.444625,"{'classifier__min_data_in_leaf': 1, 'classifie..."
2,hist_gradient_boosting,-0.453935,0.434512,"{'classifier__min_samples_leaf': 9, 'classifie..."
3,gradient_boosting,-0.453359,0.431684,"{'classifier__min_samples_leaf': 7, 'classifie..."
4,knn,-0.86843,0.86133,"{'classifier__weights': 'uniform', 'classifier..."


[33m(raylet)[0m [2023-12-15 16:25:43,909 E 5760 5800] (raylet) agent_manager.cc:70: The raylet exited immediately because one Ray agent failed, agent_name = dashboard_agent/424238335.
[33m(raylet)[0m The raylet fate shares with the agent. This can happen because
[33m(raylet)[0m - The version of `grpcio` doesn't follow Ray's requirement. Agent can segfault with the incorrect `grpcio` version. Check the grpcio version `pip freeze | grep grpcio`.
[33m(raylet)[0m - The agent failed to start because of unexpected error or port conflict. Read the log `cat /tmp/ray/session_latest/logs/{dashboard_agent|runtime_env_agent}.log`. You can find the log file structure here https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure.
[33m(raylet)[0m - The agent is killed by the OS (e.g., out of memory).
