# Optuna Pipeline for model selection

## Kaggle ML2
## Matteo A. D'Alessandro, Carlo A. Patti

In [1]:
import optuna
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb

In [None]:
# Placeholder locations of the training data; point these at the real CSV files.
X_TRAIN_CSV = 'path_to_your_X_train.csv'
Y_TRAIN_CSV = 'path_to_your_y_train.csv'

# Features are kept as a DataFrame (rows are later selected with `.iloc`).
X = pd.read_csv(X_TRAIN_CSV)
# Labels are flattened to a 1-D NumPy array so they can be indexed by position.
y = pd.read_csv(Y_TRAIN_CSV).values.ravel()

In [2]:
# Registry of candidate estimators, keyed by the label recorded for each trial.
# `objective` offers these keys (in this insertion order) as the choices of the
# "Model" categorical parameter and uses the selected key to look up the class
# it instantiates.
MODEL_NAME2MODEL = {
    "RandomForestClassifier": RandomForestClassifier,
    "LogisticRegression": LogisticRegression,
    "AdaBoostClassifier": AdaBoostClassifier,
    "GradientBoostingClassifier": GradientBoostingClassifier,
    "ExtraTreesClassifier": ExtraTreesClassifier,
    # NOTE: this label differs from the class name (`XGBClassifier`).
    "XGBoostClassifier": xgb.XGBClassifier
}

In [3]:
def evaluate_model(model, trial):
    """Return the mean accuracy of ``model`` over a 5-fold stratified split.

    The module-level ``X`` (indexed with ``.iloc``) and ``y`` (indexed by
    position) are split with ``StratifiedKFold(n_splits=5)``. For every fold
    the same ``model`` object is fitted on the training rows and scored with
    ``accuracy_score`` on the held-out rows.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        trial: Accepted for interface symmetry with ``objective``; it is not
            used inside this function.

    Returns:
        ``np.mean`` of the per-fold accuracy scores.
    """
    splitter = StratifiedKFold(n_splits=5)
    fold_accuracies = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        # Refit on this fold's training rows, then score on its held-out rows.
        model.fit(X.iloc[fit_rows], y[fit_rows])
        holdout_predictions = model.predict(X.iloc[holdout_rows])
        fold_accuracies.append(
            accuracy_score(y[holdout_rows], holdout_predictions)
        )

    return np.mean(fold_accuracies)

In [4]:
def _forest_search_space(trial, n_estimators_low, n_estimators_high):
    """Sample the hyperparameters shared by the two forest-style estimators."""
    return {
        "n_estimators": trial.suggest_int(
            "n_estimators", n_estimators_low, n_estimators_high
        ),
        "max_depth": trial.suggest_int("max_depth", 2, 32),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
    }


def objective(trial):
    """Optuna objective: choose an estimator, sample its settings, score it.

    The estimator family is selected through the categorical parameter
    ``"Model"``, whose choices are the keys of ``MODEL_NAME2MODEL``. The
    hyperparameters sampled afterwards depend on that choice. Logistic
    regression is placed in a pipeline behind a ``StandardScaler``; every
    other estimator is evaluated as constructed.

    Args:
        trial: The Optuna trial used to sample parameters.

    Returns:
        The value returned by ``evaluate_model`` for the configured estimator.
    """
    family = trial.suggest_categorical("Model", list(MODEL_NAME2MODEL))
    estimator_cls = MODEL_NAME2MODEL[family]

    if family == "RandomForestClassifier":
        hyperparams = _forest_search_space(trial, 10, 200)
    elif family == "ExtraTreesClassifier":
        hyperparams = _forest_search_space(trial, 100, 300)
    elif family == "LogisticRegression":
        hyperparams = {
            "C": trial.suggest_float("C", 1e-4, 1e4, log=True),
            "solver": trial.suggest_categorical(
                "solver", ["newton-cg", "lbfgs", "liblinear"]
            ),
        }
    elif family == "AdaBoostClassifier":
        hyperparams = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
        }
    elif family == "GradientBoostingClassifier":
        hyperparams = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 32),
        }
    elif family == "XGBoostClassifier":
        hyperparams = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }
    else:
        # Fallback for a registry entry without a dedicated search space:
        # the estimator is built with its default arguments.
        hyperparams = {}

    estimator = estimator_cls(**hyperparams)
    if family == "LogisticRegression":
        # Only the linear model is standardized before fitting.
        estimator = make_pipeline(StandardScaler(), estimator)

    return evaluate_model(estimator, trial)

In [None]:
# Create a study that maximizes the value returned by `objective` (the mean
# fold accuracy computed in `evaluate_model`) and run 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# `best_params` holds the parameters suggested during the best trial; its
# "Model" entry is the estimator label chosen inside `objective`.
best_params = study.best_params
best_model_name = best_params["Model"]
print(f"Best model: {best_model_name}")
print("Best parameters:", best_params)