# Optuna Pieline for model selection

## Kaggle ML2
## Matteo A. D'Alessandro, Carlo A. Patti

In [1]:
import optuna
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb

In [2]:
df = pd.read_csv('../../data/train_new_features.csv')
target = 'Cover_Type'
y = df[target]
X = df.drop(target, axis=1)

In [3]:
MODEL_NAME2MODEL = {
    "RandomForestClassifier": RandomForestClassifier,
    "LogisticRegression": LogisticRegression,
    "AdaBoostClassifier": AdaBoostClassifier,
    "GradientBoostingClassifier": GradientBoostingClassifier,
    "ExtraTreesClassifier": ExtraTreesClassifier
}

In [4]:
def evaluate_model(model, trial):
    skf = StratifiedKFold(n_splits=5)
    scores = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        score = accuracy_score(y_test, preds)
        scores.append(score)

    return np.mean(scores)

In [5]:
def objective(trial):
    model_name = trial.suggest_categorical("Model", list(MODEL_NAME2MODEL.keys()))
    ModelClass = MODEL_NAME2MODEL[model_name]

    if model_name == "RandomForestClassifier":
        model_params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 200),
            "max_depth": trial.suggest_int("max_depth", 2, 32),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }
    elif model_name == "LogisticRegression":
        model_params = {
            "C": trial.suggest_float("C", 1e-4, 1e4, log=True),
            "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
        }
        model = make_pipeline(StandardScaler(), ModelClass(**model_params))
        return evaluate_model(model, trial)
    
    elif model_name == "AdaBoostClassifier":
        model_params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0)
        }
    elif model_name == "GradientBoostingClassifier":
        model_params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 32),
        }
    elif model_name == "ExtraTreesClassifier":
        model_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 300),
            "max_depth": trial.suggest_int("max_depth", 2, 32),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }
    else:
        model_params = {}

    model = ModelClass(**model_params)
    return evaluate_model(model, trial)

In [6]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_params = study.best_params
best_model_name = best_params["Model"]
print(f"Best model: {best_model_name}")
print("Best parameters:", best_params)

[I 2024-04-02 19:55:03,455] A new study created in memory with name: no-name-a3c77a79-6ab2-495f-9356-261165d335e4
[I 2024-04-02 20:08:57,107] Trial 0 finished with value: 0.827116402116402 and parameters: {'Model': 'GradientBoostingClassifier', 'n_estimators': 178, 'learning_rate': 0.09151353924214442, 'max_depth': 27}. Best is trial 0 with value: 0.827116402116402.
[I 2024-04-02 20:12:24,621] Trial 1 finished with value: 0.823015873015873 and parameters: {'Model': 'GradientBoostingClassifier', 'n_estimators': 64, 'learning_rate': 0.2850220523899318, 'max_depth': 30}. Best is trial 0 with value: 0.827116402116402.
[I 2024-04-02 20:16:44,683] Trial 2 finished with value: 0.8419973544973545 and parameters: {'Model': 'GradientBoostingClassifier', 'n_estimators': 98, 'learning_rate': 0.040083473707634085, 'max_depth': 7}. Best is trial 2 with value: 0.8419973544973545.
[I 2024-04-02 20:17:44,920] Trial 3 finished with value: 0.8097222222222221 and parameters: {'Model': 'GradientBoostingCla

Best model: GradientBoostingClassifier
Best parameters: {'Model': 'GradientBoostingClassifier', 'n_estimators': 98, 'learning_rate': 0.040083473707634085, 'max_depth': 7}


In [7]:
model = GradientBoostingClassifier(n_estimators=98, learning_rate=0.040083473707634085, max_depth=7, random_state=0)

model.fit(X, y)

In [8]:
df_test = pd.read_csv('../../data/test_new_features.csv')

In [14]:
df_test.drop('Id', axis=1, inplace=True)

In [15]:
df_test.shape[0]

581012

In [16]:
preds = model.predict(df_test)


In [17]:
# the id are from 1 to df_test.shape[0]

submission = pd.DataFrame({'Id': range(1, df_test.shape[0] + 1), 'Cover_Type': preds})

submission.to_csv('submission.csv', index=False)