Optuna uses bayesian optimization to converge to minimum function -> smarter than e.g. gridsearch or randomsearch

### Parameters

In [7]:
data_path = r".\Data\\"
submiss_path = r".\Prediction\\"
optuna_path = r".\Optuna_studies"
#hallo tijl

### Imports

In [8]:
# Algemeen
import pandas as pd
import numpy as np
import math

# Model
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameter optimizing
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# show optuna results
import plotly

# save studies
import joblib

# Metric to minimize
from sklearn.metrics import mean_squared_error

# Read Data in DataFrame

In [9]:
# Training
train_set = pd.read_csv(data_path+"train.csv")
train_y = train_set['target']
train_X = train_set.drop(labels = ['target','id'], axis = 1)

# Testing
test_set = pd.read_csv(data_path+"test.csv")
test_X = test_set.drop(labels = 'id', axis = 1)

Optional: remove outliers

In [10]:
def remove_outliers(df):
    try:
        for column_name in df.columns:
            Q1 = df[column_name].quantile([0.25]).values
            Q3 = df[column_name].quantile([0.75]).values
            IQR = Q3-Q1
            df[column_name][(df[column_name]< (Q1-1.5*IQR)[0]) | (df[column_name]> (Q3+1.5*IQR)[0])] = df[column_name].median()
    except: 
        Q1 = df.quantile([0.25]).values
        Q3 = df.quantile([0.75]).values
        IQR = Q3-Q1
        df[(df< (Q1-1.5*IQR)[0]) | (df> (Q3+1.5*IQR)[0])] = df.median()
    return df


train_X = remove_outliers(train_X)
test_X = remove_outliers(test_X)
train_y = remove_outliers(train_y)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Optuna Hyperparameter Optimization

## Step 1: make objective function

In [11]:
def scorer(estimator, X, y):
    return np.sqrt(mean_squared_error(estimator.predict(X),y))

In [27]:
# name to save study
study_name = 'GBR_cv'

# accessory objective function to minimize
def objective_cv(trial, train_X = train_X, train_y = train_y):

    # Define search space
    learning_rate = trial.suggest_loguniform('learning_rate',1e-3,3)
    n_estimators = trial.suggest_int('n_estimators', 2,25)
    max_depth = trial.suggest_int('max_depth', 2,25)

    # make model and perform cross-validation
    model = GradientBoostingRegressor(random_state=34, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
    scores = cross_val_score(model, train_X, train_y, cv=5, scoring=scorer)
    return np.mean(scores)

In [13]:
# name to save study
study_name = 'GBR'

# accessory objective function to minimize
def objective(trial, train_X = train_X, train_y = train_y):
    X_train, X_eval, y_train, y_eval = train_test_split(train_X, train_y, test_size=0.20, random_state=34)
    # Define search space
    learning_rate = trial.suggest_loguniform('learning_rate',1e-3,0.5)
    n_estimators = trial.suggest_int('n_estimators',3,25)
    max_depth = trial.suggest_int('max_depth', 2,15)

    # make model and perform cross-validation
    model = GradientBoostingRegressor(random_state=34, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
    model.fit(X_train,y_train)
    return scorer(model, X_eval, y_eval)

## Step 2: perform study

In [14]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
print('Best trial:', study.best_trial.params)

[32m[I 2022-06-15 13:07:29,459][0m A new study created in memory with name: no-name-cd3a6167-ee68-4625-96b8-fb5b12552208[0m
[32m[I 2022-06-15 13:09:02,059][0m Trial 0 finished with value: 0.7234319235435726 and parameters: {'learning_rate': 0.007609252841541411, 'n_estimators': 22, 'max_depth': 10}. Best is trial 0 with value: 0.7234319235435726.[0m
[32m[I 2022-06-15 13:09:21,935][0m Trial 1 finished with value: 0.715822825298018 and parameters: {'learning_rate': 0.10462973186305147, 'n_estimators': 5, 'max_depth': 10}. Best is trial 1 with value: 0.715822825298018.[0m
[32m[I 2022-06-15 13:11:13,092][0m Trial 2 finished with value: 0.7457996837448811 and parameters: {'learning_rate': 0.4663666486729657, 'n_estimators': 21, 'max_depth': 13}. Best is trial 1 with value: 0.715822825298018.[0m
[32m[I 2022-06-15 13:13:13,604][0m Trial 3 finished with value: 0.7194333775035693 and parameters: {'learning_rate': 0.2202834408103476, 'n_estimators': 23, 'max_depth': 14}. Best is tr

Best trial: {'learning_rate': 0.04486948767349003, 'n_estimators': 16, 'max_depth': 13}


## Step 3: Visualize learning

In [15]:
# shows the scores from all trials
optuna.visualization.plot_optimization_history(study)

In [16]:
# parameter interactions on an interactive chart.
optuna.visualization.plot_contour(study, params=['n_estimators','max_depth'])

In [17]:
# Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

In [18]:
# Visualize empirical distribution function; The cumulative probability at any point on the line represents the percentage of trials whose objective value is less than the objective value at that point.
optuna.visualization.plot_edf(study, target_name="RMSE on validation set")

## Step 4: Save study

In [19]:
joblib.dump(study, f'{optuna_path}\{study_name}.pkl')

['.\\Optuna_studies\\GBR.pkl']

## Reload study

In [5]:
study = joblib.load(f'{optuna_path}\{study_name}.pkl')
print(f"Best trial of {study_name} until now:")
print(" Value: ", study.best_trial.value)
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

Best trial of GBR until now:
 Value:  0.7092132405362954
 Params: 
    learning_rate: 0.2777194485412954
    n_estimators: 6
    max_depth: 11
