Optuna uses bayesian optimization to converge to minimum function -> smarter than e.g. gridsearch or randomsearch

### Parameters

In [1]:
data_path = r".\Data\\"
submiss_path = r".\Prediction\\"
optuna_path = r".\Optuna_studies"
#hallo tijl

### Imports

In [10]:
# Algemeen
import pandas as pd
import numpy as np
import math

# Model
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Hyper-parameter optimizing
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# show optuna results
import plotly

# save studies
import joblib

# Metric to minimize
from sklearn.metrics import mean_squared_error

# Read Data in DataFrame

In [27]:
# Training
train_set = pd.read_csv(data_path+"train.csv")
train_y = train_set['target']
train_X = train_set.drop(labels = ['target','id'], axis = 1)

# Testing
test_set = pd.read_csv(data_path+"test.csv")
test_X = test_set.drop(labels = 'id', axis = 1)

Optional: remove outliers

In [28]:
def remove_outliers(df):
    try:
        for column_name in df.columns:
            Q1 = df[column_name].quantile([0.25]).values
            Q3 = df[column_name].quantile([0.75]).values
            IQR = Q3-Q1
            df[column_name][(df[column_name]< (Q1-1.5*IQR)[0]) | (df[column_name]> (Q3+1.5*IQR)[0])] = df[column_name].median()
    except: 
        Q1 = df.quantile([0.25]).values
        Q3 = df.quantile([0.75]).values
        IQR = Q3-Q1
        df[(df< (Q1-1.5*IQR)[0]) | (df> (Q3+1.5*IQR)[0])] = df.median()
    return df


train_X = remove_outliers(train_X)
test_X = remove_outliers(test_X)
train_y = remove_outliers(train_y)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Optuna Hyperparameter Optimization

## Step 1: make objective function

In [5]:
def scorer(estimator, X, y):
    return np.sqrt(mean_squared_error(estimator.predict(X),y))

In [46]:
# name to save study
study_name = 'GBR_cv'

# accessory objective function to minimize
def objective_GBR_cv(trial, train_X = train_X, train_y = train_y):

    # Define search space
    learning_rate = trial.suggest_loguniform('learning_rate',1e-3,3)
    n_estimators = trial.suggest_int('n_estimators', 2,25)
    max_depth = trial.suggest_int('max_depth', 2,25)

    # make model and perform cross-validation
    model = GradientBoostingRegressor(random_state=34, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
    scores = cross_val_score(model, train_X, train_y, cv=5, scoring=scorer)
    return np.mean(scores)

In [47]:
# name to save study
study_name = 'GBR'

# accessory objective function to minimize
def objective_GBR(trial, train_X = train_X, train_y = train_y):
    X_train, X_eval, y_train, y_eval = train_test_split(train_X, train_y, test_size=0.20, random_state=34)
    # Define search space
    learning_rate = trial.suggest_loguniform('learning_rate',1e-3,0.5)
    n_estimators = trial.suggest_int('n_estimators',3,25)
    max_depth = trial.suggest_int('max_depth', 2,15)

    # make model and perform cross-validation
    model = GradientBoostingRegressor(random_state=34, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
    model.fit(X_train,y_train)
    return scorer(model, X_eval, y_eval)

In [7]:
# name to save study
study_name = 'xGBR_cv'

# accessory objective function to minimize
def objective_xGBR_cv(trial, train_X = train_X, train_y = train_y):

    # Define search space
    learning_rate = trial.suggest_loguniform('learning_rate',1e-3,1e-1)
    n_estimators = trial.suggest_int('n_estimators', 7000,10000)
    max_depth = trial.suggest_int('max_depth', 2,25)
    alpha = trial.suggest_loguniform('alpha', 1e-3,1e-1)
    colsample_bytree = trial.suggest_uniform('colsample_bytree',0.1,1)
    subsample = trial.suggest_uniform('subsample',0.1,1)
    min_child_weight = trial.suggest_int('min_child_weight',200,450)


    # make model and perform cross-validation
    model = XGBRegressor(random_state=24, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, alpha = alpha, colsample_bytree = colsample_bytree, subsample = subsample, min_child_weight = min_child_weight)
    scores = cross_val_score(model, train_X, train_y, cv=5, scoring=scorer)
    return np.mean(scores)

In [31]:
# name to save study
study_name = 'xGBR'

# accessory objective function to minimize
def objective_xGBR(trial, train_X = train_X, train_y = train_y):
    X_train, X_eval, y_train, y_eval = train_test_split(train_X, train_y, test_size=0.20, random_state=34)

    # Define search space
    learning_rate = trial.suggest_loguniform('learning_rate',1e-3,1e-1)
    n_estimators = trial.suggest_int('n_estimators', 7000,10000)
    max_depth = trial.suggest_int('max_depth', 2,25)
    alpha = trial.suggest_loguniform('alpha', 1e-3,1e-1)
    colsample_bytree = trial.suggest_uniform('colsample_bytree',0.1,1)
    subsample = trial.suggest_uniform('subsample',0.1,1)
    min_child_weight = trial.suggest_int('min_child_weight',200,450)

    # make model and perform cross-validation
    model = XGBRegressor(random_state=24, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, alpha = alpha, colsample_bytree = colsample_bytree, subsample = subsample, min_child_weight = min_child_weight)
    model.fit(X_train,y_train)
    return scorer(model, X_eval, y_eval)

## Step 2: perform study

In [32]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_xGBR, n_trials=8)
print('Best trial:', study.best_trial.params)

[32m[I 2022-06-16 17:11:14,094][0m A new study created in memory with name: no-name-c3c7a2ea-134c-4b5f-a235-773f397186d2[0m
[32m[I 2022-06-16 17:46:10,848][0m Trial 0 finished with value: 0.6959769515063051 and parameters: {'learning_rate': 0.014118594558791367, 'n_estimators': 9202, 'max_depth': 8, 'alpha': 0.011391849063115828, 'colsample_bytree': 0.4744087145615321, 'subsample': 0.8249941207796297, 'min_child_weight': 380}. Best is trial 0 with value: 0.6959769515063051.[0m
[32m[I 2022-06-16 17:54:28,948][0m Trial 1 finished with value: 0.7153138420848728 and parameters: {'learning_rate': 0.06629908218213718, 'n_estimators': 7010, 'max_depth': 7, 'alpha': 0.09657598909879818, 'colsample_bytree': 0.14194448204173135, 'subsample': 0.272279522147122, 'min_child_weight': 404}. Best is trial 0 with value: 0.6959769515063051.[0m
[32m[I 2022-06-16 18:36:38,972][0m Trial 2 finished with value: 0.7161625997595008 and parameters: {'learning_rate': 0.032432426472295856, 'n_estimator

Best trial: {'learning_rate': 0.0010699215607563035, 'n_estimators': 9677, 'max_depth': 21, 'alpha': 0.005907921495764525, 'colsample_bytree': 0.32413252024445605, 'subsample': 0.6179181769708401, 'min_child_weight': 428}


## Step 3: Visualize learning

In [38]:
# shows the scores from all trials
optuna.visualization.plot_optimization_history(study)

In [39]:
# parameter interactions on an interactive chart.
optuna.visualization.plot_contour(study, params=['n_estimators','learning_rate'])

In [40]:
# Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

In [41]:
# Visualize empirical distribution function; The cumulative probability at any point on the line represents the percentage of trials whose objective value is less than the objective value at that point.
optuna.visualization.plot_edf(study, target_name="RMSE on validation set")

## Step 4: Save study

In [43]:
joblib.dump(study, f'{optuna_path}\{study_name}.pkl')

['.\\Optuna_studies\\xGBR.pkl']

## Reload study

In [45]:
study = joblib.load(f'{optuna_path}\{study_name}.pkl')
print(f"Best trial of {study_name} until now:")
print(" Value: ", study.best_trial.value)
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

Best trial of xGBR until now:
 Value:  0.6944172918071159
 Params: 
    learning_rate: 0.0010699215607563035
    n_estimators: 9677
    max_depth: 21
    alpha: 0.005907921495764525
    colsample_bytree: 0.32413252024445605
    subsample: 0.6179181769708401
    min_child_weight: 428


## make best model

In [23]:
model = XGBRegressor(random_state=24, learning_rate=study.best_trial.params['learning_rate'], n_estimators=study.best_trial.params['n_estimators'], max_depth=study.best_trial.params['max_depth'], alpha = study.best_trial.params['lambda'], colsample_bytree = study.best_trial.params['colsample_bytree'], subsample = study.best_trial.params['subsample'], min_child_weight = study.best_trial.params['min_child_weight'])
model.fit(train_X,train_y)

XGBRegressor(alpha=0.002624162988508127, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.17966030790915047, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.020422363908538667,
             max_delta_step=0, max_depth=23, min_child_weight=383, missing=nan,
             monotone_constraints='()', n_estimators=728, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=24,
             reg_alpha=0.00262416294, reg_lambda=1, scale_pos_weight=1,
             subsample=0.707006937285646, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [29]:
test_y = model.predict(test_X)#lol

In [25]:
def make_submission(test_y,test_set = test_set, naam = 'latest'):
    ingevuld_df = pd.DataFrame({'id': test_set['id'], 'target': test_y})
    ingevuld_df.to_csv(path_or_buf=submiss_path +'Submission_'+ naam + '.csv', sep=',',index = False)

In [30]:
make_submission(test_y, naam = 'xgb_optuna2')