# XGBoost with parameter tuning using Optuna

This notebook uses Optuna to tune XGBoost parameters. A considerable part of the code was borrowed from [this notebook](https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna), and I also followed the advice in [this blog post](https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e) to select which parameters should be tuned.

# Load libraries and data

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

import optuna
        
# Directory the competition CSV files are read from
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

In [None]:
# Read the three competition CSVs in order, each indexed by its 'id' column.
train, test, submission = (
    pd.read_csv(input_path / f'{stem}.csv', index_col='id')
    for stem in ('train', 'test', 'sample_submission')
)
# Detach the label column so that `train` holds features only.
target = train.pop('target')

# Encode categorical variables with one-hot encoding (ohe)

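From what I've read, XGBoost needs categorical variables to be converted to numeric form, so I one-hot encode them here. Note that `drop_first=True` drops the first level of each categorical feature, so a feature with k levels yields k-1 dummy columns.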

In [None]:
# Encode train and test together so that both end up with the same dummy columns
cat_features = ['cat' + str(i) for i in range(10)]
traintest = pd.concat([train, test])
dummies = pd.get_dummies(traintest, columns=cat_features, drop_first=True)

# Cut the encoded frame back apart at the original train/test boundary
train_ohe = dummies.iloc[:len(train)]
test_ohe = dummies.iloc[len(train):]

In [None]:
# Preview the first rows of the encoded training frame
display(train_ohe.head())

# Preliminary tests to estimate ranges for the parameters

In [None]:
# Hold out 10% of the encoded training rows for validation; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(train_ohe, target, test_size=0.1, random_state=0)

In [None]:
# Base model: only the tree method is set explicitly; no other hyperparameter is passed
model = XGBRegressor(tree_method='gpu_hist')
# Fit on the training part of the hold-out split
model.fit(X_train, y_train)

In [None]:
# Score the base model on the hold-out split
y_pred = model.predict(X_valid)
# Take the square root of the MSE explicitly instead of passing squared=False:
# that keyword was deprecated and later removed from mean_squared_error, while
# sqrt(MSE) yields the same RMSE on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print('RMSE =', f'{rmse:0.5f}')

These are some of the values I tried for each parameter.

In [None]:
# Grid search

#grid = {
#    'random_state': [0], 
#    'n_estimators': [100, 500, 1000, 2000, 5000, 10000],
#    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
#    'learning_rate': [0.001, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.60, 1.00],
#    'reg_lambda': [0.0, 0.01, 0.05, 0.10, 1.0, 10.0],
#    'reg_alpha': [0.0, 0.01, 0.05, 0.10, 1.0, 10.0],
#    'gamma': [0.0, 0.01, 0.05, 0.10, 1.0, 10.0],
#    'subsample': [0.8, 0.9, 1.0],
#    'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
#    'tree_method': ['gpu_hist'],
#}

In [None]:
#scores = []
#for g in ParameterGrid(grid):
#    model = XGBRegressor()
#    model.set_params(**g)
#    model.fit(X_train, y_train)
#    y_pred = model.predict(X_valid)
#    score = mean_squared_error(y_valid, y_pred, squared=False)
#    scores.append(score)
#    print('RMSE =', f'{score:0.5f}', 'Parameters:', g)
#best_idx = np.argmin(scores)
#print('Best score: ', scores[best_idx], ParameterGrid(grid)[best_idx])

# Set objective function for Optuna with parameters and their ranges

To search for the best parameter values, I created the following function with the parameters I wanted to tune. To set the ranges, I ran some preliminary tests varying each parameter individually. Finally, I put everything together as shown below.

In [None]:
def objective(trial):
    """Fit one XGBoost candidate and return its RMSE on the hold-out split.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``; they must hold the intended hold-out split when the study
    runs.

    Args:
        trial: Optuna trial whose ``suggest_*`` methods pick one value for
            each tuned hyperparameter.

    Returns:
        Root mean squared error of the fitted model on ``X_valid`` /
        ``y_valid`` (lower is better).
    """
    params = {
        'random_state': 0,
        # Single-choice categorical: the value is always 10000
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 10),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.9, 1.0]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1, 0.2, 0.3, 0.4, 0.5]),
        'tree_method': 'gpu_hist'    # comment this line if GPU is off
    }
    model = XGBRegressor(**params)
    # The hold-out split doubles as the early-stopping evaluation set.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor rather than on fit() - confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=1000, verbose=0)
    y_pred = model.predict(X_valid)
    # Take the square root of the MSE explicitly instead of passing
    # squared=False: that keyword was deprecated and later removed from
    # mean_squared_error, while sqrt(MSE) is the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))

    return rmse

In [None]:
%%time
# Minimise the objective over 100 trials using a seeded TPE sampler
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials=100)

# Report the outcome of the search
best = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best parameters:', best.params)
print('Best RMSE:', best.value)

# Visualize optimization history

In [None]:
# Plot the optimization history of the study
optuna.visualization.plot_optimization_history(study)

# Recover best parameters found and build final predictions

In [None]:
# Start from the best trial's values and pin the fixed settings on top of them
params = {
    **study.best_params,
    'random_state': 0,
    'n_estimators': 10000,
    'tree_method': 'gpu_hist',
}

In [None]:
# Train one model per fold with the tuned parameters and average their test predictions
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
y_pred = np.zeros(test_ohe.shape[0])

for fold, (train_index, valid_index) in enumerate(kf.split(train_ohe, target)):
    print(f"Running Fold {fold + 1}")
    # Use fold-local names: `objective` reads the global X_train / X_valid /
    # y_train / y_valid, so rebinding them here would make a later re-run of
    # the study silently tune against the last CV fold instead of the
    # original hold-out split.
    X_fold_train, X_fold_valid = train_ohe.iloc[train_index], train_ohe.iloc[valid_index]
    y_fold_train, y_fold_valid = target.iloc[train_index], target.iloc[valid_index]
    model = XGBRegressor(**params)
    # The fold's validation part serves as the early-stopping evaluation set
    model.fit(
        X_fold_train,
        y_fold_train,
        eval_set=[(X_fold_valid, y_fold_valid)],
        early_stopping_rounds=1000,
        verbose=0,
    )
    # Each fold contributes 1/n_folds of the final prediction
    y_pred += model.predict(test_ohe) / n_folds

print("Done!")

In [None]:
# Put the fold-averaged predictions into the sample submission frame
submission['target'] = y_pred
# Write the submission file to the working directory
submission.to_csv('xgboost_optuna.csv')

### Please feel free to add comments and suggestions. Thanks! 😊