# LightGBM with parameter tuning using Optuna

This notebook uses Optuna to tune LightGBM parameters. Here I use numerical encoding (instead of one-hot encoding) for the categorical variables.

# Load libraries and data

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
        
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor

import optuna
        
# Directory from which the train, test and sample-submission CSV files are read.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

In [None]:
# Read the three competition files, indexing every frame by its `id` column.
train, test, submission = (
    pd.read_csv(input_path / csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Detach the label column so that `train` holds features only.
target = train.pop('target')

# Encode categorical variables as integers

In [None]:
# Columns stored as Python objects (strings) are the categorical features.
object_columns = [col for col in train.columns if train[col].dtype == 'object']

for col in object_columns:
    # Fit on train and test values together so both frames share one mapping
    # and `transform` never meets a label it has not seen.
    encoder = LabelEncoder().fit(list(train[col].values) + list(test[col].values))
    train[col] = encoder.transform(train[col].values)
    test[col] = encoder.transform(test[col].values)

In [None]:
# Preview the first rows of the training features.
display(train.head())

# Preliminary tests to estimate ranges for the parameters

In [None]:
# Hold out 10% of the training rows as a fixed validation set (seeded for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    target,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Base model: an LGBMRegressor with only the random seed set, fitted on the
# training split to give a reference score before any tuning.

model = LGBMRegressor(random_state=0)
model.fit(X_train, y_train)

In [None]:
# Score the base model on the hold-out split.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because that
# keyword has been deprecated and then removed in newer scikit-learn releases.
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print('RMSE =', f'{rmse:0.5f}')

Initial parameter tuning with grid search. I ran each parameter individually to estimate bounds for the search with Optuna.

In [None]:
# Grid search

#grid = {
#    'metric': ['rmse'],
#    'random_state': [0],
#    'n_estimators': [200],
#    'learning_rate': [0.001, 0.01, 0.1, 1.0],
#    'reg_lambda': [0.001, 0.01, 0.1, 1.0, 10.0],
#    'reg_alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
#    'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5],
#    'min_child_samples': [10, 20, 40, 75, 100, 200, 300],
#    'max_depth': [5, 10, 25, 50, 100],
#    'num_leaves': [40, 50, 60, 80, 100],
#}

In [None]:
#scores = []
#for g in ParameterGrid(grid):
#    model = LGBMRegressor()
#    model.set_params(**g)
#    model.fit(X_train, y_train)
#    y_pred = model.predict(X_valid)
#    score = mean_squared_error(y_valid, y_pred, squared=False)
#    scores.append(score)
#    print('RMSE =', f'{score:0.5f} ', 'Parameters:', g)
#best_idx = np.argmin(scores)
#print('Best score: ', scores[best_idx], ParameterGrid(grid)[best_idx])

# Set objective function for Optuna with parameters and their ranges

To search for the best parameter values, I wrote the following objective function, which lists the parameters I wanted to tune. To set their ranges, I ran some preliminary tests varying each parameter individually. Finally, I put everything together as shown below.

In [None]:
def objective(trial):
    """Optuna objective: fit one LightGBM model and return its validation RMSE.

    Hyperparameters are drawn from ``trial``; the model is trained on the
    notebook-level ``X_train``/``y_train`` split and scored on
    ``X_valid``/``y_valid``, which also serves as the early-stopping set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    params = {
        'metric': 'rmse',
        'random_state': 0,
        # Single-choice categorical: keeps the value fixed while still recording
        # it in the trial's parameters.
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1, 0.2, 0.3, 0.4, 0.5]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'max_depth': trial.suggest_int('max_depth', 6, 127),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        # Column indices 0-9 are passed to LightGBM as categorical features.
        'cat_feature': list(range(10)),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        #'device':'gpu',        # comment this line if GPU is off
        #'gpu_platform_id': 0,  # comment this line if GPU is off
        #'gpu_device_id': 0,    # comment this line if GPU is off
    }
    model = LGBMRegressor(**params)
    # NOTE(review): the `early_stopping_rounds` and `verbose` fit keywords were
    # removed in LightGBM 4.0; on newer versions pass the equivalent callbacks.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=1000, verbose=0)
    y_pred = model.predict(X_valid)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))

    return rmse

In [None]:
%%time
# Seed the TPE sampler so the sequence of suggested parameters is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='minimize', sampler=tpe_sampler)
study.optimize(objective, n_trials=50)

# Summarise the search: number of trials plus the best trial's parameters and score.
best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best parameters:', best_trial.params)
print('Best RMSE:', best_trial.value)

# Visualize optimization history

In [None]:
# Show how the objective value (validation RMSE) evolved over the study's trials.
optuna.visualization.plot_optimization_history(study)

# Recover best parameters found and build final predictions

In [None]:
# `study.best_params` only holds values drawn through `trial.suggest_*`, so the
# settings that `objective` hard-codes have to be restored by hand.
params = dict(study.best_params)
params['metric'] = 'rmse'
params['random_state'] = 0
params['n_estimators'] = 10000
# Same categorical column indices as in `objective`. Without them the final
# models would treat the encoded categorical columns as plain numbers and the
# tuned `cat_smooth` / `cat_l2` values would have no effect.
params['cat_feature'] = list(range(10))

In [None]:
# Train one model per fold with the tuned parameters and average the test
# predictions over all folds.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
y_pred = np.zeros(test.shape[0])

for fold, (train_index, valid_index) in enumerate(kf.split(train, target), start=1):
    print("Running Fold {}".format(fold))
    # Fold-local names on purpose: `objective` reads X_train / X_valid /
    # y_train / y_valid as globals, so rebinding them here would make a re-run
    # of the Optuna study silently tune against the last CV fold.
    X_fold_train, X_fold_valid = train.iloc[train_index], train.iloc[valid_index]
    y_fold_train, y_fold_valid = target.iloc[train_index], target.iloc[valid_index]
    model = LGBMRegressor(**params)
    model.fit(
        X_fold_train,
        y_fold_train,
        eval_set=[(X_fold_valid, y_fold_valid)],  # fold's held-out part drives early stopping
        early_stopping_rounds=1000,
        verbose=0,
    )
    # Each fold contributes an equal share to the final test prediction.
    y_pred += model.predict(test) / n_folds

print("Done!")

print("Done!")

In [None]:
# Store the predictions in the `target` column of the sample-submission frame
# and write it to CSV (the `id` index is written as the first column).
submission['target'] = y_pred
submission.to_csv('lgbm_optuna_num_enc.csv')

### Please feel free to add comments and suggestions. Thanks! 😊