# LightGBM & Optuna

In [1]:
%load_ext watermark
%watermark -p scikit-learn,optuna

scikit-learn: 0.24.1
optuna      : 2.10.0



## Dataset

In [2]:
import pandas as pd


X_train = pd.read_csv('pcaX_train.csv').values
y_train = pd.read_csv('pcay_train.csv').values.ravel().astype(int)

X_test = pd.read_csv('pcaX_test.csv').values
y_test = pd.read_csv('pcay_test.csv').values.ravel().astype(int)

print('X_train.shape:', X_train.shape)
print('y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)


X_train.shape: (18592, 49)
y_train.shape: (18592,)
X_test.shape: (4649, 49)
y_test.shape: (4649,)


In [3]:
from sklearn.model_selection import train_test_split
X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0])

Train/Valid/Test sizes: 18592 3719


## My Model

### LightGBM & Optuna

In [4]:
# !pip install optuna
# !pip install lightgbm

In [5]:
import numpy as np
import optuna
from optuna.integration import LightGBMPruningCallback

import lightgbm

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
#optuna.logging.set_verbosity(optuna.logging.WARNING)


def objective(trial, X_train, y_train, cv=5):
    
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [500, 1000, 2000]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.01]),
    }
    
    cv_iterator = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

    cv_scores = np.zeros(cv)
    for idx, (train_sub_idx, valid_idx) in enumerate(cv_iterator.split(X_train, y_train)):
        
        X_train_sub, X_valid = X_train[train_sub_idx], X_train[valid_idx]
        y_train_sub, y_valid = y_train[train_sub_idx], y_train[valid_idx]

        model = lightgbm.LGBMClassifier(objective="multi_logloss", **param_grid)
        model.fit(
            X_train_sub,
            y_train_sub,
            eval_set=[(X_valid, y_valid)],
            eval_metric="multi_logloss",
            verbose=-1,
            early_stopping_rounds=50,
            callbacks=[
                LightGBMPruningCallback(trial=trial, metric="multi_logloss")
            ],  # Add a pruning callback to eliminate unpromising candidates
        )
        preds = model.score(X_valid, y_valid)
        
        cv_scores[idx] = preds

    return 1-np.mean(cv_scores)

In [6]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")

def func(trial):
    return objective(trial, X_train, y_train)

study.optimize(func, n_trials=10);

[32m[I 2021-12-06 20:41:25,387][0m A new study created in memory with name: LGBM Classifier[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1077]	valid_0's multi_logloss: 0.605442
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1007]	valid_0's multi_logloss: 0.600066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1004]	valid_0's multi_logloss: 0.628394
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:42:29,745][0m Trial 0 finished with value: 0.26602882917649084 and parameters: {'n_estimators': 2000, 'learning_rate': 0.01}. Best is trial 0 with value: 0.26602882917649084.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.619924
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.626794
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.61693
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.641451
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.643117


[32m[I 2021-12-06 20:43:00,605][0m Trial 1 finished with value: 0.2800133388856578 and parameters: {'n_estimators': 500, 'learning_rate': 0.01}. Best is trial 0 with value: 0.26602882917649084.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1077]	valid_0's multi_logloss: 0.605442
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1007]	valid_0's multi_logloss: 0.600066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1004]	valid_0's multi_logloss: 0.628394
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:44:03,136][0m Trial 2 finished with value: 0.26602882917649084 and parameters: {'n_estimators': 2000, 'learning_rate': 0.01}. Best is trial 0 with value: 0.26602882917649084.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.619924
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.626794
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.61693
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.641451
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.643117


[32m[I 2021-12-06 20:44:33,724][0m Trial 3 finished with value: 0.2800133388856578 and parameters: {'n_estimators': 500, 'learning_rate': 0.01}. Best is trial 0 with value: 0.26602882917649084.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1077]	valid_0's multi_logloss: 0.605442
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1007]	valid_0's multi_logloss: 0.600066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1004]	valid_0's multi_logloss: 0.628394
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:45:39,060][0m Trial 4 finished with value: 0.26602882917649084 and parameters: {'n_estimators': 2000, 'learning_rate': 0.01}. Best is trial 0 with value: 0.26602882917649084.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1077]	valid_0's multi_logloss: 0.605442
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1007]	valid_0's multi_logloss: 0.600066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1004]	valid_0's multi_logloss: 0.628394
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:46:48,904][0m Trial 5 finished with value: 0.26602882917649084 and parameters: {'n_estimators': 2000, 'learning_rate': 0.01}. Best is trial 0 with value: 0.26602882917649084.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.605976
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.600101
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.628504
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:47:53,246][0m Trial 6 finished with value: 0.2659750512792066 and parameters: {'n_estimators': 1000, 'learning_rate': 0.01}. Best is trial 6 with value: 0.2659750512792066.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1077]	valid_0's multi_logloss: 0.605442
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1007]	valid_0's multi_logloss: 0.600066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1004]	valid_0's multi_logloss: 0.628394
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:49:01,351][0m Trial 7 finished with value: 0.26602882917649084 and parameters: {'n_estimators': 2000, 'learning_rate': 0.01}. Best is trial 6 with value: 0.2659750512792066.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.605976
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.600101
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.628504
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:50:09,268][0m Trial 8 finished with value: 0.2659750512792066 and parameters: {'n_estimators': 1000, 'learning_rate': 0.01}. Best is trial 6 with value: 0.2659750512792066.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1077]	valid_0's multi_logloss: 0.605442
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	valid_0's multi_logloss: 0.612969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1007]	valid_0's multi_logloss: 0.600066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1004]	valid_0's multi_logloss: 0.628394
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's multi_logloss: 0.632207


[32m[I 2021-12-06 20:51:26,738][0m Trial 9 finished with value: 0.26602882917649084 and parameters: {'n_estimators': 2000, 'learning_rate': 0.01}. Best is trial 6 with value: 0.2659750512792066.[0m


In [7]:
print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value: 0.26598
	Best params:
		n_estimators: 1000
		learning_rate: 0.01


In [8]:
model = lightgbm.LGBMClassifier(objective="multi_logloss", **study.best_params)
model.fit(X_train, y_train)

LGBMClassifier(learning_rate=0.01, n_estimators=1000, objective='multi_logloss')

In [9]:
print(f"Training Accuracy: {model.score(X_train, y_train):0.3f}")
print(f"Test Accuracy: {model.score(X_test, y_test):0.5f}")

Training Accuracy: 0.865
Test Accuracy: 0.74597


In [10]:
%timeit model.fit(X_train, y_train) #training(fitting) time
%timeit model.score(X_valid, y_valid) #test time

12 s ± 890 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
928 ms ± 85.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
