# Import

In [43]:
# Basic
import pandas as pd
import numpy as np

import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.preprocessing import *
from robusta.optimizer import *
from robusta.pipeline import *
from robusta.crossval import *
from sklearn.metrics import *

# Model
from lightgbm import LGBMRegressor

# Data

In [64]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, random_state=666)

X = pd.DataFrame(X)
y = pd.Series(y)

X.rename(columns=lambda x: 'x{}'.format(x), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
X_train

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
29,-3.135568,-0.682559,-0.740026,-1.565277,0.653032,-0.910702,-1.571373,-0.106283,-0.794989,0.250017
535,-0.220983,1.681607,1.541841,-0.190491,1.520588,0.938335,1.503033,-0.538354,0.116053,0.690656
695,-0.106349,-0.442650,1.933125,-0.034856,0.534739,-0.296531,0.372608,-0.541009,-0.653288,0.318019
557,-0.523181,-0.478655,0.176158,-0.245387,1.144202,0.284027,1.830700,0.202389,-0.491186,0.489575
836,2.244197,1.225454,-0.828965,-0.111927,0.375417,0.444073,-0.835202,-0.458208,0.612965,-0.562706
...,...,...,...,...,...,...,...,...,...,...
106,0.394877,-1.394822,0.255025,0.818061,-0.053974,0.193075,-0.785655,0.108597,0.451189,-0.765413
270,-0.008923,-0.601086,-0.054966,0.985366,-0.052347,-0.303155,0.076224,-0.498821,-0.466354,-1.514645
860,-1.305500,3.159843,-0.985534,-1.357325,-0.480135,0.096085,-1.149118,0.994904,-0.804308,-1.595492
435,0.470365,-1.164993,-0.227289,-1.143914,2.274234,-0.661404,-2.009812,0.882182,-1.717222,0.064218


# Task

In [66]:
get_score = lambda y_true, y_pred: -mean_squared_error(y_true, y_pred)
scoring = 'neg_mean_squared_error'

# Baseline

In [67]:
model = LGBMRegressor()

In [68]:
_, y_pred = crossval_predict(model, cv, X_train, y_train, X_new=X_test,
                             scoring=scoring, verbose=2, n_jobs=None)

get_score(y_test, y_pred)

[05:32:59]  LGBMRegressor

[05:32:59]  FOLD  0:   -1849.9203
[05:32:59]  FOLD  1:   -985.8744
[05:32:59]  FOLD  2:   -1560.7978
[05:32:59]  FOLD  3:   -1821.7524
[05:32:59]  FOLD  4:   -1409.5014

[05:33:00]  AVERAGE:   -1525.5693 ± 315.8833



-1294.5736816744054

# Optimizer

## Grid Search

## Random Search

In [None]:
model = LGBMRegressor()

# https://lightgbm.readthedocs.io/en/latest/Parameters.html
param_space = {
    'max_depth': (3, 12, 1),
    'num_leaves': {15, 31, 63, 127, 255, 511, 1023, 2047, 4095},

    'bagging_fraction': (0.1, 0.9, 0.05),
    'feature_fraction': (0.1, 0.9, 0.05),
}

optimizer = RandomSearchCV(model, cv, scoring, param_space=param_space, max_iter=50)
optimizer.fit(X_train, y_train)

model = optimizer.best_estimator_

[05:33:09] ITER: 1/50      SCORE: -1510.0740 ± 228.2347      ETA: 16 sec
[05:33:11] ITER: 2/50      SCORE: -1823.8668 ± 356.8841      ETA: 34 sec
[05:33:11] ITER: 3/50      SCORE: -1809.7237 ± 336.2711      ETA: 27 sec
[05:33:12] ITER: 4/50      SCORE: -1508.8063 ± 257.6029      ETA: 24 sec
[05:33:12] ITER: 5/50      SCORE: -7889.4019 ± 729.3220      ETA: 21 sec
[05:33:13] ITER: 6/50      SCORE: -1301.2092 ± 232.7649      ETA: 20 sec
[05:33:14] ITER: 7/50      SCORE: -1468.2790 ± 290.5045      ETA: 22 sec
[05:33:15] ITER: 8/50      SCORE: -1427.0582 ± 256.7935      ETA: 24 sec
[05:33:15] ITER: 9/50      SCORE: -1823.8668 ± 356.8841      ETA: 22 sec
[05:33:16] ITER: 10/50      SCORE: -1427.0582 ± 256.7935      ETA: 21 sec
[05:33:16] ITER: 11/50      SCORE: -1366.7639 ± 272.2196      ETA: 19 sec
[05:33:17] ITER: 12/50      SCORE: -1358.4104 ± 222.0262      ETA: 19 sec
[05:33:18] ITER: 13/50      SCORE: -8131.6719 ± 735.3454      ETA: 19 sec


In [None]:
_, y_pred = crossval_predict(model, cv, X_train, y_train, X_new=X_test,
                             scoring=scoring, verbose=2, n_jobs=None)

get_score(y_test, y_pred)

## Optuna

In [57]:
model = LGBMRegressor()

# https://lightgbm.readthedocs.io/en/latest/Parameters.html
param_space = {
    'max_depth': (3, 12, 1),
    'num_leaves': {15, 31, 63, 127, 255, 511, 1023, 2047, 4095},

    'bagging_fraction': (0.1, 0.9, 0.05),
    'feature_fraction': (0.1, 0.9, 0.05),
}

optimizer = OptunaCV(model, cv, scoring, param_space=param_space, max_iter=50)
optimizer.fit(X_train, y_train)

model = optimizer.best_estimator_

[05:31:24] ITER: 1/20      SCORE: -1510.0740 ± 228.2347      ETA: 6 sec
[05:31:25] ITER: 2/20      SCORE: -1823.8668 ± 356.8841      ETA: 11 sec
[05:31:25] ITER: 3/20      SCORE: -1809.7237 ± 336.2711      ETA: 9 sec
[05:31:26] ITER: 4/20      SCORE: -1508.8063 ± 257.6029      ETA: 7 sec
[05:31:26] ITER: 5/20      SCORE: -7889.4019 ± 729.3220      ETA: 6 sec
[05:31:27] ITER: 6/20      SCORE: -1301.2092 ± 232.7649      ETA: 6 sec
[05:31:28] ITER: 7/20      SCORE: -1468.2790 ± 290.5045      ETA: 6 sec
[05:31:29] ITER: 8/20      SCORE: -1427.0582 ± 256.7935      ETA: 6 sec
[05:31:30] ITER: 9/20      SCORE: -1823.8668 ± 356.8841      ETA: 5 sec
[05:31:30] ITER: 10/20      SCORE: -1427.0582 ± 256.7935      ETA: 5 sec
[05:31:31] ITER: 11/20      SCORE: -1300.4823 ± 236.3293      ETA: 4 sec
[05:31:31] ITER: 12/20      SCORE: -1311.3966 ± 247.2201      ETA: 3 sec
[05:31:32] ITER: 13/20      SCORE: -1311.3966 ± 247.2201      ETA: 3 sec
[05:31:32] ITER: 14/20      SCORE: -1295.0819 ± 248.8429   

In [58]:
_, y_pred = crossval_predict(model, cv, X_train, y_train, X_new=X_test,
                             scoring=scoring, verbose=2, n_jobs=None)

get_score(y_test, y_pred)

[05:31:39]  LGBMRegressor

[05:31:39]  FOLD  0:   -1512.9140
[05:31:39]  FOLD  1:   -837.6900
[05:31:39]  FOLD  2:   -1420.9950
[05:31:39]  FOLD  3:   -1498.2436
[05:31:39]  FOLD  4:   -1188.1824

[05:31:39]  AVERAGE:   -1291.6050 ± 255.0114



-1027.4699800600881