In [38]:
import pandas as pd
import optuna
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from lightgbm import LGBMRegressor

In [29]:
# Root of the pipeline's data directory, relative to this notebook.
# Windows-style separators; the empty last component yields the trailing backslash.
data_path = '\\'.join(('..', 'repo', 'novosib-data-pipline', 'data', ''))

In [30]:
# Training table produced by the interim stage of the data pipeline.
full_base = pd.read_feather(f'{data_path}interim\\full_base.frt')

# Model features: every column from position 4 onward.
# NOTE(review): assumes neither target column sits at position >= 4 — confirm,
# otherwise the target leaks into the features.
train_data = full_base.iloc[:, 4:]

# Regression targets (cnt_target is not used in the cells visible here).
inc_target = full_base['paid_avg_correct']
cnt_target = full_base['transactions_count']

In [14]:
def gb_avg_inc(trial, X=train_data, y=inc_target):
    """Optuna objective: mean 5-fold cross-validated R^2 of a LightGBM regressor.

    Parameters
    ----------
    trial
        Optuna trial used to sample ``max_depth``, ``n_estimators``,
        ``min_child_samples``, ``subsample`` and ``reg_alpha``.
    X
        Feature matrix; defaults to ``train_data``.
    y
        Target values; defaults to ``inc_target``.

    The defaults for ``X`` and ``y`` are bound when this function is defined,
    so rebinding the global names later does not change what is scored.

    Returns
    -------
    float
        Mean of the five per-fold R^2 scores.
    """
    # NOTE(review): LightGBM documents that bagging (`subsample`) is only
    # applied when `subsample_freq` > 0, and it is not set here — confirm
    # whether tuning `subsample` has any effect as written.
    gb_params = {
        'max_depth': trial.suggest_int('max_depth', 5, 12, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 80, 180, step=5),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100, step=5),
        'subsample': trial.suggest_float('subsample', .8, 1, step=.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 30, step=.5)
    }

    gb_reg = LGBMRegressor(
        objective='regression',
        learning_rate=.3,
        n_jobs=12,
        importance_type='gain',
        random_state=7,
        **gb_params)

    # Seed the shuffled splitter so every trial is scored on the same folds:
    # without it, score differences between trials mix hyper-parameter effects
    # with fold-assignment noise, and the search is not reproducible.
    folds = KFold(n_splits=5, shuffle=True, random_state=7)

    score = np.mean(cross_val_score(gb_reg, X, y, scoring='r2', cv=folds))

    return score


def callback(study, trial):
    """Optuna callback that stops the study once a trial scores above 0.9.

    Parameters
    ----------
    study
        The running study; ``study.stop()`` is called when the threshold is hit.
    trial
        The trial that just finished; only its ``value`` attribute is read.
    """
    # A trial without a result (e.g. pruned or failed) has value None, and
    # `None > .9` would raise TypeError and abort the optimisation.
    if trial.value is not None and trial.value > .9:
        study.stop()

In [15]:
# Optuna study for the LightGBM search; the objective (R^2) is maximised.
study_gbm = optuna.create_study(
    study_name='LightGBM',
    direction='maximize',
)

[I 2023-10-01 03:08:37,504] A new study created in memory with name: LightGBM


In [16]:
# Run up to 100 trials in parallel; `callback` may stop the study early.
# NOTE(review): n_jobs=-1 here combined with n_jobs=12 inside each
# LGBMRegressor may oversubscribe the CPU — confirm this is intended.
study_gbm.optimize(
    gb_avg_inc,
    callbacks=[callback],
    n_jobs=-1,
    n_trials=100,
)

[I 2023-10-01 03:11:25,796] Trial 14 finished with value: 0.37873760412526425 and parameters: {'max_depth': 7, 'n_estimators': 90, 'min_child_samples': 60, 'subsample': 1.0, 'reg_alpha': 12.0}. Best is trial 14 with value: 0.37873760412526425.
[I 2023-10-01 03:11:29,577] Trial 11 finished with value: 0.3791868721865742 and parameters: {'max_depth': 11, 'n_estimators': 85, 'min_child_samples': 60, 'subsample': 0.8500000000000001, 'reg_alpha': 21.0}. Best is trial 11 with value: 0.3791868721865742.
[I 2023-10-01 03:11:42,204] Trial 3 finished with value: 0.3793906624162566 and parameters: {'max_depth': 9, 'n_estimators': 95, 'min_child_samples': 75, 'subsample': 0.8500000000000001, 'reg_alpha': 5.0}. Best is trial 3 with value: 0.3793906624162566.
[I 2023-10-01 03:11:53,726] Trial 7 finished with value: 0.379380292351738 and parameters: {'max_depth': 8, 'n_estimators': 100, 'min_child_samples': 95, 'subsample': 0.9, 'reg_alpha': 19.0}. Best is trial 3 with value: 0.3793906624162566.
[I 2

In [17]:
# Best objective value (mean CV R^2) found by the study.
study_gbm.best_value

0.38160741897457406

In [18]:
# Hyper-parameters sampled by the best trial.
best_params = study_gbm.best_params

In [19]:
# Refit on the full training set with the tuned hyper-parameters.
# The fixed settings below repeat the ones used inside the tuning objective.
inc_model = LGBMRegressor(
    **best_params,
    random_state=7,
    importance_type='gain',
    n_jobs=12,
    learning_rate=.3,
    objective='regression',
).fit(train_data, inc_target)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1224
[LightGBM] [Info] Number of data points in the train set: 2752615, number of used features: 10
[LightGBM] [Info] Start training from score 1788.795321


In [20]:
import pickle

In [21]:
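# Persisting the fitted model is disabled so a re-run does not overwrite the
# saved file; uncomment the two lines below to save `inc_model`:
# with open('..\\models\\inc_gb.pkl', 'wb') as model_file:
#     pickle.dump(inc_model, model_file)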

In [31]:
# Load the saved model from disk; this rebinds `inc_model`, replacing any
# model fitted earlier in the session.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load model files from a trusted source.
with open('..\\models\\inc_gb.pkl', 'rb') as model_file:
    # The context manager closes the file handle once loading finishes.
    inc_model = pickle.load(model_file)

In [32]:
# Hold-out table produced by the interim stage of the data pipeline.
full_base_test = pd.read_feather(f'{data_path}interim\\full_base_test.frt')

# Same positional feature slice as the training frame (columns 4 onward).
test_data = full_base_test.iloc[:, 4:]

# NOTE: from here on `inc_target` / `cnt_target` refer to the TEST split;
# the training targets bound earlier under these names are replaced.
inc_target = full_base_test['paid_avg_correct']
cnt_target = full_base_test['transactions_count']

In [35]:
# Predictions of `inc_model` for the test features.
pred_values = inc_model.predict(test_data)



In [39]:
# R^2 of the predictions against the test-split target.
r2_score(y_true=inc_target, y_pred=pred_values)

0.4282866116902365

In [40]:
# Mean absolute error of the predictions, in the units of the target.
mean_absolute_error(y_true=inc_target, y_pred=pred_values)

927.8435950863112