In [105]:
! pip install lightgbm --config-settings=cmake.define.USE_OPENMP=OFF



# Прогнозирование заказов такси

Компания «Чётенькое такси» собрала исторические данные о заказах такси в аэропортах. Чтобы привлекать больше водителей в период пиковой нагрузки, нужно спрогнозировать количество заказов такси на следующий час. Необходимо построить модель для прогнозирования стоимости заказа такси, при значении метрики RMSE на тестовой выборке не больее 48.

# Импорты

In [106]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from catboost import *
import lightgbm as lgb

from time import time

from sklearn.model_selection import KFold

import optuna
from sklearn.dummy import DummyRegressor


RANDOM_STATE = 1220

# Предобработка данных

In [107]:
df = pd.read_csv('./taxi.csv', index_col=[0], parse_dates=[0])

In [108]:
df.sort_index(inplace=True)


Для анализа временного ряда выполнил ресемплирование в 1 час и посмотрим изменение кривой

In [52]:
df = df.resample('1H').sum()

In [112]:
def make_features(data, max_lag, rolling_mean_size):
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['day'] = data.index.day
    data['dayofweek'] = data.index.dayofweek
    
    for lag in range(1, max_lag + 1):
        data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)

    data['rolling_mean'] = data['num_orders'].shift(1).rolling(rolling_mean_size).mean()

In [113]:
make_features(df, 100, 200)

train, test = train_test_split(df, shuffle=False, test_size=0.1)
train = train.dropna()



  data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)
  data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)
  data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)
  data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)
  data['lag_{}'.format(lag)] = data['num_orders'].shift(lag)
  data['rolling_mean'] = data['num_orders'].shift(1).rolling(rolling_mean_size).mean()


In [114]:
def info_df(df):
    print('------------------------------')
    print('| Информация о наборе данных |')
    print('------------------------------')
    df.info()
    print('-----------------------------------------')
    print('| Первые и последние 5 строчек датасета |')
    print('-----------------------------------------')
    display(df)
    print('--------------------')
    print('| Сумма дубликатов |')
    print('--------------------')
    print(df.duplicated().sum())
    for i in df.select_dtypes(include='object').columns.to_list():
        print('--------------------------------------')
        print(f'| Уникальные значения признака {i} |')
        print('--------------------------------------')
        print(df[i].unique())

In [115]:
info_df(train)

------------------------------
| Информация о наборе данных |
------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23646 entries, 2018-03-02 09:20:00 to 2018-08-13 14:10:00
Columns: 106 entries, num_orders to rolling_mean
dtypes: float64(101), int64(5)
memory usage: 19.3 MB
-----------------------------------------
| Первые и последние 5 строчек датасета |
-----------------------------------------


Unnamed: 0_level_0,num_orders,year,month,day,dayofweek,lag_1,lag_2,lag_3,lag_4,lag_5,...,lag_92,lag_93,lag_94,lag_95,lag_96,lag_97,lag_98,lag_99,lag_100,rolling_mean
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-02 09:20:00,9,2018,3,2,4,7.0,6.0,15.0,9.0,12.0,...,14.0,6.0,9.0,8.0,9.0,7.0,5.0,6.0,9.0,9.105
2018-03-02 09:30:00,7,2018,3,2,4,9.0,7.0,6.0,15.0,9.0,...,12.0,14.0,6.0,9.0,8.0,9.0,7.0,5.0,6.0,9.105
2018-03-02 09:40:00,11,2018,3,2,4,7.0,9.0,7.0,6.0,15.0,...,13.0,12.0,14.0,6.0,9.0,8.0,9.0,7.0,5.0,9.070
2018-03-02 09:50:00,5,2018,3,2,4,11.0,7.0,9.0,7.0,6.0,...,10.0,13.0,12.0,14.0,6.0,9.0,8.0,9.0,7.0,8.985
2018-03-02 10:00:00,6,2018,3,2,4,5.0,11.0,7.0,9.0,7.0,...,7.0,10.0,13.0,12.0,14.0,6.0,9.0,8.0,9.0,8.910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-08-13 13:30:00,20,2018,8,13,0,34.0,25.0,22.0,16.0,11.0,...,18.0,21.0,13.0,22.0,20.0,27.0,32.0,21.0,26.0,20.600
2018-08-13 13:40:00,12,2018,8,13,0,20.0,34.0,25.0,22.0,16.0,...,24.0,18.0,21.0,13.0,22.0,20.0,27.0,32.0,21.0,20.635
2018-08-13 13:50:00,6,2018,8,13,0,12.0,20.0,34.0,25.0,22.0,...,18.0,24.0,18.0,21.0,13.0,22.0,20.0,27.0,32.0,20.570
2018-08-13 14:00:00,13,2018,8,13,0,6.0,12.0,20.0,34.0,25.0,...,14.0,18.0,24.0,18.0,21.0,13.0,22.0,20.0,27.0,20.495


--------------------
| Сумма дубликатов |
--------------------
0


In [121]:
features_train = train.drop('num_orders', axis=1)
target_train = train['num_orders']

features_test = test.drop('num_orders', axis=1)
target_test = test['num_orders']

In [122]:
vif_data = pd.DataFrame()
vif_data["feature"] = features_train.columns

# вычисление VIF для каждого признака
vif_data["VIF"] = [variance_inflation_factor(features_train.values, i) \
                          for i in range(len(features_train.columns))]

print(vif_data)

KeyboardInterrupt: 

In [60]:
VIF_features_drop = vif_data.query('VIF > 9.0')['feature']
VIF_features_drop

0              year
1             month
104    rolling_mean
Name: feature, dtype: object

In [61]:
features_train_vif = features_train.drop(VIF_features_drop.values, axis=1)
features_test_vif = features_test.drop(VIF_features_drop.values, axis=1)

In [62]:
def metrics(target, prediction, prediction_proba):
    print("F-beta:",fbeta_score(target,prediction,average='macro',beta=2))
    print("AUC-ROC:", roc_auc_score(target, prediction_proba[:, 1]))
    
    fpr, tpr, thresholds = roc_curve(target, prediction_proba[:, 1])
    cm_matrix = pd.DataFrame(data=confusion_matrix(target, prediction), 
                                columns=['Actual Positive:1', 'Actual Negative:0'], 
                                index=['Predict Positive:1', 'Predict Negative:0'])
    tp = cm_matrix['Actual Positive:1']['Predict Positive:1']
    fp = cm_matrix['Actual Positive:1']['Predict Negative:0']
    fn = cm_matrix['Actual Negative:0']['Predict Positive:1']
    tn = cm_matrix['Actual Negative:0']['Predict Negative:0']
    print('Precision =', round(tp / (tp + fp), 3))
    print('Recall = ', round(tp / (tp + fn), 3))

    fig, axes = plt.subplots(ncols=2, figsize=(15, 5))

    sns.heatmap(cm_matrix, annot=True, fmt='d', 
                cmap=sns.diverging_palette(220, 10, as_cmap=True), ax=axes[0])
    axes[1].plot([0, 1], [0, 1], linestyle='--')
    axes[1].plot(fpr, tpr)


    axes[0].title.set_text('Матрица ошибок')
    axes[1].title.set_text('ROC-кривая')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.tight_layout()
    plt.show()

In [63]:
## Linear Regression

In [64]:
model_lr = LinearRegression(n_jobs=10, copy_X=True)

start_time_lr = time()

model_lr.fit(features_train_vif, target_train)

end_time_lr = time()

lr_model_time_fit = (end_time_lr-start_time_lr) / 60

In [65]:
print('Время обучение модели: {0:.5f} сек.'. format(lr_model_time_fit))

Время обучение модели: 0.00060 сек.


In [66]:
start_time_lr = time()
predicted = model_lr.predict(features_train_vif)
end_time_lr = time()

lr_model_time_predict = (end_time_lr-start_time_lr) / 60

In [67]:
model_lr_cv = LinearRegression(n_jobs=10, copy_X=True)

start_time_lr = time()
folds = KFold(n_splits = 4, shuffle = True, random_state = RANDOM_STATE)
scores = cross_val_score(model_lr_cv, features_train_vif, target_train, scoring='neg_root_mean_squared_error', cv=4)
end_time_lr = time()

lr_model_time_cv = (end_time_lr-start_time_lr) / 60

In [96]:
print("RMSE кросс валидации: {0:.2f}.". \
    format(scores.max() * -1))

RMSE кросс валидации: 18.49.


In [97]:
RMSE_LR = scores.max() * -1

## CatBoost

In [69]:
cat_features = features_train_vif.select_dtypes(include='object').columns.to_list()

In [70]:
start_time_cat = time()
cat_model = CatBoostRegressor(loss_function='RMSE',
                                      depth=10,
                                      learning_rate=0.1,
                                      random_state=RANDOM_STATE,
                                      task_type="CPU",
                                      cat_features=cat_features,
                                      ).fit(features_train_vif, target_train,
                                            plot=True, verbose=False)
end_time_cat = time()

cat_model_time_fit = (end_time_cat-start_time_cat) / 60

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [71]:
print('Время обучение модели: {0:.5f} сек.'. format(cat_model_time_fit))

Время обучение модели: 0.36860 сек.


In [72]:
start_time_lr = time()
predicted = cat_model.predict(features_train_vif)
end_time_lr = time()

cat_model_time_predict = (end_time_lr-start_time_lr) / 60

In [73]:
print('Время прогнозирования модели: {0:.5f} сек.'. format(cat_model_time_predict))

Время прогнозирования модели: 0.00007 сек.


In [74]:
def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        
    }

    model = CatBoostRegressor(**params, silent=True)
    model.fit(features_train, target_train, cat_features)
    predictions = model.predict(features_train)
    rmse = mean_squared_error(target_train, predictions, squared=True)
    return rmse

In [75]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[I 2024-02-27 16:19:04,591] A new study created in memory with name: no-name-4fcf164d-a219-4278-b23a-e9e37402444d
[I 2024-02-27 16:19:05,070] Trial 0 finished with value: 413.17439382007194 and parameters: {'learning_rate': 0.09980634924820068, 'depth': 1, 'min_data_in_leaf': 50}. Best is trial 0 with value: 413.17439382007194.
[I 2024-02-27 16:19:27,579] Trial 1 finished with value: 274.03754576956277 and parameters: {'learning_rate': 0.005390470982656902, 'depth': 10, 'min_data_in_leaf': 31}. Best is trial 1 with value: 274.03754576956277.
[W 2024-02-27 16:19:27,911] Trial 2 failed with parameters: {'learning_rate': 0.0032037530504706096, 'depth': 6, 'min_data_in_leaf': 43} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/Users/sergeidolin/anaconda3/envs/ds_practicum_env/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/n5/30cd9v_56sz_qrnt8mp_9df40000

KeyboardInterrupt: 

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value**0.5)

Best hyperparameters: {'learning_rate': 0.09691752753013556, 'depth': 10, 'min_data_in_leaf': 41}
Best RMSE: 0.20750604127619893


In [76]:
params_opt = {'learning_rate': 0.09691752753013556, 'depth': 10, 'min_data_in_leaf': 41}

In [77]:
cv_params = cat_model.get_params()
cv_params.update(study.best_params)
cv_data = cv(
    Pool(features_train_vif, target_train, cat_features),
    cv_params,
    plot=True, verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 22.02568947
bestIteration = 999

Training on fold [1/3]

bestTest = 23.68664335
bestIteration = 999

Training on fold [2/3]

bestTest = 23.25624209
bestIteration = 999



In [78]:
print('RMSE модели (кросс-валидация): {:.2f}'.format(
    np.min(cv_data['test-RMSE-mean'])
))

RMSE модели (кросс-валидация): 22.99


In [79]:
RMSE_CB = np.min(cv_data['test-RMSE-mean'])

## LithGBM

In [81]:
lgtrain = lgb.Dataset(features_train_vif,target_train ,feature_name = "auto")

In [82]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,
    "num_iterations": 1000
}

start_time_lgb = time()
lgb_model = lgb.LGBMRegressor(**hyper_params).fit(features_train_vif, target_train)
end_time_lgb = time()

lgb_model_time_fit = (end_time_lgb-start_time_lgb) / 60





In [83]:
print('Время обучение модели: {0:.5f} сек.'. format(lgb_model_time_fit))

Время обучение модели: 0.05841 сек.


In [85]:
start_time_lgb = time()
lgb_pred = lgb_model.predict(features_train_vif, num_iteration=lgb_model.best_iteration_)
end_time_lgb = time()

lgb_model_time_pred = (end_time_lgb-start_time_lgb) / 60



In [86]:
print('Время прогнозирования модели: {0:.5f} сек.'. format(lgb_model_time_pred))

Время прогнозирования модели: 0.00378 сек.


In [88]:
def objective_lgb(trial):
    params = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'feature_fraction': trial.suggest_float("feature_fraction", 0.1, 0.9, log=True),
        'bagging_fraction': trial.suggest_float("bagging_fraction", 0.1, 0.9, log=True),
        'bagging_freq': trial.suggest_int("bagging_freq", 1, 10),
        'verbose': 0,
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "num_leaves": trial.suggest_int("num_leaves", 64, 256),
        "num_iterations": 1000
    }

    model = lgb.LGBMRegressor(**params, silent=True)
    model.fit(features_train_vif, target_train)
    predictions = model.predict(features_train_vif)
    rmse = mean_squared_error(target_train, predictions, squared=True)
    return rmse

In [89]:
study = optuna.create_study(direction="minimize")
study.optimize(objective_lgb, n_trials=50)

[I 2024-02-27 16:30:53,187] A new study created in memory with name: no-name-9d4c74ea-3da6-4fe8-849e-be6b3a97e047




[I 2024-02-27 16:30:53,639] Trial 0 finished with value: 593.0167139851204 and parameters: {'learning_rate': 0.0017375740524235527, 'feature_fraction': 0.1489671425684063, 'bagging_fraction': 0.29358221555802666, 'bagging_freq': 6, 'max_depth': 4, 'num_leaves': 94}. Best is trial 0 with value: 593.0167139851204.




[I 2024-02-27 16:30:53,942] Trial 1 finished with value: 350.90791465325077 and parameters: {'learning_rate': 0.016356075091709943, 'feature_fraction': 0.14212062910790407, 'bagging_fraction': 0.10765144974446382, 'bagging_freq': 6, 'max_depth': 4, 'num_leaves': 255}. Best is trial 1 with value: 350.90791465325077.








[I 2024-02-27 16:30:55,281] Trial 2 finished with value: 15.659720896278133 and parameters: {'learning_rate': 0.0856482798939135, 'feature_fraction': 0.5250523046789666, 'bagging_fraction': 0.20231735449990815, 'bagging_freq': 1, 'max_depth': 8, 'num_leaves': 151}. Best is trial 2 with value: 15.659720896278133.








[I 2024-02-27 16:30:55,515] Trial 3 finished with value: 927.1833826292478 and parameters: {'learning_rate': 0.0012323855887809724, 'feature_fraction': 0.2708849614119421, 'bagging_fraction': 0.3903964499277411, 'bagging_freq': 2, 'max_depth': 1, 'num_leaves': 74}. Best is trial 2 with value: 15.659720896278133.




[I 2024-02-27 16:30:56,288] Trial 4 finished with value: 630.08699243749 and parameters: {'learning_rate': 0.0013193848054232414, 'feature_fraction': 0.26629046644822946, 'bagging_fraction': 0.11808360947149751, 'bagging_freq': 10, 'max_depth': 7, 'num_leaves': 184}. Best is trial 2 with value: 15.659720896278133.




[I 2024-02-27 16:30:56,559] Trial 5 finished with value: 648.5041381406788 and parameters: {'learning_rate': 0.0018310753829833683, 'feature_fraction': 0.24008786635938073, 'bagging_fraction': 0.3109727022129628, 'bagging_freq': 9, 'max_depth': 2, 'num_leaves': 167}. Best is trial 2 with value: 15.659720896278133.








[I 2024-02-27 16:30:57,294] Trial 6 finished with value: 535.4104793897169 and parameters: {'learning_rate': 0.0019657495210959356, 'feature_fraction': 0.18235256227206728, 'bagging_fraction': 0.1349728581050651, 'bagging_freq': 10, 'max_depth': 7, 'num_leaves': 182}. Best is trial 2 with value: 15.659720896278133.




[I 2024-02-27 16:30:57,490] Trial 7 finished with value: 469.8266272069755 and parameters: {'learning_rate': 0.026802750900146187, 'feature_fraction': 0.20387199043639032, 'bagging_fraction': 0.41600418072901363, 'bagging_freq': 8, 'max_depth': 1, 'num_leaves': 139}. Best is trial 2 with value: 15.659720896278133.




[I 2024-02-27 16:30:58,743] Trial 8 finished with value: 3.7340638053542126 and parameters: {'learning_rate': 0.09397372663737612, 'feature_fraction': 0.4699816178037992, 'bagging_fraction': 0.41065118606820555, 'bagging_freq': 8, 'max_depth': 7, 'num_leaves': 69}. Best is trial 8 with value: 3.7340638053542126.




[I 2024-02-27 16:30:59,078] Trial 9 finished with value: 652.4025433406213 and parameters: {'learning_rate': 0.001633257702272203, 'feature_fraction': 0.35494647451187916, 'bagging_fraction': 0.19438608181779982, 'bagging_freq': 2, 'max_depth': 2, 'num_leaves': 131}. Best is trial 8 with value: 3.7340638053542126.








[I 2024-02-27 16:31:01,781] Trial 10 finished with value: 0.3465054132893705 and parameters: {'learning_rate': 0.0771950269806256, 'feature_fraction': 0.865239284617955, 'bagging_fraction': 0.7663555339636157, 'bagging_freq': 7, 'max_depth': 10, 'num_leaves': 229}. Best is trial 10 with value: 0.3465054132893705.




[I 2024-02-27 16:31:04,377] Trial 11 finished with value: 0.15074084671746557 and parameters: {'learning_rate': 0.0907518616031401, 'feature_fraction': 0.8470442045999809, 'bagging_fraction': 0.789253821260621, 'bagging_freq': 7, 'max_depth': 10, 'num_leaves': 249}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:07,409] Trial 12 finished with value: 4.415511205195841 and parameters: {'learning_rate': 0.03685232985717546, 'feature_fraction': 0.8798211838776712, 'bagging_fraction': 0.8982945940938163, 'bagging_freq': 4, 'max_depth': 10, 'num_leaves': 251}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:12,461] Trial 13 finished with value: 78.81416810158771 and parameters: {'learning_rate': 0.005809627868173902, 'feature_fraction': 0.8569551336381922, 'bagging_fraction': 0.8859980156097166, 'bagging_freq': 5, 'max_depth': 10, 'num_leaves': 219}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:14,524] Trial 14 finished with value: 3.694319676787289 and parameters: {'learning_rate': 0.05265942326430979, 'feature_fraction': 0.6397276270733312, 'bagging_fraction': 0.6020866488940508, 'bagging_freq': 7, 'max_depth': 9, 'num_leaves': 219}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:17,333] Trial 15 finished with value: 90.49469444689308 and parameters: {'learning_rate': 0.008652001999149908, 'feature_fraction': 0.6745222090312717, 'bagging_fraction': 0.6293453374192653, 'bagging_freq': 4, 'max_depth': 9, 'num_leaves': 219}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:18,151] Trial 16 finished with value: 27.302448757189797 and parameters: {'learning_rate': 0.054450351418787964, 'feature_fraction': 0.37880360895234194, 'bagging_fraction': 0.6027138988511594, 'bagging_freq': 7, 'max_depth': 5, 'num_leaves': 235}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:20,612] Trial 17 finished with value: 26.084939749058407 and parameters: {'learning_rate': 0.019693998464424285, 'feature_fraction': 0.6644217324443126, 'bagging_fraction': 0.7031023792898156, 'bagging_freq': 8, 'max_depth': 9, 'num_leaves': 199}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:21,924] Trial 18 finished with value: 212.95168048398142 and parameters: {'learning_rate': 0.005352383399685785, 'feature_fraction': 0.10242920346981534, 'bagging_fraction': 0.4924767288525625, 'bagging_freq': 5, 'max_depth': 10, 'num_leaves': 233}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:23,614] Trial 19 finished with value: 2.7918901333731703 and parameters: {'learning_rate': 0.056454925547513815, 'feature_fraction': 0.5251513976029006, 'bagging_fraction': 0.7012529689561584, 'bagging_freq': 7, 'max_depth': 8, 'num_leaves': 202}. Best is trial 11 with value: 0.15074084671746557.








[I 2024-02-27 16:31:25,165] Trial 20 finished with value: 146.66977182782145 and parameters: {'learning_rate': 0.012272728308225607, 'feature_fraction': 0.7674339191236877, 'bagging_fraction': 0.49803400485895294, 'bagging_freq': 4, 'max_depth': 6, 'num_leaves': 240}. Best is trial 11 with value: 0.15074084671746557.




[I 2024-02-27 16:31:26,793] Trial 21 finished with value: 3.0651755402952143 and parameters: {'learning_rate': 0.056840970961048835, 'feature_fraction': 0.5081518233414208, 'bagging_fraction': 0.7435635809471596, 'bagging_freq': 7, 'max_depth': 8, 'num_leaves': 201}. Best is trial 11 with value: 0.15074084671746557.








[I 2024-02-27 16:31:28,450] Trial 22 finished with value: 0.244187530772195 and parameters: {'learning_rate': 0.09900815476723601, 'feature_fraction': 0.5760798191138116, 'bagging_fraction': 0.7668889380361653, 'bagging_freq': 9, 'max_depth': 8, 'num_leaves': 203}. Best is trial 11 with value: 0.15074084671746557.








[I 2024-02-27 16:31:30,803] Trial 23 finished with value: 0.14000560250226532 and parameters: {'learning_rate': 0.09442025298586683, 'feature_fraction': 0.7280369526830297, 'bagging_fraction': 0.8171980010942692, 'bagging_freq': 9, 'max_depth': 10, 'num_leaves': 224}. Best is trial 23 with value: 0.14000560250226532.




[I 2024-02-27 16:31:32,487] Trial 24 finished with value: 12.788564467507094 and parameters: {'learning_rate': 0.03524984128909967, 'feature_fraction': 0.39276201814191697, 'bagging_fraction': 0.5137571834149225, 'bagging_freq': 9, 'max_depth': 9, 'num_leaves': 252}. Best is trial 23 with value: 0.14000560250226532.




[I 2024-02-27 16:31:34,195] Trial 25 finished with value: 0.24715581118921923 and parameters: {'learning_rate': 0.09820644481393534, 'feature_fraction': 0.61792023531736, 'bagging_fraction': 0.8734903818540034, 'bagging_freq': 9, 'max_depth': 8, 'num_leaves': 184}. Best is trial 23 with value: 0.14000560250226532.








[I 2024-02-27 16:31:36,886] Trial 26 finished with value: 8.831159699741503 and parameters: {'learning_rate': 0.03423405248500352, 'feature_fraction': 0.7432022998283796, 'bagging_fraction': 0.5650925511156895, 'bagging_freq': 10, 'max_depth': 10, 'num_leaves': 209}. Best is trial 23 with value: 0.14000560250226532.




[I 2024-02-27 16:31:38,899] Trial 27 finished with value: 0.8419913308324372 and parameters: {'learning_rate': 0.06845988766844238, 'feature_fraction': 0.5806596673133612, 'bagging_fraction': 0.7636894060678157, 'bagging_freq': 9, 'max_depth': 9, 'num_leaves': 168}. Best is trial 23 with value: 0.14000560250226532.




[I 2024-02-27 16:31:39,886] Trial 28 finished with value: 51.199434564531884 and parameters: {'learning_rate': 0.04422896441166193, 'feature_fraction': 0.43215840451318216, 'bagging_fraction': 0.32013923696015534, 'bagging_freq': 8, 'max_depth': 6, 'num_leaves': 243}. Best is trial 23 with value: 0.14000560250226532.








[I 2024-02-27 16:31:41,687] Trial 29 finished with value: 74.08335693368944 and parameters: {'learning_rate': 0.023390785817338423, 'feature_fraction': 0.7480910600709153, 'bagging_fraction': 0.2457648314831437, 'bagging_freq': 6, 'max_depth': 9, 'num_leaves': 121}. Best is trial 23 with value: 0.14000560250226532.




[I 2024-02-27 16:31:44,143] Trial 30 finished with value: 249.49782660866782 and parameters: {'learning_rate': 0.0036010533253703486, 'feature_fraction': 0.577822067896848, 'bagging_fraction': 0.4517528744112424, 'bagging_freq': 9, 'max_depth': 8, 'num_leaves': 99}. Best is trial 23 with value: 0.14000560250226532.




[I 2024-02-27 16:31:45,898] Trial 31 finished with value: 0.08731030141422554 and parameters: {'learning_rate': 0.09091284570172832, 'feature_fraction': 0.6033991422753926, 'bagging_fraction': 0.8440066648870547, 'bagging_freq': 9, 'max_depth': 7, 'num_leaves': 186}. Best is trial 31 with value: 0.08731030141422554.




[I 2024-02-27 16:31:46,658] Trial 32 finished with value: 13.045438342052615 and parameters: {'learning_rate': 0.0718589229606531, 'feature_fraction': 0.3259739304566716, 'bagging_fraction': 0.6615887182524721, 'bagging_freq': 10, 'max_depth': 5, 'num_leaves': 192}. Best is trial 31 with value: 0.08731030141422554.








[I 2024-02-27 16:31:48,695] Trial 33 finished with value: 0.04343176575415886 and parameters: {'learning_rate': 0.09774515812907832, 'feature_fraction': 0.7556892983168039, 'bagging_fraction': 0.7833320551140095, 'bagging_freq': 8, 'max_depth': 7, 'num_leaves': 212}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:31:50,375] Trial 34 finished with value: 2.1932139592298596 and parameters: {'learning_rate': 0.0680914103049404, 'feature_fraction': 0.7691794718607177, 'bagging_fraction': 0.8287869350540611, 'bagging_freq': 8, 'max_depth': 6, 'num_leaves': 215}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:31:52,028] Trial 35 finished with value: 16.79639892708816 and parameters: {'learning_rate': 0.040858015994765226, 'feature_fraction': 0.7086364585044391, 'bagging_fraction': 0.5526929603741615, 'bagging_freq': 6, 'max_depth': 7, 'num_leaves': 224}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:31:53,058] Trial 36 finished with value: 141.54757966969848 and parameters: {'learning_rate': 0.027764198711506118, 'feature_fraction': 0.8908646199650714, 'bagging_fraction': 0.6539593238614474, 'bagging_freq': 8, 'max_depth': 4, 'num_leaves': 174}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:31:53,616] Trial 37 finished with value: 98.58363802771326 and parameters: {'learning_rate': 0.0790876003893839, 'feature_fraction': 0.4505867646089104, 'bagging_fraction': 0.8009561981687341, 'bagging_freq': 10, 'max_depth': 3, 'num_leaves': 256}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:31:55,488] Trial 38 finished with value: 11.38490895508429 and parameters: {'learning_rate': 0.043709468533331734, 'feature_fraction': 0.8103648408985813, 'bagging_fraction': 0.6747647951634933, 'bagging_freq': 9, 'max_depth': 7, 'num_leaves': 157}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:31:56,341] Trial 39 finished with value: 59.21647153347435 and parameters: {'learning_rate': 0.06476494070540519, 'feature_fraction': 0.5217537095306747, 'bagging_fraction': 0.2506903788235142, 'bagging_freq': 8, 'max_depth': 5, 'num_leaves': 191}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:31:57,149] Trial 40 finished with value: 45.992606078010276 and parameters: {'learning_rate': 0.09923093996914675, 'feature_fraction': 0.28790832601734945, 'bagging_fraction': 0.14327598616137244, 'bagging_freq': 7, 'max_depth': 7, 'num_leaves': 148}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:31:58,835] Trial 41 finished with value: 0.24755829131727303 and parameters: {'learning_rate': 0.09959004990173087, 'feature_fraction': 0.5693875288428524, 'bagging_fraction': 0.7794327777795844, 'bagging_freq': 9, 'max_depth': 8, 'num_leaves': 210}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:32:00,381] Trial 42 finished with value: 1.0919490320603633 and parameters: {'learning_rate': 0.07887255076283269, 'feature_fraction': 0.6724341856225112, 'bagging_fraction': 0.8120962169091962, 'bagging_freq': 9, 'max_depth': 6, 'num_leaves': 243}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:32:02,223] Trial 43 finished with value: 4.828370259430995 and parameters: {'learning_rate': 0.04800369276157852, 'feature_fraction': 0.6189400717476461, 'bagging_fraction': 0.7269675109562168, 'bagging_freq': 10, 'max_depth': 8, 'num_leaves': 177}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:32:03,540] Trial 44 finished with value: 2.3517960373669284 and parameters: {'learning_rate': 0.08218408103679568, 'feature_fraction': 0.4847098528183585, 'bagging_fraction': 0.5563119854700035, 'bagging_freq': 8, 'max_depth': 7, 'num_leaves': 229}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:32:04,869] Trial 45 finished with value: 8.769849598091485 and parameters: {'learning_rate': 0.06214575585992205, 'feature_fraction': 0.2415053465585919, 'bagging_fraction': 0.33932350806329087, 'bagging_freq': 1, 'max_depth': 10, 'num_leaves': 191}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:32:07,216] Trial 46 finished with value: 0.2737311982297975 and parameters: {'learning_rate': 0.08552763370543673, 'feature_fraction': 0.8128152942513068, 'bagging_fraction': 0.8516949026989743, 'bagging_freq': 10, 'max_depth': 9, 'num_leaves': 209}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:32:10,067] Trial 47 finished with value: 314.87272330564247 and parameters: {'learning_rate': 0.0025436269312460527, 'feature_fraction': 0.7053633427007283, 'bagging_fraction': 0.35980412118924704, 'bagging_freq': 8, 'max_depth': 10, 'num_leaves': 228}. Best is trial 33 with value: 0.04343176575415886.




[I 2024-02-27 16:32:11,751] Trial 48 finished with value: 92.89241436376462 and parameters: {'learning_rate': 0.013981442272199666, 'feature_fraction': 0.5594044122253788, 'bagging_fraction': 0.6047594160669377, 'bagging_freq': 9, 'max_depth': 7, 'num_leaves': 198}. Best is trial 33 with value: 0.04343176575415886.








[I 2024-02-27 16:32:12,810] Trial 49 finished with value: 14.192163793103918 and parameters: {'learning_rate': 0.031470428458796613, 'feature_fraction': 0.16198211231652215, 'bagging_fraction': 0.8987786025999992, 'bagging_freq': 7, 'max_depth': 8, 'num_leaves': 245}. Best is trial 33 with value: 0.04343176575415886.




In [90]:
print('Best hyperparameters:', study.best_params)

Best hyperparameters: {'learning_rate': 0.09774515812907832, 'feature_fraction': 0.7556892983168039, 'bagging_fraction': 0.7833320551140095, 'bagging_freq': 8, 'max_depth': 7, 'num_leaves': 212}


In [91]:
params_opt_lgb = {'learning_rate': 0.09774515812907832, 'feature_fraction': 0.7556892983168039, 'bagging_fraction': 0.7833320551140095, 'bagging_freq': 8, 'max_depth': 7, 'num_leaves': 212}

In [92]:
start_time_cb = time()

params_opt_lgb.update({'metric': 'rmse',})
lgbm_cv_model = lgb.cv(params_opt_lgb,
                       train_set=lgtrain,
                       nfold=3)

end_time_cb = time()
    
lgbm_model_time_cv = (end_time_cb-start_time_cb) / 60



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18731
[LightGBM] [Info] Number of data points in the train set: 2516, number of used features: 102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18731
[LightGBM] [Info] Number of data points in the train set: 2516, number of used features: 102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18731
[LightGBM] [Info] Number of data points in the train set: 2516, number of used features: 102
[LightGBM] [Info] Start training from score 79.645072
[LightGBM] [Info] Start training from score 79.403816
[LightGBM] [Info] Star

In [93]:
pd.DataFrame(lgbm_cv_model)

Unnamed: 0,valid rmse-mean,valid rmse-stdv
0,36.443774,0.634140
1,34.512032,0.699364
2,32.880755,0.666584
3,31.425192,0.706029
4,30.123249,0.751103
...,...,...
95,22.039299,0.881823
96,22.032298,0.891001
97,22.029712,0.894373
98,22.021935,0.893774


In [94]:
print('RMSE модели (кросс-валидация): {:.2f}'.format(
    min(pd.DataFrame(lgbm_cv_model)['valid rmse-mean'])
))

RMSE модели (кросс-валидация): 22.02


In [95]:
RMSE_LGBM = min(pd.DataFrame(lgbm_cv_model)['valid rmse-mean'])

In [98]:
results = {
    'Model' : ['LinearRegression', 'CatBoost', 'LightGBM'],
    'Time Fit, sec' : pd.Series([lr_model_time_fit, cat_model_time_fit, lgb_model_time_fit]),
    'Time Pred' :pd.Series([lr_model_time_predict, cat_model_time_predict, lgb_model_time_pred]),
    'RMSE CV' :pd.Series([RMSE_LR, RMSE_CB, RMSE_LGBM])
    }
display(pd.DataFrame(results))

Unnamed: 0,Model,"Time Fit, sec",Time Pred,RMSE CV
0,LinearRegression,0.000601,0.000654,18.493066
1,CatBoost,0.368604,7.1e-05,22.989525
2,LightGBM,0.058413,0.003782,22.016649


In [100]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(features_train_vif, target_train)

In [103]:
lgb_model = lgb.LGBMRegressor(**params_opt_lgb).fit(features_train_vif, target_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001870 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18731
[LightGBM] [Info] Number of data points in the train set: 3774, number of used features: 102
[LightGBM] [Info] Start training from score 79.516958


In [104]:
print("RMSE dummy-модели: {0:.2f}.". \
      format((mean_squared_error(target_test, dummy_regr.predict(features_test_vif)))**0.5))
print("Средняя стоимость заказа (DUMMY): {0:.2f}". \
      format(dummy_regr.predict(features_test_vif).mean()))



print("RMSE CatBoost-модели: {0:.2f}.". \
      format((mean_squared_error(target_test, lgb_model.predict(features_test_vif)))**0.5))
print("Средняя стоимость заказа (CatBoost): {0:.2f}". \
      format(cat_model.predict(features_test_vif).mean()))

RMSE dummy-модели: 83.86.
Средняя стоимость заказа (DUMMY): 79.52
RMSE CatBoost-модели: 38.60.
Средняя стоимость заказа (CatBoost): 119.07
