In [24]:
import pandas as pd
import numpy as np
from datetime import datetime
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from feature_engine.timeseries.forecasting import LagFeatures
from sklearn.metrics import root_mean_squared_error

# Never truncate the column list when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None

In [65]:
def get_metrics(list_of_tuples_of_models, train_data, valid_data, target_col):
    """Tabulate train/validation RMSE for a collection of fitted models.

    Parameters
    ----------
    list_of_tuples_of_models : iterable of (model, features, desc)
        ``model`` must expose ``predict``; ``features`` is used to select the
        input columns from each data set; ``desc`` is copied verbatim into
        the result.
    train_data, valid_data
        Column-indexable data sets (DataFrames in this notebook) holding both
        the feature columns and ``target_col``.
    target_col
        Label of the column with the true values.

    Returns
    -------
    pandas.DataFrame
        One row per input tuple with the columns ``model`` (the model's
        class), ``desc``, ``rmse_train``, ``rmse_valid`` and ``diff_rmse``
        (``rmse_train - rmse_valid``).
    """
    model_classes = []
    descriptions = []
    train_scores = []
    valid_scores = []

    for model, features, desc in list_of_tuples_of_models:
        model_classes.append(model.__class__)
        descriptions.append(desc)

        # Error on the data set passed as ``train_data``.
        y_train = train_data[target_col]
        train_scores.append(
            root_mean_squared_error(y_train, model.predict(train_data[features]))
        )

        # Error on the data set passed as ``valid_data``.
        y_valid = valid_data[target_col]
        valid_scores.append(
            root_mean_squared_error(y_valid, model.predict(valid_data[features]))
        )

    # Train minus validation: a negative gap means the validation error is larger.
    rmse_gaps = []
    for train_rmse, valid_rmse in zip(train_scores, valid_scores):
        rmse_gaps.append(train_rmse - valid_rmse)

    return pd.DataFrame({
        'model': model_classes,
        'desc': descriptions,
        'rmse_train': train_scores,
        'rmse_valid': valid_scores,
        'diff_rmse': rmse_gaps,
    })


# Boosting

In [56]:
# Read the aggregated train/validation splits, each ordered by ascending 'year_month_day'.
df_train = (
    pd.read_parquet('../data/aggregated/full_train.parquet.gzip')
    .sort_values(by='year_month_day')
)
df_valid = (
    pd.read_parquet('../data/aggregated/full_valid.parquet.gzip')
    .sort_values(by='year_month_day')
)

In [57]:
# Columns to lag: everything except 'year_month_day' (so the target is lagged too).
variables = df_train.columns.drop('year_month_day').tolist()

In [58]:
# Lag every selected variable by 1 through 7 periods; missing values
# introduced by the shift are replaced with -1.
lag = LagFeatures(
    variables=variables,
    periods=list(range(1, 8)),
    fill_value=-1,
)

In [59]:
# Fit the lag transformer on the training frame only ...
df_train_lag = lag.fit_transform(df_train)
# ... and apply the already-fitted transformer to validation. The original
# called fit_transform here, which refits the transformer on validation data.
df_valid_lag = lag.transform(df_valid)
# NOTE(review): lags are computed inside each frame separately, so the leading
# rows of df_valid_lag presumably carry the fill value (-1) rather than the last
# training observations. Confirm this is intended before comparing the
# train/validation errors of the lag-based models.

## LGBM

In [60]:
# TODO: Tune
# Both regressors are built from the same hyperparameters; they only differ in
# the feature set they are fitted on further down.
lgbm_params = dict(
    boosting_type='gbdt',
    max_depth=5,
    n_estimators=200,
    learning_rate=0.01,
    random_state=777,
    min_child_samples=3,
)

clf = LGBMRegressor(**lgbm_params)   # fitted below on the original columns
clf_ = LGBMRegressor(**lgbm_params)  # fitted below on the lag-augmented columns

In [61]:
# Same target for both models; clf is fitted on the original frame,
# clf_ on the lag-augmented frame.
clf.fit(
    X=df_train.drop(['year_month_day', 'qty_travels'], axis=1),
    y=df_train['qty_travels'],
)
clf_.fit(
    X=df_train_lag.drop(['year_month_day', 'qty_travels'], axis=1),
    y=df_train_lag['qty_travels'],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 30, number of used features: 8
[LightGBM] [Info] Start training from score 96120.133333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 729
[LightGBM] [Info] Number of data points in the train set: 30, number of used features: 71
[LightGBM] [Info] Start training from score 96120.133333


In [62]:
# Two identically configured decision trees: one for the original columns,
# one for the lag-augmented columns (see the fit cell that follows).
tree_params = dict(random_state=777, max_depth=20, min_samples_leaf=2)

clf_2 = DecisionTreeRegressor(**tree_params)
clf_2_ = DecisionTreeRegressor(**tree_params)

In [63]:
# Same target for both trees; clf_2 is fitted on the original frame,
# clf_2_ on the lag-augmented frame.
clf_2.fit(
    X=df_train.drop(['year_month_day', 'qty_travels'], axis=1),
    y=df_train['qty_travels'],
)
clf_2_.fit(
    X=df_train_lag.drop(['year_month_day', 'qty_travels'], axis=1),
    y=df_train_lag['qty_travels'],
)

In [67]:
# Score all four fitted models on the lag-augmented frames. Each model is
# restricted to the columns it was fitted on via ``feature_names_in_``, so the
# same two frames are used for the models without lag features as well.
get_metrics(
    list_of_tuples_of_models=[
        (estimator, estimator.feature_names_in_, label)
        for estimator, label in (
            (clf, 'no lag features'),
            (clf_, 'with lag features'),
            (clf_2, 'no lag features'),
            (clf_2_, 'with lag features'),
        )
    ],
    train_data=df_train_lag,
    valid_data=df_valid_lag,
    target_col='qty_travels',
)

Unnamed: 0,model,desc,rmse_train,rmse_valid,diff_rmse
0,<class 'lightgbm.sklearn.LGBMRegressor'>,no lag features,2663.380477,3685.26076,-1021.880283
1,<class 'lightgbm.sklearn.LGBMRegressor'>,with lag features,2221.286328,7215.912097,-4994.625769
2,<class 'sklearn.tree._classes.DecisionTreeRegr...,no lag features,2110.670768,4747.537283,-2636.866515
3,<class 'sklearn.tree._classes.DecisionTreeRegr...,with lag features,1653.478077,6031.111856,-4377.63378


# Linear Regression