In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_next_27_hour,rides_next_28_hour,rides_next_29_hour,rides_next_30_hour,rides_next_31_hour,rides_next_32_hour,rides_next_33_hour,rides_next_34_hour,rides_next_35_hour,rides_next_36_hour
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,10.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,1.0,2.0,5.0,2.0,2.0,2.0
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,2.0,0.0,5.0,4.0,1.0,8.0
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,2.0,1.0,1.0,0.0,1.0,1.0,7.0,2.0,2.0,3.0
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0
111212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,3.0,0.0
111213,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0
111214,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0


In [4]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    targets_columns_names=[c for c in df.columns if c.startswith('rides_next_')]
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(40713, 674)
y_train.shape=(40713, 36)
X_test.shape=(70503, 674)
y_test.shape=(70503, 36)


In [12]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [14]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-07-17 12:39:50,442] A new study created in memory with name: no-name-1c17f8e9-34cb-4b4b-81f8-943375aff886


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the 

[I 2024-07-17 12:46:18,662] Trial 0 finished with value: 0.533186114866515 and parameters: {'num_leaves': 147, 'feature_fraction': 0.33332047584078583, 'bagging_fraction': 0.3491548051843033, 'min_child_samples': 86}. Best is trial 0 with value: 0.533186114866515.


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the 

[I 2024-07-17 12:51:11,819] Trial 1 finished with value: 0.5339997149394197 and parameters: {'num_leaves': 75, 'feature_fraction': 0.30704044988391205, 'bagging_fraction': 0.29198803455079414, 'min_child_samples': 7}. Best is trial 0 with value: 0.533186114866515.


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the 

[I 2024-07-17 12:57:32,043] Trial 2 finished with value: 0.5339056303561166 and parameters: {'num_leaves': 180, 'feature_fraction': 0.2786006290875527, 'bagging_fraction': 0.8316184441692502, 'min_child_samples': 3}. Best is trial 0 with value: 0.533186114866515.


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the 

[I 2024-07-17 13:06:37,686] Trial 3 finished with value: 0.5325533037852392 and parameters: {'num_leaves': 216, 'feature_fraction': 0.40714475603393296, 'bagging_fraction': 0.8217872413027798, 'min_child_samples': 32}. Best is trial 3 with value: 0.5325533037852392.


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the 

[I 2024-07-17 13:16:14,975] Trial 4 finished with value: 0.5343043013401796 and parameters: {'num_leaves': 183, 'feature_fraction': 0.6388096266656496, 'bagging_fraction': 0.3133340259110406, 'min_child_samples': 89}. Best is trial 3 with value: 0.5325533037852392.


In [18]:
best_params = study.best_trial.params
print(f'Best params: {best_params}')

Best params: {'num_leaves': 216, 'feature_fraction': 0.40714475603393296, 'bagging_fraction': 0.8217872413027798, 'min_child_samples': 32}


In [19]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)  

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

[LightGBM] [Info] Total Bins 11620
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 675
[LightGBM] [Info] Start training from score 0.416796
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

[LightGBM] [Info] Total Bins 11620
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 675
[LightGBM] [Info] Start training from score 0.287181
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

[LightGBM] [Info] Total Bins 11620
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 675
[LightGBM] [Info] Start training from score 0.155110
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated

In [20]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'Test MAE: {test_mae:.4f}')

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the 