In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')

df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,1.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,3.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,10.0,2022-01-31,4,3.0
3,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,1.0,1.0,3.0,2022-02-01,4,3.0
4,1.0,1.0,1.0,1.0,1.0,1.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,3.0,4.0,4.0,3.0,2022-02-02,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88289,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-27,199,1.0
88290,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-28,199,1.0
88291,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-29,199,1.0
88292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-30,199,1.0


In [3]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(df, 
                                                    cutoff_date =  datetime(2022, 6, 1, 0, 0, 0),
                                                    target_column_name='target_rides_next_hour')

print(f'{X_train.shape = }')
print(f'{y_train.shape = }')
print(f'{X_test.shape = }')
print(f'{y_test.shape = }')

X_train.shape = (32226, 674)
y_train.shape = (32226,)
X_test.shape = (56068, 674)
y_test.shape = (56068,)


In [7]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:

    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits = 4)
    scores = []
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)
        scores.append(mae)

    return np.array(scores).mean()


In [8]:
study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 5)

[I 2024-12-25 17:08:55,978] A new study created in memory with name: no-name-5cd7f565-b96f-4798-b191-fbcb044a7c45
[I 2024-12-25 17:09:10,097] Trial 0 finished with value: 1.463365300616275 and parameters: {'num_leaves': 136, 'feature_fraction': 0.2724149434192722, 'bagging_fraction': 0.361689417858555, 'min_child_samples': 24}. Best is trial 0 with value: 1.463365300616275.
[I 2024-12-25 17:09:27,762] Trial 1 finished with value: 1.5180641969911326 and parameters: {'num_leaves': 60, 'feature_fraction': 0.769984867999453, 'bagging_fraction': 0.8759284031985877, 'min_child_samples': 49}. Best is trial 0 with value: 1.463365300616275.
[I 2024-12-25 17:09:39,089] Trial 2 finished with value: 1.3897491681807264 and parameters: {'num_leaves': 180, 'feature_fraction': 0.25910223027700435, 'bagging_fraction': 0.5443423961005954, 'min_child_samples': 94}. Best is trial 2 with value: 1.3897491681807264.
[I 2024-12-25 17:10:09,578] Trial 3 finished with value: 1.3644597013219146 and parameters: {

In [9]:
best_params = study.best_trial.params
print(f'{best_params = }')

best_params = {'num_leaves': 252, 'feature_fraction': 0.8247812183678513, 'bagging_fraction': 0.7659316288447655, 'min_child_samples': 98}


In [10]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [11]:
prediction = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, prediction)
print(f'Test MAE: {test_mae:.2f}')

Test MAE: 2.48


In [12]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets = y_test,
    example_id=2979,
    predictions = pd.Series(prediction)
)

In [14]:
plot_one_sample(
    features=X_test,
    targets = y_test,
    example_id=1979,
    predictions = pd.Series(prediction)
)