In [1]:

import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')


In [2]:
from datetime import datetime, timezone
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(df, cutoff_date=datetime(2022, 6, 1, 0, 0, 0, tzinfo=timezone.utc), target_column_name='target_rides_next_hour')

print(f'{X_train.shape = }')
print(f'{y_train.shape = }')
print(f'{X_test.shape = }')
print(f'{y_test.shape = }')


X_train.shape = (31980, 674)
y_train.shape = (31980,)
X_test.shape = (55640, 674)
y_test.shape = (55640,)


In [4]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:

    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits = 4)
    scores = []
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)
        scores.append(mae)

    return np.array(scores).mean()


In [5]:
study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 5)

[I 2025-05-04 13:36:11,898] A new study created in memory with name: no-name-fd637bf5-9787-4c8e-8a13-04fb2314bd92
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['average_rides_last_4_weeks'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['average_rides_last_4_weeks'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['average_rides_la

In [6]:
best_params = study.best_trial.params
print(f'{best_params = }')

best_params = {'num_leaves': 160, 'feature_fraction': 0.9307134958795522, 'bagging_fraction': 0.6794068575721779, 'min_child_samples': 27}


In [7]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [8]:
prediction = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, prediction)
print(f'Test MAE: {test_mae:.2f}')

Test MAE: 2.46


In [10]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets = y_test,
    example_id=100,
    predictions = pd.Series(prediction)
)



'H' is deprecated and will be removed in a future version, please use 'h' instead.

