In [5]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import sys
sys.path.insert(0,'E:/repos/my_projects/taxi_demand_project')
from src.paths import TRANSFORMED_DATA_DIR
import pandas as pd
from src.data_split import train_test_split

In [2]:
ts_data = pd.read_parquet(TRANSFORMED_DATA_DIR/'transformed_tabular_data.parquet')
ts_data

Unnamed: 0,rides_672_hr_before,rides_671_hr_before,rides_670_hr_before,rides_669_hr_before,rides_668_hr_before,rides_667_hr_before,rides_666_hr_before,rides_665_hr_before,rides_664_hr_before,rides_663_hr_before,...,rides_7_hr_before,rides_6_hr_before,rides_5_hr_before,rides_4_hr_before,rides_3_hr_before,rides_2_hr_before,rides_1_hr_before,PU_location,PU_hour,target_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,4,2022-01-29,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,4,2022-01-30,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,4,2022-01-31,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,4,2022-02-01,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,4,2022-02-02,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,199,2022-12-27,0.0
88290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,199,2022-12-28,0.0
88291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,199,2022-12-29,0.0
88292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,199,2022-12-30,0.0


In [3]:
X_train,X_test,y_train,y_test= train_test_split(ts_data=ts_data,
                                cut_off_year=2022,
                                cut_off_month=6, cut_off_day= 1,
                                target_column='target_next_hour')


2022-06-01


In [8]:
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import mean_absolute_error as mae
import optuna
import numpy as np
from src.model import training_pipeline

In [17]:
def objective(trial : optuna.trial.Trial)-> float:

    objective_hyperparameters= {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 
    }

    ts_splits = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_split, val_split in ts_splits.split(X_train):
        
        X_train_, X_val_ = X_train.iloc[train_split,:], X_train.iloc[val_split,:]
        y_train_, y_val_ = y_train.iloc[train_split], y_train.iloc[val_split]

        pipeline = training_pipeline(**objective_hyperparameters)
        pipeline.fit(X_train_,y_train_)

        preds = pipeline.predict(X_val_)

        split_mae = mae(y_pred=preds,y_true=y_val_)

        scores.append(split_mae)

    return np.array(scores).mean()

In [18]:
study = optuna.create_study(direction='minimize')
study.optimize(objective,n_trials=4)

[I 2023-06-13 23:04:04,815] A new study created in memory with name: no-name-6d120d7b-0296-416b-bf3e-1ab6ae3798e7


[I 2023-06-13 23:04:37,712] Trial 0 finished with value: 0.6199376337989743 and parameters: {'num_leaves': 183, 'feature_fraction': 0.5420734063472846, 'bagging_fraction': 0.43401877324410737, 'min_child_samples': 100}. Best is trial 0 with value: 0.6199376337989743.
[I 2023-06-13 23:05:10,296] Trial 1 finished with value: 0.6199376337989743 and parameters: {'num_leaves': 50, 'feature_fraction': 0.5601892708422835, 'bagging_fraction': 0.33180524332040395, 'min_child_samples': 17}. Best is trial 0 with value: 0.6199376337989743.
[I 2023-06-13 23:05:39,624] Trial 2 finished with value: 0.6199376337989743 and parameters: {'num_leaves': 204, 'feature_fraction': 0.6864206505134138, 'bagging_fraction': 0.48154557655841757, 'min_child_samples': 28}. Best is trial 0 with value: 0.6199376337989743.
[I 2023-06-13 23:06:11,718] Trial 3 finished with value: 0.6199376337989743 and parameters: {'num_leaves': 66, 'feature_fraction': 0.8155964359478809, 'bagging_fraction': 0.6416894801219963, 'min_chi

In [19]:
params = study.best_trial.params
best_pipeline = training_pipeline(**params)

In [20]:
best_pipeline.fit(X_train,y_train)

In [21]:
preds= best_pipeline.predict(X_test)

print(mae(y_true=y_test,y_pred=preds))

2.5975496881505826


In [37]:
%reload_ext autoreload
%autoreload 2
from src.plot import plot_one_example


In [44]:

plot_one_example(X_test,
                       y_test,
                       18940,
                       prediction=pd.Series(preds))