In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules (such as the
# project's `src.*` helpers) are reloaded before each cell executes, so edits to
# them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import src.config as config
import pandas as pd

In [3]:
import hopsworks

# Log in to Hopsworks using the project name and API key held in `src.config`.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# Handle to the feature store that belongs to the logged-in project.
feature_store = project.get_feature_store()

# Feature group identified by the name/version configured in `src.config`
# (get-or-create, going by the method name).
feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-01-25 10:38:12,609 INFO: Initializing external client
2025-01-25 10:38:12,609 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-25 10:38:18,275 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207467


In [4]:
# Register the feature view over every column of the feature group. Creation
# presumably raises when the view already exists, which is the expected case on
# every run after the first one.
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
except Exception as err:
    # `Exception` instead of a bare `except`, so KeyboardInterrupt / SystemExit
    # still propagate. The underlying error is printed so that a genuine failure
    # (credentials, network, bad query) is not silently reported as
    # "already existed".
    print(f'Feature view was not created (it most likely already exists): {err!r}')

# Fetch the feature view; if creation above failed for a real reason, this call
# is where the problem surfaces.
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1207467/fs/1196121/fv/time_series_hourly_feature_view/version/1


In [5]:
# Pull the feature view's contents from Hopsworks as training data. The result
# is an indexable container; its first element is extracted in the next cell.
ts_data = feature_view.training_data(
    description="Time-series hourly taxi rides",
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (17.54s) 




In [6]:
# `training_data()` returned a container whose first element is the features
# DataFrame. The guard makes this cell idempotent: re-running it once `ts_data`
# is already a DataFrame is a no-op instead of raising `KeyError: 0`.
if not isinstance(ts_data, pd.DataFrame):
    ts_data = ts_data[0]

In [7]:
# Preview the raw training frame (pandas truncates the rendering of large frames).
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides,pickup_ts
0,2023-12-24 11:00:00+00:00,225,1.0,1703415600000
1,2024-10-22 09:00:00+00:00,133,2.0,1729587600000
2,2024-02-05 18:00:00+00:00,30,1.0,1707156000000
3,2024-10-29 05:00:00+00:00,167,1.0,1730178000000
4,2023-06-11 03:00:00+00:00,245,1.0,1686452400000
...,...,...,...,...
4548435,2023-03-08 11:00:00+00:00,151,53.0,1678273200000
4548436,2024-01-14 18:00:00+00:00,215,1.0,1705255200000
4548437,2024-11-26 20:00:00+00:00,261,18.0,1732651200000
4548438,2024-10-13 00:00:00+00:00,19,1.0,1728777600000


In [8]:
# Inspect how `pickup_hour` is stored before it is parsed with pd.to_datetime.
ts_data["pickup_hour"].dtype

dtype('O')

In [9]:
# Order the rows by location id, then by pickup hour within each location.
# Rebinding the name (rather than `inplace=True`) yields the same sorted frame.
ts_data = ts_data.sort_values(by=["pickup_location_id", "pickup_hour"])
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides
1619435,2022-01-01 00:00:00+00:00,1,18.167367
5727554,2022-01-01 01:00:00+00:00,1,18.167367
2076052,2022-01-01 02:00:00+00:00,1,18.167367
2129813,2022-01-01 03:00:00+00:00,1,18.167367
3957303,2022-01-01 04:00:00+00:00,1,1.000000
...,...,...,...
3472092,2025-01-18 11:00:00+00:00,263,162.000000
3525934,2025-01-18 12:00:00+00:00,263,187.000000
3485698,2025-01-18 13:00:00+00:00,263,169.000000
3398328,2025-01-18 14:00:00+00:00,263,149.000000


In [10]:
# Sanity check: confirm `ts_data` is a pandas DataFrame at this point.
type(ts_data)

pandas.core.frame.DataFrame

In [11]:
# Parse `pickup_hour` into timezone-aware UTC datetimes. `utc=True` guarantees a
# single datetime64[ns, UTC] column whatever the stored representation was, so
# the `.dt` accessor below and later UTC comparisons are always valid.
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# Filter rows where 'pickup_hour' belongs to 2023 or 2024.
# The explicit `.copy()` makes the result an independent frame rather than a
# flagged slice of `ts_data`, so column assignments performed on it downstream
# cannot trigger SettingWithCopyWarning.
filtered_ts_data = ts_data[ts_data['pickup_hour'].dt.year.isin([2023, 2024])].copy()

In [12]:
from src.data import transform_ts_data_into_features_and_target

# Convert the filtered time series into a features frame plus a target.
# The window arguments go straight to the project helper: 24*28 = 672 hourly
# steps (four weeks) and a step of 23.
# NOTE(review): the exact meaning of both arguments is defined in src.data.
features, target = transform_ts_data_into_features_and_target(
    filtered_ts_data,
    n_sez_len=24 * 28,
    step_size=23,
)

# Copy of the features with the target appended as the last column.
features_and_target = features.assign(target_rides_next_hour=target)

print(f'{features_and_target.shape = }')

100%|██████████| 260/260 [01:00<00:00,  4.32it/s]

features_and_target.shape = (186420, 675)





In [13]:
from datetime import date, timedelta
from pytz  import timezone
import pandas as pd
from src.data_split import train_test_split

# Fixed train/test boundary. It was previously computed as "today minus 220
# days", which moved the split on every re-run and, because the data is
# restricted to 2023-2024 earlier in the notebook, would eventually leave the
# test set empty. 2024-06-12 is the value that formula produced for the run
# whose outputs are recorded in this notebook.
cut_off_date = pd.to_datetime(date(2024, 6, 12))

print(f'{cut_off_date = }')

# Make pickup_hour timezone-aware UTC. `utc=True` localises naive values and
# converts aware ones, whereas `.dt.tz_convert` raises on naive datetimes.
features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'], utc=True)

# Localize cut_off_date to UTC so it can be compared with the tz-aware
# pickup_hour column (presumably done inside train_test_split).
cut_off_date = cut_off_date.tz_localize('UTC')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cut_off_date,
    target_column_name='target_rides_next_hour',
)
print(f'{X_train.shape = }')
print(f'{y_train.shape = }')
print(f'{X_test.shape = }')
print(f'{y_test.shape = }')

cut_off_date = Timestamp('2024-06-12 00:00:00')
X_train.shape = (135720, 674)
y_train.shape = (135720,)
X_test.shape = (50700, 674)
y_test.shape = (50700,)


In [14]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna
from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean validation MAE for one sampled configuration.

    Samples hyper-parameters from ``trial``, fits a fresh pipeline built by
    ``get_pipeline`` on each ``TimeSeriesSplit`` fold of the notebook-level
    ``X_train`` / ``y_train``, and averages the mean absolute error measured
    on the validation part of every fold.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the per-fold validation MAE values (lower is better).
    """
    # Fixed settings ("metric", "verbose") plus the search space for this trial.
    # NOTE(review): if these are forwarded to LightGBM unchanged, LightGBM only
    # applies bagging_fraction when bagging_freq > 0 -- confirm that
    # get_pipeline enables bagging.
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    # NOTE(review): TimeSeriesSplit splits by row position, so the folds only
    # respect time order if X_train rows are sorted chronologically -- confirm.
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):
        # `.iloc` with an index array returns a frame that pandas flags as a
        # copy of a slice; any in-place column assignment on it (presumably
        # done by the pipeline's transformers) then emits
        # SettingWithCopyWarning. An explicit copy gives the pipeline an
        # independent frame and leaves the numeric results unchanged.
        X_train_ = X_train.iloc[train_index, :].copy()
        X_val_ = X_train.iloc[val_index, :].copy()
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        y_pred = pipeline.predict(X_val_)
        scores.append(mean_absolute_error(y_val_, y_pred))

    # Plain Python float, as promised by the return annotation.
    return float(np.mean(scores))

In [16]:
# Seed the sampler so the suggested hyper-parameters are reproducible across
# "Restart & Run All". TPESampler is Optuna's default sampler, so only the
# randomness is pinned; the search algorithm is unchanged.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# NOTE(review): a single trial evaluates just one sampled configuration; raise
# n_trials for an actual hyper-parameter search.
study.optimize(objective, n_trials=1)

[I 2025-01-18 21:29:39,064] A new study created in memory with name: no-name-e17822c4-ced0-4b5b-b612-0562e510a541
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

In [17]:
# Hyper-parameters of the study's best trial, i.e. the one with the lowest
# mean validation MAE (the study minimises the objective).
best_params = study.best_trial.params

print(f'{best_params = }')

best_params = {'num_leaves': 47, 'feature_fraction': 0.9528745323326657, 'bagging_fraction': 0.96864818553338, 'min_child_samples': 62}


In [18]:
# Refit a fresh pipeline with the best hyper-parameters on the full training set.
# NOTE(review): unlike in `objective`, "metric" and "verbose" are not passed
# here -- confirm get_pipeline's defaults are acceptable for the final model.
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)



In [19]:
# Evaluate the refitted pipeline on the held-out test split; `test_mae` is
# reused later as the metric stored with the registered model.
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'Test MAE: {test_mae:.2f}')

Test MAE: 3.02




In [20]:
import joblib
from src.paths import MODEL_DIR

# Serialise the fitted pipeline to MODEL_DIR/model.pkl; the same file path is
# later passed to `model.save(...)` when uploading to the model registry.
joblib.dump(pipeline, MODEL_DIR / 'model.pkl')

['C:\\Users\\LENOVO\\Desktop\\taxi_demand_predictor\\models\\model.pkl']

In [21]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Build the schemas from the training features and the training target,
# then bundle them into the schema object attached to the registered model.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

In [23]:
# Handle to the model registry of the logged-in Hopsworks project.
model_registry = project.get_model_registry()

# Describe the model entry: test metric, one randomly drawn training row as the
# input example, and the schema built from the training data.
model = model_registry.sklearn.create_model(
    name='taxi_demand_model',
    metrics={"test_mae": test_mae},
    description="LightGBM Regressor",
    input_example=X_train.sample(),
    model_schema=model_schema,
)

# Upload the pickled pipeline written earlier to MODEL_DIR/model.pkl.
model.save(str(MODEL_DIR / 'model.pkl'))


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/484062 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3927 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/60849 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1207467/models/taxi_demand_model/1


Model(name: 'taxi_demand_model', version: 1)