In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's own modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import src.config as config

In [3]:
import hopsworks

# Log in to Hopsworks with the project name and API key held in src.config.
login_settings = {
    "project": config.HOPSWORK_PROJECT_NAME,
    "api_key_value": config.HOPSWORK_API_KEY,
}
project = hopsworks.login(**login_settings)

# Handle to the feature store of the logged-in project.
feature_store = project.get_feature_store()

# Look up the feature group by name and version (created if it does not exist).
feature_group_id = {
    "name": config.FEATURE_GROUP_NAME,
    "version": config.FEATURE_GROUP_VERSION,
}
feature_group = feature_store.get_or_create_feature_group(**feature_group_id)

2025-01-05 10:06:07,071 INFO: Initializing external client
2025-01-05 10:06:07,071 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 10:06:09,469 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1207467


In [4]:
# Obtain a feature view over every column of the feature group.
# `feature_view` is bound in BOTH branches so that it is defined whether the
# view was just created or already existed.
try:
    feature_view = feature_store.create_feature_view(
        name = config.FEATURE_VIEW_NAME,
        version = config.FEATURE_VIEW_VERSION,
        query = feature_group.select_all()
    )

except Exception:
    # Creation failed -- presumably because this name/version already exists
    # (TODO confirm the exact exception type and narrow this handler).
    # Catching `Exception` rather than using a bare `except:` lets
    # KeyboardInterrupt / SystemExit propagate.
    feature_view = feature_store.get_feature_view(
        name = config.FEATURE_VIEW_NAME,
        version = config.FEATURE_VIEW_VERSION
    )

In [5]:
# Pull the training data behind the feature view. The result is a sequence,
# not a DataFrame: element 0 is taken from it in the following cell.
ts_data = feature_view.training_data(
    description = "Time-series hourly taxi rides"
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (19.75s) 




In [6]:
# Keep only the first element returned by training_data() (the frame used below).
# The type guard makes this cell safe to re-run: once `ts_data` is already the
# DataFrame, indexing it with [0] again would raise a KeyError.
if isinstance(ts_data, (tuple, list)):
    ts_data = ts_data[0]

In [7]:
# Inspect how pickup_hour is stored (looked up via the frame's dtype table).
ts_data.dtypes["pickup_hour"]

dtype('O')

In [8]:
# Order rows by location first, then by pickup hour within each location.
# The sort mutates `ts_data` in place; the bare name below displays the result.
sort_keys = ["pickup_location_id", "pickup_hour"]
ts_data.sort_values(by=sort_keys, inplace=True)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
4791724,2022-01-01 00:00:00+00:00,0.0,1
1836472,2022-01-01 01:00:00+00:00,0.0,1
4183649,2022-01-01 02:00:00+00:00,0.0,1
4662493,2022-01-01 03:00:00+00:00,0.0,1
3901370,2022-01-01 04:00:00+00:00,1.0,1
...,...,...,...
3854289,2025-01-05 00:00:00+00:00,3.0,265
3854379,2025-01-05 01:00:00+00:00,5.0,265
3854479,2025-01-05 02:00:00+00:00,5.0,265
3855139,2025-01-05 03:00:00+00:00,3.0,265


In [9]:
from src.data import transform_ts_data_into_features_and_target

# Turn the time series into a feature table plus target vector.
# n_sez_len = 24*28 = 672 (28 days' worth of hours if rows are hourly -- see the
# training-data description); step_size = 23 is passed through unchanged.
features, target = transform_ts_data_into_features_and_target(
    ts_data,
    n_sez_len=24 * 28,
    step_size=23,
)

# Copy of the features with the target appended as an extra column.
features_and_target = features.assign(target_rides_next_hour=target)

print(f'{features_and_target.shape = }')

100%|██████████| 263/263 [03:06<00:00,  1.41it/s]


features_and_target.shape = (286548, 675)


In [10]:
from datetime import date, timedelta
from pytz  import timezone
import pandas as pd
from src.data_split import train_test_split

# Cut-off handed to train_test_split: midnight, TEST_WINDOW_DAYS days before today.
# NOTE: this depends on the date the notebook is run, so split sizes vary by day.
TEST_WINDOW_DAYS = 28
cut_off_date = pd.to_datetime(date.today() - timedelta(days = TEST_WINDOW_DAYS))

print(f'{cut_off_date = }')

# Normalise pickup_hour to tz-aware UTC. `utc = True` converts tz-aware values
# to UTC and localizes tz-naive ones, whereas `.dt.tz_convert('UTC')` raises
# TypeError on tz-naive data. The assignment is also idempotent on re-run.
features_and_target['pickup_hour'] = pd.to_datetime(
    features_and_target['pickup_hour'], utc = True
)

# Localize cut_off_date to UTC so it carries the same timezone as pickup_hour
# (presumably the two are compared inside train_test_split -- verify there).
cut_off_date = cut_off_date.tz_localize('UTC')


X_train, y_train, X_test, y_test = train_test_split(features_and_target,
                                                    cut_off_date,
                                                    target_column_name = 'target_rides_next_hour'
                                                    )
print(f'{X_train.shape = }')
print(f'{y_train.shape = }')
print(f'{X_test.shape = }')
print(f'{y_test.shape = }')

cut_off_date = Timestamp('2024-12-08 00:00:00')
X_train.shape = (279007, 674)
y_train.shape = (279007,)
X_test.shape = (7541, 674)
y_test.shape = (7541,)


In [11]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna
from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """Score one sampled hyperparameter configuration for the Optuna study.

    Samples ``num_leaves``, ``feature_fraction``, ``bagging_fraction`` and
    ``min_child_samples`` from ``trial``, builds a pipeline with them via
    ``get_pipeline``, and evaluates it on the notebook-level ``X_train`` /
    ``y_train`` with a 2-split ``TimeSeriesSplit``.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean of the per-fold mean absolute errors.

    NOTE(review): ``TimeSeriesSplit`` splits by row position, so this assumes
    the rows of ``X_train`` are in chronological order -- confirm against the
    ordering produced upstream.
    """
    # Fixed settings plus the searched ranges (suggestion order is preserved).
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    splitter = TimeSeriesSplit(n_splits=2)
    fold_maes = []
    for fit_idx, holdout_idx in splitter.split(X_train):
        # A fresh pipeline per fold so nothing carries over between fits.
        fold_model = get_pipeline(**hyperparams)
        fold_model.fit(X_train.iloc[fit_idx, :], y_train.iloc[fit_idx])

        holdout_pred = fold_model.predict(X_train.iloc[holdout_idx, :])
        fold_maes.append(
            mean_absolute_error(y_train.iloc[holdout_idx], holdout_pred)
        )

    return np.array(fold_maes).mean()

In [12]:
# Seed the sampler so the search proposes the same hyperparameters on every run
# (TPESampler is also what Optuna uses when no sampler is given).
SEARCH_SEED = 42
# A single trial only scores one sampled configuration; raise for a real search.
N_TRIALS = 1

study = optuna.create_study(
    direction = "minimize",
    sampler = optuna.samplers.TPESampler(seed = SEARCH_SEED),
)
study.optimize(objective, n_trials = N_TRIALS)

[I 2025-01-05 10:11:33,228] A new study created in memory with name: no-name-b95fe3bf-e2d3-44cb-ba5e-9ce40bea8fff
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

In [13]:
# Hyperparameters of the best (lowest-objective) trial in the study.
best_params = study.best_params

print(f'{best_params = }')

best_params = {'num_leaves': 205, 'feature_fraction': 0.9353887105349166, 'bagging_fraction': 0.881275051752695, 'min_child_samples': 12}


In [14]:
# Build a pipeline from the best hyperparameters found by the study and fit it
# on the whole training split. Only the searched parameters are passed here;
# the fixed "metric" / "verbose" settings used inside objective() are not.
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)



In [15]:
# Score the fitted pipeline on the held-out split with mean absolute error.
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'Test MAE: {test_mae:.2f}')

Test MAE: 2.94




In [16]:
import joblib
from src.paths import MODEL_DIR

# Serialize the fitted pipeline to model.pkl under MODEL_DIR.
model_path = MODEL_DIR / 'model.pkl'
joblib.dump(pipeline, model_path)

['C:\\Users\\LENOVO\\Desktop\\taxi_demand_predictor\\models\\model.pkl']

In [17]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Derive the input/output schemas from the training features and target, then
# bundle them into the model schema stored with the registered model.
input_schema, output_schema = Schema(X_train), Schema(y_train)
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

In [18]:
model_registry = project.get_model_registry()

# Metadata stored alongside the model in the registry.
# NOTE(review): X_train.sample() draws one random row without a seed, so the
# stored input example differs between runs.
model_metadata = dict(
    name='taxi_demand_model',
    metrics={"test_mae": test_mae},
    description="LightGBM Regressor",
    input_example=X_train.sample(),
    model_schema=model_schema,
)
model = model_registry.sklearn.create_model(**model_metadata)

# Upload the pickled pipeline written earlier to MODEL_DIR.
model.save(str(MODEL_DIR / 'model.pkl'))


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1894359 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3399 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/60849 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1207467/models/taxi_demand_model/2


Model(name: 'taxi_demand_model', version: 2)