In [37]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
# Re-running this cell in a live kernel prints an "already loaded" notice;
# `%reload_ext autoreload` is the form that notice recommends.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import logging
import os
import sys
from datetime import datetime, timedelta, timezone
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import joblib
from hsml.model_schema import ModelSchema
from hsml.schema import Schema
from sklearn.metrics import mean_absolute_error

import config.config as config
from src.data_fetching_and_processing.transform_ts_data_to_features_and_target import transform_ts_data_info_features_and_target
from src.inference.fetch_days_data import fetch_days_data
from src.utils.inference_utils.inference_utils import (
    get_hopsworks_project,
    load_metrics_from_registry,
    load_model_from_registry
)
from src.utils.pipeline_utils.pipeline_utils import get_pipeline


In [39]:
# Pull the ride time series from Hopsworks and report how many rows came back.
# NOTE(review): the recorded output shows a window of 2023-09-03 .. 2025-03-01,
# which is wider than 180 days — confirm how fetch_days_data interprets its
# argument.
print("Fetching data from group store ...")
ts_data = fetch_days_data(180)
print(len(ts_data))

# This message announces the feature/target transformation that a later cell
# performs; nothing is transformed in this cell.
print("Transforming to ts_data ...")

Fetching data from group store ...
2023-09-03 05:47:46.043540+00:00 2025-03-01 05:47:46.043540+00:00
2025-03-01 00:47:46,060 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-01 00:47:46,074 INFO: Initializing external client
2025-03-01 00:47:46,074 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-01 00:47:46,921 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214648
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (8.62s) 
2392324
Transforming to ts_data ...


In [40]:
# Preview the first rows of the fetched time series.
# NOTE(review): in the recorded output the rows are not ordered by pickup_hour;
# presumably the downstream transform sorts them — TODO confirm.
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,zone,rides
0,2025-02-26 05:00:00+00:00,131,"Jamaica Estates, Queens",0
1,2025-02-08 13:00:00+00:00,33,"Brooklyn Heights, Brooklyn",0
2,2025-02-06 20:00:00+00:00,226,"Sunnyside, Queens",0
3,2025-02-03 12:00:00+00:00,242,"Van Nest/Morris Park, Bronx",0
4,2025-02-11 02:00:00+00:00,225,"Stuyvesant Heights, Brooklyn",0


In [41]:
# Slice the time series into model inputs and targets.
# window_size is presumably 28 days of hourly observations (28 * 24) and
# step_size the stride between consecutive windows — verify against the
# transform's definition.
features, targets = transform_ts_data_info_features_and_target(
    ts_data,
    window_size=28 * 24,
    step_size=23,
)
pipeline = get_pipeline()

print("Training model ...")
pipeline.fit(features, targets)

# NOTE(review): these predictions are made on the same rows the pipeline was
# just fitted on, so any error computed from them is an in-sample error.
predictions = pipeline.predict(features)

Training model ...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 163395
[LightGBM] [Info] Number of data points in the train set: 96524, number of used features: 675
[LightGBM] [Info] Start training from score 1.108253


In [42]:
# Score the freshly trained pipeline and compare it with the metrics loaded
# from the registry; a new model version is registered only when the MAE
# improves (or when no baseline MAE is available to compare against).
#
# NOTE(review): `predictions` came from `pipeline.predict(features)` on the same
# rows passed to `pipeline.fit`, so this value is an in-sample error even though
# it is stored under the "test_mae" key. The recorded run (new 1.7168 vs
# previous 0.0125) suggests the two numbers may not have been measured on
# comparable data — confirm before trusting this gate.
test_mae = mean_absolute_error(targets, predictions)
metric = load_metrics_from_registry()

# Read the baseline exactly once so the printed value and the comparison below
# can never disagree, and so a missing key is handled instead of raising.
previous_mae = metric.get("test_mae")

print(f"The new MAE is {test_mae:.4f}")
if previous_mae is None:
    print("The previous MAE is not available")
else:
    print(f"The previous MAE is {previous_mae:.4f}")

if previous_mae is None or test_mae < previous_mae:
    print("Registering new model")

    # Persist the fitted pipeline locally; the registry upload reads this file.
    model_path = config.MODELS_DIR / "lgb_model.pkl"
    joblib.dump(pipeline, model_path)

    # Describe the model's inputs and outputs from the training frames.
    input_schema = Schema(features)
    output_schema = Schema(targets)
    model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

    project = get_hopsworks_project()
    model_registry = project.get_model_registry()

    model = model_registry.sklearn.create_model(
        name="taxi_demand_predictor_next_hour_v2",
        metrics={"test_mae": test_mae},
        input_example=features.sample(),
        model_schema=model_schema,
    )
    model.save(model_path)
else:
    print("Skipping model registration because new model is not better!")


2025-03-01 00:48:46,378 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-01 00:48:46,384 INFO: Initializing external client
2025-03-01 00:48:46,385 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-01 00:48:47,221 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214648
The new MAE is 1.7168
The previous MAE is 0.0125
Skipping model registration because new model is not better!
