In [1]:
import sys
import os

# Allow notebook to find `src/` folder for imports
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [2]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as c
print("Prediction FG version:", c.FEATURE_GROUP_MODEL_PREDICTION_VERSION)


Prediction FG version: 2


In [3]:
from datetime import datetime, timedelta
import pandas as pd
import logging, sys
import hopsworks

from src.data_utils import (
    load_and_process_citibike_data_from_local,
    transform_raw_data_into_ts_data,
)
import src.config as config
from hsfs.feature import Feature


In [4]:
from src.data_utils import load_and_process_citibike_data_from_local

In [5]:
# at the top of your notebook
from src.data_utils import load_and_process_citibike_data_from_local


In [6]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(levelname)s  %(message)s", handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger(__name__)

logger.info("üìÖ Loading Citi Bike data from Jan 2023 to Mar 2025 ‚Ä¶")

raw_rides_2023 = load_and_process_citibike_data_from_local(
    year=2023,
    months=list(range(1, 13)),
    base_path=config.LOCAL_CITIBIKE_DATA_PATH
)

raw_rides = pd.concat([raw_rides_2023])
logger.info(f"‚úÖ Total rows loaded: {len(raw_rides):,}")

print(raw_rides["pickup_location_id"].unique())

print(raw_rides.head())

2025-05-10 23:02:13,098 INFO: üìÖ Loading Citi Bike data from Jan 2023 to Mar 2025 ‚Ä¶














































































2025-05-10 23:10:09,059 INFO: ‚úÖ Total rows loaded: 410,344
['5329.03' '6140.05' '6948.10']
            ride_id  rideable_type         pickup_datetime  \
0  A46D077151843D7B   classic_bike 2023-01-16 10:39:54.386   
1  233875BAED2E02D0   classic_bike 2023-01-12 16:55:30.755   
2  8DD222EA1A1B0BC9  electric_bike 2023-01-08 19:32:25.647   
3  58976A4F584F8D28   classic_bike 2023-01-27 20:01:52.897   
4  FDD4C1E89A26727C   classic_bike 2023-01-13 18:02:38.160   

                  ended_at     start_station_name pickup_location_id  \
0  2023-01-16 10:45:18.005  West St & Chambers St            5329.03   
1  2023-01-12 17:04:03.688  West St & Chambers St            5329.03   
2  2023-01-08 19:42:00.382  West St & Chambers St            5329.03   
3  2023-01-27 20:08:58.118  West St & Chambers St            5329.03   
4  2023-

In [7]:
TOP_3_IDS = {"6140.05", "6948.10", "5329.03"}

raw_rides_top3 = raw_rides[raw_rides["pickup_location_id"].astype(str).isin(TOP_3_IDS)].copy()
logger.info(f"üìâ Filtered for top 3 stations: {len(raw_rides_top3):,} rows")


2025-05-10 23:10:09,701 INFO: üìâ Filtered for top 3 stations: 410,344 rows


In [8]:
raw_rides_top3.head()

Unnamed: 0,ride_id,rideable_type,pickup_datetime,ended_at,start_station_name,pickup_location_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file
0,A46D077151843D7B,classic_bike,2023-01-16 10:39:54.386,2023-01-16 10:45:18.005,West St & Chambers St,5329.03,West Thames St,5114.06,40.717548,-74.013221,40.708347,-74.017134,member,202301-citibike-tripdata_1.csv
1,233875BAED2E02D0,classic_bike,2023-01-12 16:55:30.755,2023-01-12 17:04:03.688,West St & Chambers St,5329.03,West Thames St,5114.06,40.717548,-74.013221,40.708347,-74.017134,member,202301-citibike-tripdata_1.csv
2,8DD222EA1A1B0BC9,electric_bike,2023-01-08 19:32:25.647,2023-01-08 19:42:00.382,West St & Chambers St,5329.03,West Thames St,5114.06,40.717618,-74.013071,40.708347,-74.017134,member,202301-citibike-tripdata_1.csv
3,58976A4F584F8D28,classic_bike,2023-01-27 20:01:52.897,2023-01-27 20:08:58.118,West St & Chambers St,5329.03,West Thames St,5114.06,40.717548,-74.013221,40.708347,-74.017134,member,202301-citibike-tripdata_1.csv
4,FDD4C1E89A26727C,classic_bike,2023-01-13 18:02:38.160,2023-01-13 18:11:22.139,West St & Chambers St,5329.03,West Thames St,5114.06,40.717548,-74.013221,40.708347,-74.017134,member,202301-citibike-tripdata_1.csv


In [9]:
logger.info("üßÆ Aggregating to hourly time-series format ‚Ä¶")
ts_data = transform_raw_data_into_ts_data(raw_rides_top3)
logger.info(f"‚úÖ Transformed data shape: {ts_data.shape}")
ts_data.head()


2025-05-10 23:10:09,815 INFO: üßÆ Aggregating to hourly time-series format ‚Ä¶
2025-05-10 23:10:10,333 INFO: ‚úÖ Transformed data shape: (26535, 3)


Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-12-28 11:00:00,5329.03,2
1,2022-12-28 12:00:00,5329.03,0
2,2022-12-28 13:00:00,5329.03,0
3,2022-12-28 14:00:00,5329.03,0
4,2022-12-28 15:00:00,5329.03,0


In [10]:
logger.info("üîê Logging in to Hopsworks ‚Ä¶")
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)
fs = project.get_feature_store()


2025-05-10 23:10:10,365 INFO: üîê Logging in to Hopsworks ‚Ä¶
2025-05-10 23:10:10,395 INFO: Initializing external client
2025-05-10 23:10:10,397 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-05-10 23:10:13,071 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214674


In [11]:
from hsfs.feature import Feature

fg_schema = [
    Feature("pickup_hour",        "timestamp"),
    Feature("pickup_location_id", "string"),
    Feature("rides",              "int64"),
]

logger.info("üì¶ Writing to Hopsworks feature group ‚Ä¶")
hourly_fg = fs.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description="Hourly Citi Bike rides per location (2023‚Äì2025)",
    primary_key=["pickup_hour", "pickup_location_id"],
    event_time="pickup_hour",
    online_enabled=False,
    features=fg_schema,
)

ts_data["pickup_location_id"] = ts_data["pickup_location_id"].astype(str)
ts_data["rides"] = ts_data["rides"].astype("int64")

hourly_fg.insert(ts_data, write_options={"wait_for_job": True})
logger.info("‚úÖ Done uploading data to Hopsworks!")


2025-05-10 23:10:14,151 INFO: üì¶ Writing to Hopsworks feature group ‚Ä¶


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 26535/26535 | Elapsed Time: 00:04 | Remaining Time: 00:00


Launching job: citibike_hourly_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214674/jobs/named/citibike_hourly_features_1_offline_fg_materialization/executions
2025-05-10 23:10:31,621 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-05-10 23:10:34,777 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-05-10 23:12:13,045 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-05-10 23:12:13,173 INFO: Waiting for log aggregation to finish.
2025-05-10 23:12:34,905 INFO: Execution finished successfully.
2025-05-10 23:12:34,905 INFO: ‚úÖ Done uploading data to Hopsworks!


In [12]:
logger.info("üîé Creating Feature View ‚Ä¶")

from hsfs.feature import Feature

feature_group = fs.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

feature_view = fs.get_or_create_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
    description="Feature view for Citi Bike hourly demand",
    labels=[],
    query=feature_group.select_all()
)

logger.info("‚úÖ Feature View created successfully.")


2025-05-10 23:12:34,936 INFO: üîé Creating Feature View ‚Ä¶
2025-05-10 23:12:35,996 INFO: ‚úÖ Feature View created successfully.


In [13]:
from src.data_utils import transform_ts_data_info_features_and_target_loop

logger.info("üìä Generating sliding window features ‚Ä¶")

# Use correct window size
window_size = 672

# Get full transformed DataFrame with all metadata
features_df, _ = transform_ts_data_info_features_and_target_loop(
    ts_data, feature_col="rides", window_size=window_size, step_size=1
)

# Check shape and available columns
print("‚úÖ Transformed shape:", features_df.shape)
print("‚úÖ Sample columns:", features_df.columns[:5].tolist() + ["...", features_df.columns[-3], features_df.columns[-2], features_df.columns[-1]])


2025-05-10 23:12:36,013 INFO: üìä Generating sliding window features ‚Ä¶
‚úÖ Transformed shape: (24519, 675)
‚úÖ Sample columns: ['rides_t-672', 'rides_t-671', 'rides_t-670', 'rides_t-669', 'rides_t-668', '...', 'target', 'pickup_hour', 'pickup_location_id']


In [14]:
features_df.head()

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,target,pickup_hour,pickup_location_id
0,2,0,0,0,0,0,0,0,0,0,...,0,0,8,18,20,16,7,6,2023-01-25 11:00:00,5329.03
1,0,0,0,0,0,0,0,0,0,0,...,0,8,18,20,16,7,6,3,2023-01-25 12:00:00,5329.03
2,0,0,0,0,0,0,0,0,0,0,...,8,18,20,16,7,6,3,6,2023-01-25 13:00:00,5329.03
3,0,0,0,0,0,0,0,0,0,0,...,18,20,16,7,6,3,6,4,2023-01-25 14:00:00,5329.03
4,0,0,0,0,0,0,0,0,0,0,...,20,16,7,6,3,6,4,0,2023-01-25 15:00:00,5329.03


In [16]:
# 0Ô∏è‚É£ load the full Pipeline
import os, sys, joblib

model_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "models", "xgb_model.pkl")
)
lgb_pipeline = joblib.load(model_path)

# 1Ô∏è‚É£ pull out the underlying Booster and its feature‚Äêname list
#   (depends on whether you have a plain LGBMModel or a sklearn Pipeline)
if hasattr(lgb_pipeline, "booster_"):
    booster = lgb_pipeline.booster_
elif hasattr(lgb_pipeline, "_Booster"):
    booster = lgb_pipeline._Booster
else:
    # assume a Pipeline and last step is the LGB
    booster = lgb_pipeline.steps[-1][1].booster_

expected = booster.feature_name()
print("‚ûú Trained model expected", len(expected), "features")
print("First 5:", expected[:5])
print("Last 5: ", expected[-5:])

# 2Ô∏è‚É£ build your X exactly as you have it now
lag_cols   = [c for c in features_df.columns if c.startswith("rides_t-")]
input_cols = lag_cols + ["pickup_hour", "pickup_location_id"]
X = features_df[input_cols]

print("‚ûú Your X has", X.shape[1], "columns.")
print("First 5:", X.columns[:5].tolist())
print("Last 5: ", X.columns[-5:].tolist())

# 3Ô∏è‚É£ see which names don‚Äôt line up
missing_in_X   = set(expected) - set(X.columns)
extra_in_X     = set(X.columns) - set(expected)
print("‚úñ Features expected by model but missing in X:", sorted(missing_in_X))
print("‚úñ Extra columns in X that the model didn‚Äôt expect:", sorted(extra_in_X))


AttributeError: 'Booster' object has no attribute 'feature_name'

In [None]:
print("üîç Resolved path:", model_path)
print("üìÇ File exists:", os.path.exists(model_path))

In [None]:
import os, sys, joblib
import pandas as pd

# 1Ô∏è‚É£ Locate & load your full pipeline (with featurizer + LGB):
model_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "models", "lgb_model.pkl")
)
pipeline = joblib.load(model_path)

# 2Ô∏è‚É£ Build the raw X exactly as the pipeline saw it at train time:
lag_cols = [c for c in features_df.columns if c.startswith("rides_t-")]

X_raw = features_df[ 
    lag_cols
    + ["pickup_hour", "pickup_location_id"]
].copy()

# 3Ô∏è‚É£ Now run it:
preds = pipeline.predict(X_raw)

# 4Ô∏è‚É£ Stick them back:
features_df = features_df.copy()
features_df["predicted_rides"] = preds

features_df[["pickup_hour", "pickup_location_id", "predicted_rides"]].head()


In [None]:
import os, sys, joblib
import pandas as pd

# 0Ô∏è‚É£ If you haven‚Äôt already, make sure your notebook can see ../models
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

# 1Ô∏è‚É£ Load the full sklearn Pipeline you trained
model_path = os.path.join(os.getcwd(), "..", "models", "lgb_model.pkl")
assert os.path.exists(model_path), f"Model not found: {model_path}"
pipeline = joblib.load(model_path)

# 2Ô∏è‚É£ Build the raw X exactly as your pipeline saw it at train time:
#    - ALL lag features ("rides_t-...") 
#    - the placeholder "target" column (pipeline will drop it internally)
#    - pickup_hour (for hour/day features)
#    - pickup_location_id (so the pipeline‚Äôs drop-step can find it)
lag_cols = [col for col in features_df.columns if col.startswith("rides_t-")]
X_raw   = features_df[lag_cols + ["pickup_hour", "pickup_location_id"]].copy()

# 3Ô∏è‚É£ Run the pipeline‚Äôs predict (it will extend with avg/day/hour, drop extras, then LGBM)
preds = pipeline.predict(X_raw)

# 4Ô∏è‚É£ Round to int32 (since your FG schema is `int`)
features_df = features_df.copy()  # avoid any warning
features_df["predicted_rides"] = preds.round(0).astype("int32")

# 5Ô∏è‚É£ Quick sanity check
features_df[["pickup_hour", "pickup_location_id", "predicted_rides"]].head()


In [None]:
features_df.shape

In [None]:
# quick sanity‚Äëcheck
print(features_df[["pickup_hour", "pickup_location_id", "predicted_rides"]].head())


In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as c
print("Prediction FG version:", c.FEATURE_GROUP_MODEL_PREDICTION_VERSION)


In [None]:
import hopsworks
from hsfs.feature import Feature
import src.config as c

# 1Ô∏è‚É£ Log in
project = hopsworks.login(
    project       = c.HOPSWORKS_PROJECT_NAME,
    api_key_value = c.HOPSWORKS_API_KEY,
)
fs = project.get_feature_store()

# 2Ô∏è‚É£ Get-or-create the prediction Feature Group v2
pred_fg = fs.get_or_create_feature_group(
    name          = c.FEATURE_GROUP_MODEL_PREDICTION,
    version       = c.FEATURE_GROUP_MODEL_PREDICTION_VERSION,
    description   = "Next-hour demand predictions from LGBM model (v2)",
    primary_key   = ["pickup_location_id", "pickup_hour"],
    event_time    = "pickup_hour",
    online_enabled=False,
    features      = [
        Feature("pickup_location_id", "string"),
        Feature("pickup_hour",        "timestamp"),
        Feature("predicted_rides",    "int"),
    ],
)
print(f"‚úÖ Using Feature Group ‚Äú{pred_fg.name}‚Äù v{pred_fg.version} (id {pred_fg.id})")

# 3Ô∏è‚É£ Prepare your output DataFrame exactly to schema
out_df = features_df[["pickup_hour", "pickup_location_id", "predicted_rides"]].copy()
out_df["pickup_location_id"] = out_df["pickup_location_id"].astype(str)
out_df["predicted_rides"]    = out_df["predicted_rides"].round(0).astype("int32")

# 4Ô∏è‚É£ Insert & wait for completion
pred_fg.insert(out_df, write_options={"wait_for_job": True})
print("üöÄ  Predictions uploaded to Hopsworks!")
