In [6]:
import os

from dotenv import load_dotenv

# Populate os.environ from the local .env file; the call's return value is
# what the cell displays.
load_dotenv()

True

In [7]:
import os
import sys

# Make the parent of the current working directory importable.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
PARENT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

In [8]:
import os

from dotenv import load_dotenv

# Pull the key/value pairs from the .env file into the process environment.
load_dotenv()

# Hopsworks connection settings; each is None when its variable is not set.
api_key = os.environ.get("HOPSWORKS_API_KEY")
project = os.environ.get("HOPSWORKS_PROJECT_NAME")


In [9]:
import hopsworks

# Log in to Hopsworks with the project name and API key read from the environment.
conn = hopsworks.login(api_key_value=api_key, project=project)


2025-05-10 08:48:30,942 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 08:48:30,954 INFO: Initializing external client
2025-05-10 08:48:30,956 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-05-10 08:48:31,492 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214684


In [10]:
import os
import pandas as pd
import hopsworks

# --- Config ------------------------------------------------------------------
PARQUET_PATH = "../data/processed/2023/citibike_hourly_features_top3.parquet"
FG_NAME      = "bike_hourly_fg"
FG_VERSION   = 1
FV_NAME      = "bike_hourly_fv"
FV_VERSION   = 1
EVENT_TIME   = "start_hour"          # column already present in the parquet
PRIMARY_KEY  = ["start_station_id"]  # unique per row when combined with EVENT_TIME
ONLINE       = True                  # we want online look-ups for inference

# --- 1. Connect --------------------------------------------------------------
# os.environ[...] raises KeyError when a variable is missing, so a missing
# .env entry surfaces here rather than inside the login call.
project = hopsworks.login(
    project=os.environ["HOPSWORKS_PROJECT_NAME"],
    api_key_value=os.environ["HOPSWORKS_API_KEY"],
)
fs = project.get_feature_store()

# --- 2. Load the dataframe ---------------------------------------------------
df = pd.read_parquet(PARQUET_PATH)

# Fail early, with a readable message, if the key or event-time columns are
# absent, instead of discovering it during the upload.
missing_key_cols = [col for col in [*PRIMARY_KEY, EVENT_TIME] if col not in df.columns]
if missing_key_cols:
    raise ValueError(
        f"{PARQUET_PATH} is missing required column(s): {missing_key_cols}"
    )

# --- 3. Get or create the feature group --------------------------------------
# get_or_create_* replaces the previous try/except around get_feature_group:
# that handler treated every RestAPIError (authentication, network, server
# errors) as "feature group not found" and went on to attempt a create.
fg = fs.get_or_create_feature_group(
    name=FG_NAME,
    version=FG_VERSION,
    description="Hourly Citi Bike features for the three busiest stations (2023)",
    primary_key=PRIMARY_KEY,
    event_time=EVENT_TIME,
    online_enabled=ONLINE,
    statistics_config={"enabled": True},  # auto-profiling
)
print(f"Feature-Group {FG_NAME} v{FG_VERSION} ready.")

# --- 4. Insert data ----------------------------------------------------------
# wait_for_job=True blocks until the materialization job has finished.
fg.insert(df, write_options={"wait_for_job": True})
print("Ingestion finished")

# --- 5. (Optional) get or create a feature view ------------------------------
fv = fs.get_or_create_feature_view(
    name=FV_NAME,
    version=FV_VERSION,
    description="All engineered features for top-3 stations (2023)",
    query=fg.select_all(),
)
print("Feature View ready.")


2025-05-10 08:48:31,931 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 08:48:31,934 INFO: Initializing external client
2025-05-10 08:48:31,934 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-05-10 08:48:32,543 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214684
Feature-Group bike_hourly_fg v1 already exists — reusing it.


Uploading Dataframe: 100.00% |██████████| Rows 26280/26280 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: bike_hourly_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214684/jobs/named/bike_hourly_fg_1_offline_fg_materialization/executions
2025-05-10 08:49:06,328 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2025-05-10 08:49:09,428 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-05-10 08:49:12,528 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-05-10 08:51:18,709 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2025-05-10 08:51:21,826 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-05-10 08:51:21,909 INFO: Waiting for log aggregation to finish.
2025-05-10 08:51:36,721 INFO: Execution finished successfully.
Ingestion finished
Feature View ready.


# Loading the lag_1–lag_672 features into a Hopsworks feature group

In [11]:
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import hopsworks

# --- Load environment variables ----------------------------------------------
load_dotenv()  # loads HOPSWORKS_PROJECT_NAME and HOPSWORKS_API_KEY

# --- Paths and config --------------------------------------------------------
PARQUET_PATH = Path("../data/processed/2023/citibike_lag672_top3.parquet")
FG_NAME      = "bike_lag672_top3_fg"
FG_VERSION   = 1
EVENT_TIME   = "start_hour"
PRIMARY_KEY  = ["start_station_id"]
ONLINE       = False  # offline only, used for training

# --- Connect to Hopsworks ----------------------------------------------------
# os.environ[...] raises KeyError when a variable is missing, so a missing
# .env entry surfaces here rather than inside the login call.
project = hopsworks.login(
    project=os.environ["HOPSWORKS_PROJECT_NAME"],
    api_key_value=os.environ["HOPSWORKS_API_KEY"],
)
fs = project.get_feature_store()

# --- Load data ---------------------------------------------------------------
df = pd.read_parquet(PARQUET_PATH)

# Fail early, with a readable message, if the key or event-time columns are
# absent, instead of discovering it during the upload.
missing_key_cols = [col for col in [*PRIMARY_KEY, EVENT_TIME] if col not in df.columns]
if missing_key_cols:
    raise ValueError(
        f"{PARQUET_PATH} is missing required column(s): {missing_key_cols}"
    )

# --- Get or create Feature Group ---------------------------------------------
# get_or_create_feature_group replaces the previous try/except around
# get_feature_group: that handler treated every RestAPIError (authentication,
# network, server errors) as "feature group not found" and went on to attempt
# a create.
fg = fs.get_or_create_feature_group(
    name=FG_NAME,
    version=FG_VERSION,
    description="Lag_1 to lag_672 features for top 3 busiest Citi Bike stations",
    primary_key=PRIMARY_KEY,
    event_time=EVENT_TIME,
    online_enabled=ONLINE,
    statistics_config={"enabled": True},
)
print(f"Feature Group '{FG_NAME}' v{FG_VERSION} ready.")

# --- Insert data into Feature Store ------------------------------------------
# wait_for_job=True blocks until the materialization job has finished.
fg.insert(df, write_options={"wait_for_job": True})
print("Data insertion to Hopsworks complete.")

# --- Done --------------------------------------------------------------------
print(f"Feature Group '{FG_NAME}' v{FG_VERSION} now available in Hopsworks.")


2025-05-10 08:51:37,512 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 08:51:37,516 INFO: Initializing external client
2025-05-10 08:51:37,516 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 08:51:38,117 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214684
Feature Group 'bike_lag672_top3_fg' already exists — reusing.


Uploading Dataframe: 100.00% |██████████| Rows 24264/24264 | Elapsed Time: 00:53 | Remaining Time: 00:00


Launching job: bike_lag672_top3_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214684/jobs/named/bike_lag672_top3_fg_1_offline_fg_materialization/executions
2025-05-10 08:52:44,729 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-05-10 08:52:47,814 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-05-10 08:55:17,714 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2025-05-10 08:55:20,801 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-05-10 08:55:20,875 INFO: Waiting for log aggregation to finish.
2025-05-10 08:55:44,991 INFO: Execution finished successfully.
Data insertion to Hopsworks complete.
Feature Group 'bike_lag672_top3_fg' v1 now available in Hopsworks.


In [16]:
# Look the lag feature group up again and re-run its materialization job;
# the value returned by run() is what the cell displays.
fg = fs.get_feature_group("bike_lag672_top3_fg", version=1)
fg.materialization_job.run()


Launching job: bike_lag672_top3_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214684/jobs/named/bike_lag672_top3_fg_1_offline_fg_materialization/executions
2025-05-10 09:00:06,491 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-05-10 09:00:09,590 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-05-10 09:01:36,925 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-05-10 09:01:36,998 INFO: Waiting for log aggregation to finish.
2025-05-10 09:01:48,514 INFO: Execution finished successfully.


Execution('SUCCEEDED', 'FINISHED', '2025-05-10T12:59:45.000Z', '-op offline_fg_materialization -path hdfs:///Projects/sp25_taxi_vaibhav/Resources/jobs/bike_lag672_top3_fg_1_offline_fg_materialization/config_1746868049058')

In [17]:
# Verify Features in Hopsworks
import os

import hopsworks
import pandas as pd

# Log in with the same environment variables the ingestion cells use, so the
# target project is explicit rather than left to login()'s defaults.
project = hopsworks.login(
    project=os.environ["HOPSWORKS_PROJECT_NAME"],
    api_key_value=os.environ["HOPSWORKS_API_KEY"],
)
fs = project.get_feature_store()

# Define expected FG and features
fg_name = 'bike_lag672_top3_fg'
fg_version = 1
expected_lags = [f'lag_{i}' for i in range(1, 673)]  # lag_1 ... lag_672
required_cols = [
    'start_station_id', 'start_hour', 'target_t_plus_1',
    'rides', 'hour', 'dow', 'doy',
    'sin_hour', 'cos_hour', 'sin_dow', 'cos_dow',
    'is_weekend', 'is_holiday', 'rollmean_24', 'rollmean_168'
] + expected_lags

# Load the feature group
fg = fs.get_feature_group(name=fg_name, version=fg_version)
df = fg.read()

# Check shape and column match
print(f"DataFrame shape: {df.shape}")
missing_cols = set(required_cols) - set(df.columns)
if missing_cols:
    print(f"Missing columns: {sorted(missing_cols)}")
else:
    print("All expected columns present, including all 672 lag features.")

# Check for missing values.
# Restrict the check to columns that actually exist: indexing with the full
# required_cols list would raise KeyError right after reporting missing columns.
present_cols = [col for col in required_cols if col in df.columns]
nulls = df[present_cols].isna().sum()
print("Top columns with missing values:")
print(nulls[nulls > 0].sort_values(ascending=False).head(10))


2025-05-10 09:04:16,674 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 09:04:16,681 INFO: Initializing external client
2025-05-10 09:04:16,681 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-05-10 09:04:17,392 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214684
2025-05-10 09:04:21,081 ERROR: No data found for featuregroup sp25_taxi_vaibhav.bike_lag672_top3_fg_1. Detail: Python exception: FlyingDuckException. gRPC client debug context: UNKNOWN:Error received from peer ipv4:51.79.26.27:5005 {grpc_message:"No data found for featuregroup sp25_taxi_vaibhav.bike_lag672_top3_fg_1. Detail: Python exception: FlyingDuckException", grpc_status:2, created_time:"2025-05-10T09:04:21.081128-04:00"}. Client context: IOError: Server never sent a data message. Detail: Internal
Traceback (most recent call last):
  File "/opt/anaconda3/envs/citibike/lib/python3.12/site-packages/hsfs/core/arrow_flight_client.py", line 364, in afs_error_handler_wrapper
    self._certificates_json = json.dumps(self._certificates()).encode("utf-8")
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/citibike/lib/python3.12/si

FeatureStoreException: Could not read data using Hopsworks Feature Query Service.