# Snowflake Online Feature Store: Predicting Taxi Trip Durations

This notebook demonstrates how to use the Snowflake Online Feature Store and Snowpark ML to build features, register feature views, and train a model to predict taxi trip durations in New York City. It covers entity and feature view registration, feature engineering, online and offline store usage, and real-time inference.


In [None]:
import pandas as pd
from snowflake.ml.feature_store import CreationMode, FeatureStore, feature_view
from snowflake.ml.feature_store.entity import Entity
from snowflake.snowpark import Session
from snowflake.snowpark.exceptions import SnowparkSQLException
from snowflake.snowpark.functions import avg, col, datediff, dayofweek, hour, month, nullifzero, when
from snowflake.snowpark.types import DecimalType
# Show every column when pandas DataFrames are printed (no column truncation)
pd.set_option('display.max_columns', None)

In [None]:
# Snowflake connection parameters.
# Replace every "<...>" placeholder before running, and avoid committing a real token:
# the "password" value is reused further down in this notebook to build the HTTP
# Authorization header for the SPCS endpoint.
connection_parameters = {
    "account": "<account_name>",
    "user": "<username>",
    "password": "<programmatic_access_token>",
    "role": "<role>",
    # NOTE(review): presumably this entry should be dropped when private link is not used — confirm
    "host": "<host - if using private link>",
    "warehouse": "<warehouse>",
    "database": "<database>",
    "schema": "<schema>"
}
session = Session.builder.configs(connection_parameters).create()

# If you are in an interactive Notebook session, you can instead get the active session with:
# from snowflake.snowpark.context import get_active_session
# session = get_active_session()

# The taxi table is addressed in the session's current database and schema
TAXI_DB = session.get_current_database()
TAXI_SCHEMA = session.get_current_schema()
TAXI_TABLE = "NYC_YELLOW_TRIPS"
TAXI_TABLE_FULL_NAME = f"{TAXI_DB}.{TAXI_SCHEMA}.{TAXI_TABLE}"

## Data Setup
Load a sample dataset of NYC taxi trips. The data includes pickup and dropoff location IDs, timestamps, fare amounts, and more.

In [None]:
# One-time setup: make sure the example trip table is available
try:
    # A zero-row read is enough to find out whether the table can be queried
    session.table(TAXI_TABLE_FULL_NAME).limit(0).collect()
except SnowparkSQLException:
    # The probe failed, so load the bundled example data and preview each table
    print("Loading NYC taxi table")
    from snowflake.ml.feature_store.examples.example_helper import ExampleHelper
    example_helper = ExampleHelper(session, session.get_current_database(), session.get_current_schema())
    source_tables = example_helper.load_example('new_york_taxi_features')
    for table in source_tables:
        print(f"{table}:")
        print(session.table(table).limit(5).to_pandas())
else:
    print("NYC taxi table already exists")

## Feature Engineering with Snowflake Feature Store

In [None]:
# Set up Feature Store and context.
# The feature store uses the same database and schema as the taxi table, and the
# session's current warehouse as its default warehouse.
fs = FeatureStore(
    session=session,
    database=TAXI_DB,
    name=TAXI_SCHEMA,
    default_warehouse=session.get_current_warehouse(),
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

## Register Entities
Entities represent the keys used to join features. Here we define two entities: a route (pickup and dropoff location IDs) and a pickup time (hour and day of week).


In [None]:
# Define and register entities
# Route: one (pickup location, dropoff location) pair
route_entity = Entity(
    name="route",
    join_keys=["PULOCATIONID", "DOLOCATIONID"],
    desc="A taxi route defined by pickup and dropoff location IDs."
)
# Pickup time: one (hour, day-of-week) bucket; both columns are derived from
# TPEP_PICKUP_DATETIME in the feature engineering cell
pickup_time_entity = Entity(
    name="pickup_time",
    join_keys=["PICKUP_HOUR", "PICKUP_DAY_OF_WEEK"],
    desc="Pickup time bucketed by hour and day of week"
)

fs.register_entity(route_entity)
fs.register_entity(pickup_time_entity)
print("List Entities:")
fs.list_entities().show()

## Load Data and Create Features
Read the NYC taxi trip data and engineer features for ETA, speed, rush hour, and more.


In [None]:
# Load data
df = session.table(TAXI_TABLE_FULL_NAME)

# Create features
# The *_INT columns are whole-number casts; they are the inputs of the rolling
# averages computed in the next cell.
df = df.with_column("TRIP_DISTANCE_INT", col("TRIP_DISTANCE").cast(DecimalType(10, 0)))
# Trip duration in minutes (seconds between pickup and dropoff, divided by 60).
# This is the prediction label used later in the notebook.
df = df.with_column("ETA_MINUTES", (datediff("second", col("TPEP_PICKUP_DATETIME"), col("TPEP_DROPOFF_DATETIME")) / 60.0))
df = df.with_column("ETA_MINUTES_INT", col("ETA_MINUTES").cast(DecimalType(10, 0)))
df = df.with_column("PICKUP_HOUR", hour(col("TPEP_PICKUP_DATETIME")))
df = df.with_column("PICKUP_DAY_OF_WEEK", dayofweek(col("TPEP_PICKUP_DATETIME")))  # 0=Sunday, 6=Saturday
df = df.with_column("IS_WEEKEND", when(col("PICKUP_DAY_OF_WEEK").isin([0, 6]), 1).otherwise(0))
df = df.with_column("PICKUP_MONTH", month(col("TPEP_PICKUP_DATETIME")))
# Distance divided by duration in hours; nullifzero guards the division for zero-length trips
df = df.with_column("SPEED_MPH", df["TRIP_DISTANCE"] / nullifzero((df["ETA_MINUTES"] / 60)))
df = df.with_column("SPEED_MPH_INT", col("SPEED_MPH").cast(DecimalType(10, 0)))
# Rush hour is flagged purely by pickup hour (7-9 and 16-19), on every day of the week
df = df.with_column("IS_RUSH_HOUR", when((col("PICKUP_HOUR").isin([7, 8, 9, 16, 17, 18, 19])),1).otherwise(0))
# A loop route starts and ends in the same location ID
df = df.with_column("IS_LOOP_ROUTE", when(col("PULOCATIONID") == col("DOLOCATIONID"), 1).otherwise(0))
print("Trip Data with Basic Features:")
df.show(n=10)

In [None]:
# Time-series rolling aggregations for route using Snowpark analytics.time_series_agg
def _ts_col_namer(input_col, agg, window):
    """Name the output column for one (input column, aggregation, window) combination.

    The three route-level averages get short fixed names. Any other combination
    gets a generated ``<agg>_<input_col>_<window>`` name in which every ``-`` of
    the window string is replaced by ``past_``.
    """
    route_average_names = {
        "ETA_MINUTES_INT": "AVG_ETA_ROUTE",
        "SPEED_MPH_INT": "AVG_SPEED_ROUTE",
        "TRIP_DISTANCE_INT": "AVG_DISTANCE_ROUTE",
    }
    if agg == "AVG" and input_col in route_average_names:
        return route_average_names[input_col]
    window_label = window.replace('-', 'past_')
    return f"{agg}_{input_col}_{window_label}"

# Averaging on int columns allows the table to refresh incrementally
# Rolling averages per route (pickup/dropoff pair) over a "-30D" window, ordered by
# pickup time. _ts_col_namer names the three outputs AVG_ETA_ROUTE, AVG_SPEED_ROUTE
# and AVG_DISTANCE_ROUTE.
df = df.analytics.time_series_agg(
    aggs={
        "ETA_MINUTES_INT": ["AVG"],
        "SPEED_MPH_INT": ["AVG"],
        "TRIP_DISTANCE_INT": ["AVG"],
    },
    windows=["-30D"],
    group_by=["PULOCATIONID", "DOLOCATIONID"],
    time_col="TPEP_PICKUP_DATETIME",
    col_formatter=_ts_col_namer,
)

print("Trip Data with Aggregated Features (time-series):")
df.show()

In [None]:
# Select features for the feature view
feature_df = df.select(
    # Columns direct from table
    # NOTE(review): FARE_AMOUNT, TOTAL_AMOUNT and TIP_AMOUNT are presumably only known
    # once a trip has ended — confirm whether they should remain model features.
    "PULOCATIONID", "DOLOCATIONID", "TRIP_DISTANCE",
    "VENDORID", "FARE_AMOUNT", "TOTAL_AMOUNT", "TPEP_PICKUP_DATETIME",
    "TIP_AMOUNT", "TOLLS_AMOUNT",
    # Feature columns created from table.
    # The label ETA_MINUTES is excluded, and so is SPEED_MPH: it is computed as
    # TRIP_DISTANCE / (ETA_MINUTES / 60), so together with TRIP_DISTANCE it would
    # hand the model the label directly (target leakage).
    "PICKUP_HOUR", "PICKUP_DAY_OF_WEEK", "IS_WEEKEND", "PICKUP_MONTH",
    "IS_RUSH_HOUR", "IS_LOOP_ROUTE",
    # Feature columns created from aggregating by column combinations
    "AVG_ETA_ROUTE", "AVG_DISTANCE_ROUTE", "AVG_SPEED_ROUTE"
)

## Register Feature View
Create and register a feature view for trip-based features, enabling online serving.


In [None]:
# Define and register the feature view.
# It is keyed by both entities (route and pickup time) and uses the pickup timestamp
# as its timestamp column; online serving is switched on through online_config.
route_fv = feature_view.FeatureView(
    name="nyc_taxi_trip_fv",
    entities=[route_entity, pickup_time_entity],
    feature_df=feature_df,
    timestamp_col="TPEP_PICKUP_DATETIME",
    refresh_freq="60s", # Dynamic Table refresh minimum
    desc="Trip-based features for taxi ETA prediction",
    online_config=feature_view.OnlineConfig(enable=True, target_lag="10s"),
)

# NOTE(review): overwrite=True presumably replaces an existing "v1" of this view — confirm
registered_route_fv = fs.register_feature_view(route_fv, "v1", overwrite=True)

print("Registered feature view:", registered_route_fv.name, registered_route_fv.version)
print("Online feature table:", registered_route_fv.fully_qualified_online_table_name())

In [None]:
# Check the refresh history of the feature view's online store
fs.get_refresh_history(registered_route_fv, store_type=feature_view.StoreType.ONLINE).show()

In [None]:
# Fetch features from the online store (no keys given, so the read is not
# restricted to specific entities)
online_df = fs.read_feature_view(
    registered_route_fv,
    store_type=feature_view.StoreType.ONLINE,
)
online_df.show()

In [None]:
# List the feature views registered in the feature store
fs.list_feature_views().show()

## Train/Test Split
Split the data into training and test sets for ML model development.


In [None]:
# Create train/test split for ML.
# The spine carries only the label, the entity join keys and the event timestamp;
# the features themselves are joined on when the training sets are generated.
spine_cols = [
    "ETA_MINUTES",           # label
    "PULOCATIONID",          # join key
    "DOLOCATIONID",          # join key
    "PICKUP_HOUR",           # join key
    "PICKUP_DAY_OF_WEEK",    # join key
    "TPEP_PICKUP_DATETIME"   # for point-in-time correctness (as spine_timestamp_col)
]
# 85% / 15% split with a fixed seed
train_spine_df, test_spine_df = df.select(spine_cols).random_split([0.85, 0.15], seed=42)

In [None]:
# Generate datasets with features for training and testing.
# Both calls use the same feature view, label column and timestamp column; only the
# spine and the save_as name differ.
train_df = fs.generate_training_set(
    spine_df=train_spine_df,
    features=[registered_route_fv],
    spine_label_cols=["ETA_MINUTES"],  # Target column for regression
    save_as="TAXI_TRAIN_SET",
    spine_timestamp_col="TPEP_PICKUP_DATETIME"
)
test_df = fs.generate_training_set(
    spine_df=test_spine_df,
    features=[registered_route_fv],
    spine_label_cols=["ETA_MINUTES"],
    save_as="TAXI_TEST_SET",
    spine_timestamp_col="TPEP_PICKUP_DATETIME"
)

In [None]:
# Preview five rows of each generated dataset, then report the row counts
print("Train set sample:")
print(train_df.limit(5).to_pandas())
print("Test set sample:")
print(test_df.limit(5).to_pandas())
print(f"Train set count: {train_df.count()}")
print(f"Test set count: {test_df.count()}")

## Train XGBoost Model
Train an XGBoost regressor using Snowpark ML on the generated training set.


In [None]:
# Train XGBoost model using Snowpark ML
from snowflake.ml.modeling.xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Model inputs: every training-set column except the label and the event timestamp.
# The entity join keys stay in and are used as features.
label_column = "ETA_MINUTES"
non_feature_columns = {label_column, "TPEP_PICKUP_DATETIME"}
feature_columns = [
    column_name
    for column_name in train_df.columns
    if column_name not in non_feature_columns
]

regressor = XGBRegressor(
    input_cols=feature_columns,
    label_cols=[label_column],
    output_cols=["predicted_eta"],
)
regressor.fit(train_df)

In [None]:
# Predict on test set, then pull the result into pandas for scoring
predictions = regressor.predict(test_df)
predictions_pd = predictions.to_pandas()

In [None]:
# Evaluate model: compare the true label with the "predicted_eta" output column
mse = mean_squared_error(predictions_pd[label_column], predictions_pd["predicted_eta"])
r2 = r2_score(predictions_pd[label_column], predictions_pd["predicted_eta"])
print(f"Test MSE: {mse}")
print(f"Test R2: {r2}")

In [None]:
# Feature importances
# Unwrap the underlying native XGBoost model from the Snowpark ML estimator
xgb_native = regressor.to_xgboost()

# The attribute checks let the cell run even if the native object lacks one of the APIs
if hasattr(xgb_native, "feature_importances_"):
    importances = xgb_native.feature_importances_
    # Assumes the importances come back in the same order as feature_columns — TODO confirm
    importance_df = pd.DataFrame({
        'feature': feature_columns,
        'importance': importances
    }).sort_values(by='importance', ascending=False)
    print("Feature importances:")
    print(importance_df)

if hasattr(xgb_native, "get_booster"):
    booster = xgb_native.get_booster()

    # Per-feature scores with importance_type 'weight', highest first
    weights = booster.get_score(importance_type='weight')
    weights_df = pd.DataFrame(list(weights.items()), columns=['feature', 'weight']).sort_values(by='weight', ascending=False)
    print("Booster weights:")
    print(weights_df)

    # Per-feature scores with importance_type 'gain', highest first
    gains = booster.get_score(importance_type='gain')
    gains_df = pd.DataFrame(list(gains.items()), columns=['feature', 'gain']).sort_values(by='gain', ascending=False)
    print("Booster gains:")
    print(gains_df)

## Real-time Inference Example
Fetch the latest features from the online store and predict duration for a new trip.

In [None]:
def predict_trip_duration(pu_location_id, do_location_id, pickup_hour, pickup_day_of_week):
    """Predict a trip duration from the features currently held in the online store.

    The four arguments form one lookup key, passed in the order pickup location,
    dropoff location, pickup hour, pickup day of week.

    Returns:
        The output of ``regressor.predict`` for the fetched feature rows, or
        ``None`` when the online store has no row for the key.
    """
    lookup_keys = [[pu_location_id, do_location_id, pickup_hour, pickup_day_of_week]]

    # Read the latest feature values for this key from the online store
    online_features = fs.read_feature_view(
        registered_route_fv,
        keys=lookup_keys,
        store_type=feature_view.StoreType.ONLINE,
    ).to_pandas()

    if not online_features.empty:
        print("Online features:")
        print(online_features)
        return regressor.predict(online_features)

    print("No online features found, skipping prediction")
    return None
    

# Route 141 -> 236, pickup hour 8, day of week 0 (Sunday in the 0=Sunday encoding used above)
prediction = predict_trip_duration(141, 236, 8, 0)
if prediction is not None:
    print("Predicted Trip Duration (minutes):")
    print(prediction['predicted_eta'])

## New Data Arrives
Simulate a recently ended trip between the same pickup and dropoff location IDs that also occurred on a Sunday between 8am and 9am. In this case, the trip takes longer than normal.

In [None]:
from datetime import datetime
from snowflake.snowpark.functions import max as sf_max, coalesce, lit

# Next TRIP_ID: current maximum plus one (the coalesce turns a NULL maximum into 0)
max_trip_id = coalesce(sf_max("TRIP_ID"), lit(0))
next_trip_id = session.table(TAXI_TABLE_FULL_NAME).select(max_trip_id + lit(1)).first()[0]

# The new trip as one column -> value mapping, so names and values cannot drift apart.
# NOTE(review): the append below presumably matches columns by position, so this
# order must mirror the table's column order — confirm.
new_trip = {
    "VENDORID": 1,
    "PASSENGER_COUNT": 1,
    "TRIP_DISTANCE": 3.8,
    "RATECODEID": 1,
    "STORE_AND_FWD_FLAG": "N",
    "PULOCATIONID": 141,
    "DOLOCATIONID": 236,
    "PAYMENT_TYPE": 1,
    "FARE_AMOUNT": 14.50,
    "EXTRA": 3.00,
    "MTA_TAX": 0.50,
    "TIP_AMOUNT": 3.65,
    "TOLLS_AMOUNT": 0.00,
    "IMPROVEMENT_SURCHARGE": 0.30,
    "TOTAL_AMOUNT": 29.95,
    "CONGESTION_SURCHARGE": 4.5,
    "AIRPORT_FEE": 0,
    # 20-minute trip, picked up at 08:55 on 2025-08-31
    "TPEP_PICKUP_DATETIME": datetime(2025, 8, 31, 8, 55, 0),
    "TPEP_DROPOFF_DATETIME": datetime(2025, 8, 31, 9, 15, 0),
    "TRIP_ID": next_trip_id,
}

# Single-row DataFrame built from the mapping (dicts keep insertion order)
new_trip_df = session.create_dataframe(
    [tuple(new_trip.values())],
    schema=list(new_trip),
)

# Append to the existing table
new_trip_df.write.mode("append").save_as_table(TAXI_TABLE_FULL_NAME)
print("Inserted 1 new trip row into", TAXI_TABLE_FULL_NAME)

## Prediction Reflects Latest Data
After a maximum of 70 seconds (60 seconds of data lag for the Dynamic Table and 10 for the Online Feature Table), the latest data has been transformed into its features and is being used for inference. The predicted ETA has increased since the most recently completed trip took longer than normal.

In [None]:
# Same key as the earlier prediction (route 141 -> 236, hour 8, day 0), queried
# again after the new trip row was appended
prediction = predict_trip_duration(141, 236, 8, 0)
if prediction is not None:
    print("Predicted Trip Duration (minutes):")
    print(prediction['predicted_eta'])

# Read Feature View During Online Serving (optional)
## Save the Model
Log the trained model to the Snowflake Model Registry.

In [None]:
from snowflake.ml.registry import Registry

# Open the model registry through the current session
registry = Registry(session=session)
model_name = "NYC_TAXI_ETA_XGB"

# Log the fitted regressor as version "v1", attaching the test-set metrics computed above
mv = registry.log_model(
    model=regressor,
    model_name=model_name,
    comment="Predict NYC taxi trip durations (ETA) using Feature Store features",
    metrics={"test_mse": float(mse), "test_r2": float(r2)},
    version_name="v1",
)
print("Logged model version:", mv)
registry.show_models()

## Deploy the Model to Snowpark Container Services (SPCS)

Deploy the logged model version to SPCS. This builds a container with the model’s dependencies, creates a service in a compute pool, and exposes service functions for inference.

Prerequisites:
- Existing compute pool with USAGE (or OWNERSHIP) for your role.
- You can create one like this:
```sql
CREATE COMPUTE POOL IF NOT EXISTS trip_eta_prediction_pool
  MIN_NODES = 1
  MAX_NODES = 1
  INSTANCE_FAMILY = 'CPU_X64_M'
  AUTO_RESUME = TRUE;
GRANT USAGE ON COMPUTE POOL trip_eta_prediction_pool TO ROLE <your_role>;
GRANT BIND SERVICE ENDPOINT ON ACCOUNT TO ROLE <your_role>;
SHOW COMPUTE POOLS LIKE 'trip_eta_prediction_pool';
```

References:
- Snowflake docs: [Model Serving in SPCS](https://docs.snowflake.com/en/developer-guide/snowflake-ml/model-registry/container#label-model-registry-container-xgboost-cpu-inference)
- Quickstart: [Deploy custom models to Model Registry](https://quickstarts.snowflake.com/guide/deploying_custom_models_to_snowflake_model_registry/index.html?index=..%2F..index#0)
- Quickstart: [Snowpark Container Services Model Serving Guide](https://quickstarts.snowflake.com/guide/snowpark-container-services-model-serving-guide/index.html?index=..%2F..index#3)


In [None]:
# Look up version "v1" of the logged model
latest_model = registry.get_model(model_name).version("v1")
print("Deploying model:", latest_model)

# Create the inference service in the compute pool named in the prerequisites above
service_name = "NYC_TAXI_ETA_V1"
latest_model.create_service(
    service_name=service_name,
    service_compute_pool="trip_eta_prediction_pool",
    ingress_enabled=True  # expose the service
)

## Use the Model in SPCS

In [None]:
# Direct prediction from the model prediction service in Python

# Check estimated duration from pickup location 141 to dropoff location 236 at 8am on a Sunday
trip = [[141, 236, 8, 0]]

# Read the current feature values for this key from the online store
features_df = fs.read_feature_view(
    registered_route_fv,
    keys=trip,
    store_type=feature_view.StoreType.ONLINE
)

features_pd = features_df.to_pandas()
if features_pd.empty:
    print("No online features found, skipping prediction")
else:
    print("Online features:")
    print(features_pd)

    # Run the model's predict function on the deployed service rather than locally
    spcs_prediction = latest_model.run(features_pd, function_name='predict', service_name=service_name)
    print("Predicted Trip Duration (minutes) from SPCS:")
    # NOTE(review): the output column is looked up with embedded double quotes,
    # presumably because the lower-case name comes back as a quoted identifier — confirm
    print(spcs_prediction['\"predicted_eta\"'])

In [None]:
# Build the HTTP predict URL from the service's ingress endpoint
services_df = latest_model.list_services()
# You can also view the inference endpoint via SQL: "show endpoints in service {service_name}"
# Use the endpoint of the first listed service
base = str(services_df["inference_endpoint"].iloc[0])
URL = f"https://{base}/predict"
print("Service predict URL:", URL)

### Use the ingress endpoint to get the service prediction via HTTP

This example authenticates API access with the programmatic access token (PAT) from the connection parameters. Most services will prefer key-pair authentication. An example of using key-pair authentication with a JSON Web Token (JWT) to access SPCS services is [linked here](https://docs.snowflake.com/en/developer-guide/snowpark-container-services/tutorials/tutorial-1#optional-access-the-public-endpoint-programmatically).

In [None]:
import json
import numpy as np
from pprint import pprint
import requests

# Header uses Programmatic Access Token (PAT) from the connection to authenticate.
# The token is read from connection_parameters["password"], so that entry must hold
# a real token rather than the placeholder.
headers = {'Authorization': f'Snowflake Token=\"{connection_parameters["password"]}\"'}

def _json_safe(value):
    """Convert one cell value into something a strict JSON encoder accepts."""
    # NumPy scalars (np.int64, np.float64, ...) become plain Python scalars
    if isinstance(value, np.generic):
        value = value.item()
    if value is None or value is pd.NA:
        return None
    # Remaining missing markers (NaN, NaT) become JSON null. The scalar guard keeps
    # list/array values away from pd.isna, which returns an array for them.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def build_service_payload(features_pd, columns=None):
    """Build the JSON request body for the model service from a pandas DataFrame.

    Args:
        features_pd: DataFrame holding at least the model's input columns.
        columns: Ordered column names to send. Defaults to the notebook-level
            ``feature_columns`` (the inputs the regressor was trained on).

    Returns:
        ``{"data": [[row_index, {column_name: value, ...}], ...]}`` with one entry
        per row of ``features_pd``. Datetime columns are formatted as
        ``YYYY-MM-DD HH:MM:SS`` strings and missing values are sent as ``None``.

    Raises:
        KeyError: if a requested column is absent from ``features_pd``.
    """
    if columns is None:
        columns = feature_columns

    # Select exactly the model's expected input columns (order matters)
    payload_df = features_pd[list(columns)].copy()

    # Convert datetime columns to strings
    for col_name, dtype in payload_df.dtypes.items():
        if str(dtype).startswith("datetime"):
            payload_df[col_name] = payload_df[col_name].dt.strftime("%Y-%m-%d %H:%M:%S")

    # Clean value by value: DataFrame.where(..., None) leaves NaN in float columns,
    # which is not valid JSON.
    records = [
        {name: _json_safe(value) for name, value in record.items()}
        for record in payload_df.to_dict(orient="records")
    ]

    # Build payload: each row is [row_index, {column_name: value, ...}]
    return {"data": [[i, rec] for i, rec in enumerate(records)]}

# Build JSON payload from the online features fetched in the SPCS prediction cell above
data = build_service_payload(features_pd)

# Send over HTTP
def send_request(data: dict, timeout: float = 60.0):
    """POST ``data`` as JSON to the service ``URL`` and return the raw response body.

    Args:
        data: JSON-serializable request payload.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive endpoint.

    Returns:
        The response body as bytes.

    Raises:
        AssertionError: if the service answers with any status other than 200.
    """
    response = requests.post(URL, json=data, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("Response body (error):", response.text)
        # Raised explicitly (not via `assert`) so the check survives `python -O`;
        # the exception type is kept for backward compatibility.
        raise AssertionError(
            f"Failed to get response from the service. Status code: {response.status_code}"
        )
    return response.content

# Do the prediction: send the payload to the service, then decode the JSON response body
results = send_request(data=data)
print("Predicted Trip Duration (minutes) from SPCS via HTTP:")
pprint(json.loads(results))