In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import src.config as config

In [4]:
try:
    HOPSWORKS_PROJECT_NAME = "taxidemand_predict"
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except:
    raise Exception(
        'Create an .env file on the project root with the HOPSWORKS_PROJECT_NAME and HOPSWORKS_API_KEY'
    )

In [5]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-08-11 06:37:29,623 INFO: Initializing external client
2025-08-11 06:37:29,624 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-08-11 06:37:33,776 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1242261


In [6]:
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')

Feature view already existed. Skip creation.


In [7]:
# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

In [8]:
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')

feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already existed. Skip creation.


In [10]:
print("feature_group:", feature_group)
print("feature_view:", feature_view)

feature_group: <hsfs.feature_group.FeatureGroup object at 0x7e30b85025a0>
feature_view: <hsfs.feature_view.FeatureView object at 0x7e3075680770>


In [11]:
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.62s) 




In [12]:
ts_data.drop('pickup_ts', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
65603,2025-07-06 17:00:00+00:00,4,1
29704,2025-07-06 18:00:00+00:00,2,1
133049,2025-07-06 19:00:00+00:00,0,1
36236,2025-07-06 20:00:00+00:00,1,1
141658,2025-07-06 21:00:00+00:00,0,1
...,...,...,...
98057,2025-08-03 12:00:00+00:00,5,265
174369,2025-08-03 13:00:00+00:00,5,265
58460,2025-08-03 14:00:00+00:00,2,265
149374,2025-08-03 15:00:00+00:00,4,265


In [13]:
from typing import Optional, List
import pandas as pd
import plotly.express as px 

def plot_ts(
    ts_data: pd.DataFrame,
    locations: Optional[List[int]] = None
    ):
    """
    Plot time-series data
    """
    ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data

    fig = px.line(
        ts_data_to_plot,
        x="pickup_hour",
        y="rides",
        color='pickup_location_id',
        template='none',
    )

    fig.show()

plot_ts(ts_data, locations=[43])

In [14]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*27, # one month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 265/265 [00:03<00:00, 84.99it/s]

features_and_target.shape=(265, 651)





In [15]:
features_and_target

Unnamed: 0,rides_previous_648_hour,rides_previous_647_hour,rides_previous_646_hour,rides_previous_645_hour,rides_previous_644_hour,rides_previous_643_hour,rides_previous_642_hour,rides_previous_641_hour,rides_previous_640_hour,rides_previous_639_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,4.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2.0,0.0,1.0,3.0,2025-08-02 17:00:00+00:00,1,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-08-02 17:00:00+00:00,2,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-08-02 17:00:00+00:00,3,0.0
3,3.0,6.0,2.0,3.0,0.0,1.0,0.0,4.0,0.0,0.0,...,5.0,7.0,11.0,21.0,14.0,5.0,4.0,2025-08-02 17:00:00+00:00,4,7.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-08-02 17:00:00+00:00,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,43.0,34.0,32.0,15.0,11.0,17.0,6.0,3.0,0.0,0.0,...,36.0,44.0,49.0,50.0,64.0,56.0,48.0,2025-08-02 17:00:00+00:00,261,70.0
261,37.0,41.0,27.0,13.0,17.0,17.0,9.0,4.0,1.0,1.0,...,78.0,109.0,100.0,100.0,72.0,64.0,51.0,2025-08-02 17:00:00+00:00,262,93.0
262,88.0,71.0,73.0,68.0,60.0,33.0,20.0,15.0,5.0,1.0,...,106.0,127.0,139.0,134.0,123.0,94.0,108.0,2025-08-02 17:00:00+00:00,263,169.0
263,20.0,23.0,19.0,13.0,14.0,10.0,6.0,7.0,3.0,2.0,...,7.0,8.0,13.0,19.0,23.0,18.0,15.0,2025-08-02 17:00:00+00:00,264,23.0


In [None]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# training data -> from January 2022 up until 2 months ago
# test data -> last 2 months
# cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1), utc=True)
# cutoff_date = features_and_target["pickup_hour"].min() + (features_and_target["pickup_hour"].max() - features_and_target["pickup_hour"].min()) / 2


# print(f'{cutoff_date=}')


# X_train, y_train, X_test, y_test = train_test_split(
#     features_and_target,
#     cutoff_date,
#     target_column_name='target_rides_next_hour'   
# )

# print(f'{X_train.shape=}')
# # print(f'{y_train.shape=}')
# # print(f'{X_test.shape=}')
# # print(f'{y_test.shape=}')
# 

In [28]:
print(features_and_target["pickup_hour"].nunique())
print(features_and_target["pickup_hour"].min())
print(features_and_target["pickup_hour"].max())
print(features_and_target["pickup_hour"].head(10))


1
2025-08-02 17:00:00+00:00
2025-08-02 17:00:00+00:00
0   2025-08-02 17:00:00+00:00
1   2025-08-02 17:00:00+00:00
2   2025-08-02 17:00:00+00:00
3   2025-08-02 17:00:00+00:00
4   2025-08-02 17:00:00+00:00
5   2025-08-02 17:00:00+00:00
6   2025-08-02 17:00:00+00:00
7   2025-08-02 17:00:00+00:00
8   2025-08-02 17:00:00+00:00
9   2025-08-02 17:00:00+00:00
Name: pickup_hour, dtype: datetime64[ns, UTC]


In [33]:
split_index = int(len(features_and_target) * 0.8)
train_df = features_and_target.iloc[:split_index]
test_df  = features_and_target.iloc[split_index:]
print(split_index)
print(train_df.shape)
print(test_df.shape)

212
(212, 651)
(53, 651)


In [48]:
from src.data_split import train_test_split

# First try time-based split
X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date=pd.Timestamp("2025-07-20", tz="UTC"),
    target_column_name='target_rides_next_hour'
)

# If train set empty, fall back to index-based split
if X_train.shape[0] == 0:
    split_index = int(len(features_and_target) * 0.8)
    train_df = features_and_target.iloc[:split_index]
    test_df  = features_and_target.iloc[split_index:]
    X_train = train_df.drop(columns=["target_rides_next_hour"])
    y_train = train_df["target_rides_next_hour"]
    X_test  = test_df.drop(columns=["target_rides_next_hour"])
    y_test  = test_df["target_rides_next_hour"]

print("X_train.shape =", X_train.shape)
print("y_train.shape =", y_train.shape)
print("X_test.shape =", X_test.shape)
print("y_test.shape =", y_test.shape)


X_train.shape = (212, 650)
y_train.shape = (212,)
X_test.shape = (53, 650)
y_test.shape = (53,)


In [49]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
    
    # sort X_train by `pikup_hour` inplace
    # so the TimeSeriesSplit will split the data in a consistent way
    X_train.sort_values('pickup_hour', inplace=True)

    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [50]:
study = optuna.create_study(direction="minimize")

[I 2025-08-11 07:22:46,558] A new study created in memory with name: no-name-ccce74ed-8d5d-49ea-9cbd-d49ae16abac2


In [51]:
study.optimize(objective, n_trials=1)

[I 2025-08-11 07:22:50,877] Trial 0 finished with value: 47.73759564778832 and parameters: {'num_leaves': 219, 'feature_fraction': 0.47647643024714564, 'bagging_fraction': 0.8649503915544099, 'min_child_samples': 20}. Best is trial 0 with value: 47.73759564778832.


In [52]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 219, 'feature_fraction': 0.47647643024714564, 'bagging_fraction': 0.8649503915544099, 'min_child_samples': 20}


In [53]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('functiontransformer', ...), ('temporalfeaturesengineer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function ave...x7e2ffacb9440>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,boosting_type,'gbdt'
,num_leaves,219
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [56]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=64.7307


In [57]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['/home/talha-naeem/Documents/LLM Work/taxi_demand/models/model.pkl']

In [58]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [60]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(str(MODELS_DIR / 'model.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /home/talha-naeem/Documents/LLM Work/taxi_demand/models/model.pkl: 0.000%|          | 0/129940 elaps…

Uploading /home/talha-naeem/Documents/LLM Work/taxi_demand/notebooks/input_example.json: 0.000%|          | 0/…

Uploading /home/talha-naeem/Documents/LLM Work/taxi_demand/notebooks/model_schema.json: 0.000%|          | 0/5…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1242261/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)