In [3]:
import sys
import os

src_path = os.environ["src_path"]
sys.path.append(src_path)

In [7]:
import src.components.feature_group_config as config
from comet_ml import Experiment

In [8]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value= config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_METADATA.name,
    version = config.FEATURE_GROUP_METADATA.version
)

2025-03-12 14:47:43,766 INFO: Initializing external client
2025-03-12 14:47:43,769 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-12 14:47:44,748 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214715


RestAPIError: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1214715/featurestores/1203325/featuregroups/ticket_demand_feature_group2). Server response: 
HTTP code: 404, HTTP reason: Not Found, body: b'{"errorCode":270009,"usrMsg":"feature group name: ticket_demand_feature_group2 feature group version: 3","errorMsg":"Featuregroup wasn\'t found."}', error code: 270009, error msg: Featuregroup wasn't found., user msg: feature group name: ticket_demand_feature_group2 feature group version: 3

In [12]:
# create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet.
    feature_store.create_feature_view(
        name = config.FEATURE_VIEW_METADATA.name,
        version = config.FEATURE_VIEW_METADATA.version
    )
except:
    print('Feature view already existed. Skip creation.')

# get feature view
feature_view = feature_store.get_feature_view(
    name = config.FEATURE_VIEW_METADATA.name,
    version = config.FEATURE_VIEW_METADATA.version
)


Feature view already existed. Skip creation.


RestAPIError: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1214715/featurestores/1203325/featureview/ticket_demand_feature_view2/version/3). Server response: 
HTTP code: 404, HTTP reason: Not Found, body: b'{"errorCode":270181,"usrMsg":"There exists no feature view with the name ticket_demand_feature_view2 and version 3.","errorMsg":"Feature view wasn\'t found."}', error code: 270181, error msg: Feature view wasn't found., user msg: There exists no feature view with the name ticket_demand_feature_view2 and version 3.

In [11]:
data, _ = feature_view.training_data(
    description = "Time-series hourly ticket demand values."
)

NameError: name 'feature_view' is not defined

In [9]:
# drop `date` column
data.drop('seconds', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour` 
data.sort_values(by=['sub_region_code', 'date'], inplace=True)
data

NameError: name 'data' is not defined

In [10]:
# transform the batch of data to features and target
from src.components.data_info import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    data,
    input_seq_len=24*28*1,
    step_size=23
)

features_and_target = features.copy()
features_and_target['target_demand_value_next_hour'] = targets

print(f'{features_and_target.shape=}')

NameError: name 'data' is not defined

In [None]:
features_and_target

In [None]:
features_and_target.date.min()

In [None]:
features_and_target.date.max()

In [None]:
# split the data
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.components.data_info import train_test_split

cutoff_date = pd.to_datetime(date.today() - timedelta(days=60), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name = 'target_demand_values_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')
print(f"Training data range: {X_train['date'].min()} to {X_train['date'].max()}")
print(f"Testing data range: {X_test['date'].min()} to {X_test['date'].max()}")

In [None]:
x_tr = X_train.drop(['date'], axis=1)
x_ts = X_test.drop(['date'],axis=1)

In [None]:
# baseline model Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create and train the linear regression model
model = LinearRegression()
model.fit(x_tr, y_train)

In [None]:
y_pred = model.predict(x_ts)
y_pred

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error

def evaluate_model(y_test, y_pred):
    test_mae = mean_absolute_error(y_test, y_pred)
    test_mape = mean_absolute_percentage_error(y_test, y_pred)
    return f"MAE is {test_mae:.4f} and MAPE is {test_mape:.4f}"

In [None]:
evaluate_model(y_test, y_pred)

In [None]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.components.model_info import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # picking hyper-parameters
    hyperparams = {
        "metrics": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves",2,256),
        "feature_fraction":trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100)
    }

    tss = KFold(n_splits=5)
    scores = []

    for train_index, val_index in tss.split(X_train):
        #split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index, :], y_train.iloc[val_index]

        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train, y_train)

        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    return np.array(scores).mean()

In [None]:
import warnings
warnings.filterwarnings("ignore")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trails=6)

In [None]:
best_params = study.best_trial.params
print(f'{best_params=}')

In [None]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [None]:
from src.components.model_info import evaluate_model

predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae:.4f}')

In [None]:
# plot the result
from src.plot import plot_one_sample

plot_one_sample(
    example_id=1,
    features=X_test,
    targets=y_test,
    predictions = pd.Series(predictions)
)

In [None]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR/'LGB_model.pkl')

In [None]:
# in order to save to model for model registry we have to create schema first
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)


In [None]:
model_registry = project.get_model_registry()
model = model_registry.sklearn.create_model(
    name="ticket_demand_predictor_next_hour",
    metrics = {"test_mae": test_mae},
    description = "LightGBM regressor with a bit of hyper-parameter tuning",
    input_example = X_train.sample(),
    model_schema = model_schema
)

model.save(str(MODELS_DIR/'LGB_model.pkl'))