# Partitioned Custom ML Model with Model Registry

### Partitioned restaurant traffic forecasting model

The dataset is loaded locally from the `Partitioned_Custom_Model_Restaurant_Traffic_Data.csv` file.

Change `"MY_DB"` and `"MY_SCHEMA"` to your desired existing database and schema.

In [1]:
from datetime import timedelta

import pandas as pd

from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
from snowflake.ml.registry import registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F


session = Session.builder.configs(SnowflakeLoginOptions(connection_name="my_connection")).create()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


[Row(status='Statement executed successfully.')]

In [2]:
REGISTRY_DATABASE_NAME = "MY_DB"
REGISTRY_SCHEMA_NAME = "MY_SCHEMA"

reg = registry.Registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)

#### The dataset contains an epoch timestamp in milliseconds, a store ID which will later be used as a partition column, a feature column `COLLEGE_TOWN`, and a target to be forecasted, `HOURLY_TRAFFIC`.

In [3]:
# Load data from csv file into pandas dataframe.
test_df_pandas = pd.read_csv("Partitioned_Custom_Model_Restaurant_Traffic_Data.csv")
test_df = session.create_dataframe(test_df_pandas)
test_df.show()

--------------------------------------------------------------------
|"EPOCH"          |"STORE_ID"  |"COLLEGE_TOWN"  |"HOURLY_TRAFFIC"  |
--------------------------------------------------------------------
|1529154000000.0  |1.0         |1.0             |82                |
|1529182800000.0  |1.0         |1.0             |2                 |
|1529247600000.0  |1.0         |1.0             |35                |
|1529269200000.0  |1.0         |1.0             |9                 |
|1529326800000.0  |1.0         |1.0             |114               |
|1529514000000.0  |1.0         |1.0             |24                |
|1529697600000.0  |1.0         |1.0             |31                |
|1529424000000.0  |1.0         |1.0             |28                |
|1529575200000.0  |1.0         |1.0             |13                |
|1529931600000.0  |1.0         |1.0             |110               |
--------------------------------------------------------------------



In [4]:
test_df_pandas

Unnamed: 0,EPOCH,STORE_ID,COLLEGE_TOWN,HOURLY_TRAFFIC
0,1.529154e+12,1.0,1.0,82
1,1.529183e+12,1.0,1.0,2
2,1.529248e+12,1.0,1.0,35
3,1.529269e+12,1.0,1.0,9
4,1.529327e+12,1.0,1.0,114
...,...,...,...,...
5209580,1.668722e+12,112.0,0.0,0
5209581,1.668722e+12,140.0,1.0,0
5209582,1.668722e+12,163.0,0.0,0
5209583,1.668722e+12,182.0,0.0,0


In [5]:
class ForecastingModel(custom_model.CustomModel):

    # Use the same decorator as for methods with FUNCTION inference.
    @custom_model.partitioned_inference_api
    def predict(self, df: pd.DataFrame) -> pd.DataFrame:        
        import xgboost

        # Set the time column as our index.
        input_df = df.set_index('EPOCH')
        input_df.index = pd.to_datetime(df['EPOCH'], unit='ms')

        # Generate categorical features using the datetime index.
        input_df['HOUR'] = input_df.index.hour.astype("category")
        input_df['DAY_OF_WEEK'] = input_df.index.dayofweek.astype("category")
        input_df['MONTH'] = input_df.index.month.astype("category")
        input_df['YEAR'] = input_df.index.year.astype("category")
        
        input_df['COLLEGE_TOWN'] = input_df['COLLEGE_TOWN'].astype("category")
        
        # Use get_dummies (one-hot encoding) for categorical features.
        final = pd.get_dummies(data=input_df, columns=['COLLEGE_TOWN', 'HOUR', 'MONTH', 'YEAR', 'DAY_OF_WEEK'])

        # Define the train & forecast split thresholds.
        today = pd.to_datetime('2022-10-01')
        yesterday = today - timedelta(days=1)
        four_weeks = today + timedelta(days=28)
        tomorrow = today + timedelta(days=1)

        # Train data starts on June 16th 2018 and ends on September 30th.
        train = final[(final.index >= pd.to_datetime('16-Jun-2018')) & (final.index <= pd.to_datetime(yesterday))]
        
        # The forecast starts from October 1st 2022 and goes 4 weeks into the future.
        forecast = final[(final.index >= pd.to_datetime(tomorrow)) & (final.index <= pd.to_datetime(four_weeks))]

        # Remove the target from the input dataset, and construct target dataset.
        X_train = train.drop('HOURLY_TRAFFIC', axis=1)
        y_train = train['HOURLY_TRAFFIC']

        X_forecast = forecast.drop('HOURLY_TRAFFIC', axis=1)
        
        # Train an XGBoost regression model.
        model = xgboost.XGBRegressor(n_estimators=200, n_jobs=1)
        model.fit(X_train, y_train, verbose=False)

        # Predict the hourly forecast for the future dates and make sure no predictions are less than zero.
        forecast['PREDICTION'] = model.predict(X_forecast)
        forecast['EPOCH_OUT'] = [t.value // 10**9 for t in forecast.index]
        forecast = forecast[['EPOCH_OUT', 'PREDICTION']]
        forecast = forecast.sort_index()
        forecast.loc[forecast['PREDICTION'] < 0, 'PREDICTION'] = 0

        return forecast

In [6]:
my_forecasting_model = ForecastingModel()

#### The predict method can be tested locally by using a pandas dataframe directly. Here we can run `predict` for a single partition.

In [7]:
my_forecasting_model.predict(test_df_pandas.loc[test_df_pandas['STORE_ID'] == 1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forecast['PREDICTION'] = model.predict(X_forecast)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forecast['EPOCH_OUT'] = [t.value // 10**9 for t in forecast.index]


Unnamed: 0_level_0,EPOCH_OUT,PREDICTION
EPOCH,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-10-02 07:00:00,1664694000,77.787636
2022-10-02 08:00:00,1664697600,76.995056
2022-10-02 09:00:00,1664701200,76.751877
2022-10-02 10:00:00,1664704800,76.600456
2022-10-02 11:00:00,1664708400,96.709358
...,...,...
2022-10-28 18:00:00,1666980000,30.727842
2022-10-28 19:00:00,1666983600,30.608843
2022-10-28 20:00:00,1666987200,31.198336
2022-10-28 21:00:00,1666990800,4.420763


#### Log the model, specifying the `function_type: "TABLE_FUNCTION"` option.

In [9]:
options = {
    "function_type": "TABLE_FUNCTION",
}

mv = reg.log_model(
    my_forecasting_model,
    model_name="forecast",
    version_name="v13",
    conda_dependencies=["pandas", "scikit-learn", "xgboost"],
    options=options,
    signatures={
        "predict": model_signature.ModelSignature(
            inputs=[
                model_signature.FeatureSpec(name="EPOCH", dtype=model_signature.DataType.DOUBLE),
                model_signature.FeatureSpec(name="STORE_ID", dtype=model_signature.DataType.DOUBLE),
                model_signature.FeatureSpec(name="COLLEGE_TOWN", dtype=model_signature.DataType.DOUBLE),
                model_signature.FeatureSpec(name="HOURLY_TRAFFIC", dtype=model_signature.DataType.INT64),
            ],
            outputs=[
                model_signature.FeatureSpec(name="EPOCH_OUT", dtype=model_signature.DataType.FLOAT),
                model_signature.FeatureSpec(name="PREDICTION", dtype=model_signature.DataType.FLOAT),
            ],
        )
    },
)

  return next(self.gen)


#### Use the `run` method for inference, specifying the partition column.

In [10]:
result = mv.run(test_df, partition_column="STORE_ID")
result.select("EPOCH_OUT", "PREDICTION", "STORE_ID").to_pandas()

Unnamed: 0,EPOCH_OUT,PREDICTION,STORE_ID
0,1.664694e+09,77.972305,40.0
1,1.664698e+09,77.546402,40.0
2,1.664701e+09,76.611122,40.0
3,1.664705e+09,76.296654,40.0
4,1.664708e+09,98.838348,40.0
...,...,...,...
86395,1.666980e+09,31.088211,78.0
86396,1.666984e+09,30.569330,78.0
86397,1.666987e+09,32.221611,78.0
86398,1.666991e+09,6.004010,78.0
