# AutoML end-to-end with OpenFE and AutoGluon
This example notebook trains a forecasting model with [AutoGluon](https://auto.gluon.ai/dev/index.html) on the training table data.

The dataset is based on the [London Bike Dataset](https://www.kaggle.com/datasets/kalacheva/london-bike-share-usage-dataset).

This notebook uses the AutoGluon TimeSeries API to predict hourly bike rentals at Hyde Park Corner.

# UNSUPPORTED BY SNOWFLAKE - CUSTOMER SUPPORTED ONLY

# Copyright (c) 2025 Snowflake Inc. All rights reserved.

In [None]:
# Save a snapshot of the currently installed packages. It is compared with a second
# snapshot later on, to work out which extra packages the deployed model needs.
!pip freeze > original_packages.txt

In [None]:
# This notebook can execute either in Container Notebooks or externally in a Python environment.
# Depending on how you run this notebook, you will need to uncomment/comment the appropriate lines of code below.

# For Container Notebooks, uncomment lines 6 to 11 of this cell and comment out lines 14 to 24.
# Import python packages
#import streamlit as st
#import pandas as pd

# We can also use Snowpark for our analyses!
#from snowflake.snowpark.context import get_active_session
#session = get_active_session()

# For external Python environments, keep lines 14 to 24 of this cell uncommented and fill in the connection details.
from snowflake.snowpark import Session
connection_parameters = {
      "account": "",
      "user": "",
      "password": "",  # fill in locally; do not commit real credentials
      "role": "",
      "warehouse": "",
      "database": "",
      "schema": ""
    }
session = Session.builder.configs(connection_parameters).create()

In [None]:
# Load the source table from Snowflake into a pandas DataFrame
data = session.table('HYDE_PARK_CORNER_20150501_20150615_HOURLY').to_pandas()
# Preview the first rows
print(data.head())

In [None]:
# split the data into train and test sets
import pandas as pd

def train_test_split_sequential(df, test_size=0.2):
    """Split a DataFrame into a leading (train) and a trailing (test) row range.

    The split is purely positional: the first rows form the training set and the
    remaining rows form the test set. Rows are neither shuffled nor sorted, so for
    a chronological hold-out ``df`` must already be ordered by time.

    Args:
        df (pd.DataFrame): The input DataFrame.
        test_size (float): The proportion of rows to place in the test split;
            must be strictly between 0 and 1.

    Returns:
        tuple: ``(train_df, test_df)``, each an independent copy of its row range.

    Raises:
        ValueError: If ``test_size`` is not strictly between 0 and 1.
    """
    if not 0 < test_size < 1:
        raise ValueError("test_size must be between 0 and 1")

    # Round before truncating: (1 - test_size) is rarely exact in binary floating
    # point (e.g. 10 * (1 - 0.9) == 0.9999999999999998), and a bare int() would
    # then silently drop a whole row from the training split.
    split_index = int(round(len(df) * (1 - test_size), 9))

    train_df = df.iloc[:split_index].copy()
    test_df = df.iloc[split_index:].copy()

    return train_df, test_df

# Keep roughly the first 80% of the rows for training and hold out the remaining rows as the test set
train_df, test_df = train_test_split_sequential(data, test_size=0.2)

In [None]:
# Install the AutoGluon packages, every one pinned to release 1.2
%pip install autogluon==1.2 autogluon.common==1.2 autogluon.core==1.2 autogluon.features==1.2 autogluon.multimodal==1.2 autogluon.tabular==1.2 autogluon.timeseries==1.2 --quiet

In [None]:
# If executing in a Python environment, restart the kernel to avoid conflicts with the installed packages.
# NOTE(review): `%restart_python` looks like a platform-specific magic - confirm it exists in the environment you run in.
# NOTE(review): a restart presumably clears `session` and `train_df`, which later cells still use - re-run the cells above if so.
%restart_python

In [None]:
# Convert the training rows into an AutoGluon TimeSeriesDataFrame:
# START_STATION_NAME is used as the series id column and TS as the timestamp column.
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
train_data = TimeSeriesDataFrame.from_data_frame(
    train_df,
    id_column="START_STATION_NAME",
    timestamp_column="TS"
)
train_data.head()

In [None]:
# Fit the time series predictor with a forecasting horizon of 48 hours and NUMRENTALS as the prediction target.
predictor = TimeSeriesPredictor(
    prediction_length=48,
    path="autogluon-hourly",
    target="NUMRENTALS",
    eval_metric="MAPE",
)

# The Chronos models are excluded because their generated names contain square brackets
# (e.g. "ChronosZeroShot[bolt_base]", "ChronosFineTuned[bolt_small]"); the brackets need
# to be escaped in strings and cause issues later.
# `excluded_model_types` matches model *types*, not the names of individual trained
# models, so the whole "Chronos" type is listed here.
y = predictor.fit(
    train_data,
    presets="best_quality",  # "best_training" is not an AutoGluon preset name
    time_limit=600,  # seconds
    excluded_model_types=["Chronos"],
)

In [None]:
# Make predictions, using the training data as the input history
predictions = predictor.predict(train_data)
predictions

In [None]:
# Write the predictions to a CSV file; it is read back below as a plain pandas DataFrame
predictions.to_csv("predictions.csv")

In [None]:
# Read the predictions back as a plain pandas DataFrame.
# `df1` is used later as the example output when the model signature is inferred.
import pandas as pd
df1 = pd.read_csv("predictions.csv")
df1

In [None]:
# Persist the fitted predictor so it can be reloaded and packaged for the registry
predictor.save()

## Model Registry

In [None]:
from autogluon.timeseries import TimeSeriesPredictor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
import json

from snowflake.ml.registry import Registry
from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
# Open the Snowflake Model Registry through the current session
native_registry = Registry(session=session)
#model_name = Notebook_name+"_"+predictor.model_best+"_model"

# Name under which the model is registered
model_name='HYDE_PARK_CORNER_HOURLY'

In [None]:
import os

# The predictor was created with the relative path "autogluon-hourly", so it is saved
# under the current working directory. Resolve that location instead of hard-coding
# "/home/app", which is only correct when the working directory is /home/app.
path1 = os.path.abspath("autogluon-hourly")
p = TimeSeriesPredictor.load(path1, require_version_match=False)

In [None]:
# Display the reloaded predictor
p

In [None]:
# as we are using a custom model we need to define the input and output schema
class AutoGluonModel(custom_model.CustomModel):
    """Snowflake custom-model wrapper around a saved AutoGluon ``TimeSeriesPredictor``.

    The predictor is loaded from the ``model_dir`` artifact of the model context
    when the wrapper is constructed.
    """

    def __init__(self, context: custom_model.ModelContext) -> None:
        """Load the predictor from the ``model_dir`` artifact of ``context``."""
        super().__init__(context)
        path = self.context.path("model_dir")
        self.predictor = TimeSeriesPredictor.load(path, require_version_match=False)

    @custom_model.inference_api
    def predict(self, input_pdf: pd.DataFrame) -> pd.DataFrame:
        """Forecast from the history in ``input_pdf``.

        Args:
            input_pdf: Long-format history with a ``START_STATION_NAME`` column
                (series id) and a ``TS`` column (timestamp), plus the target column.

        Returns:
            The forecast as a flat DataFrame: the series id and timestamp are
            ordinary columns, followed by the prediction columns.
        """
        input_data = TimeSeriesDataFrame.from_data_frame(
            input_pdf,
            id_column="START_STATION_NAME",
            timestamp_column="TS"
        )
        forecast = self.predictor.predict(input_data)

        # The model signature is inferred from the predictions after a CSV round-trip,
        # where the (item_id, timestamp) index has become ordinary columns and the
        # timestamps are plain strings. Return the same layout here so the output
        # matches that signature instead of hiding both fields in the index.
        output_pdf = forecast.to_data_frame().reset_index()
        # assumes the forecast index level is named "timestamp" - TODO confirm
        output_pdf["timestamp"] = output_pdf["timestamp"].astype(str)
        return output_pdf
        

import os

# Create the ModelContext that points to our saved model files
autogluon_mc = custom_model.ModelContext(
    models={  # This should be for models that are supported by the Model Registry
    },
    artifacts={  # Everything not supported needs to be here
        # The predictor was created with the relative path "autogluon-hourly", so it
        # lives under the current working directory. Resolve that location instead of
        # hard-coding "/home/app", which is only correct when the working directory
        # is /home/app.
        'model_dir': os.path.abspath("autogluon-hourly")
    }
)

autogluon_custom_model = AutoGluonModel(autogluon_mc)

In [None]:
from snowflake.snowpark.types import IntegerType, StringType, StructType, StructField

# Upper-case every column label of the training frame
train_df.columns = [str(label).upper() for label in train_df.columns]

# Explicit Snowpark schema for the uploaded training rows
myschema = StructType(
    [
        StructField("START_STATION_NAME", StringType()),
        StructField("TS", StringType()),
        StructField("NUMRENTALS", IntegerType()),
    ]
)

# Upload the training rows, preview them, then store them as a table (replacing any existing one)
df4 = session.create_dataframe(train_df, myschema)
df4.show()
df4.write.mode("overwrite").save_as_table("HYDE_PARK_CORNER_20150501_20150615_HOURLY_train_df_feb242025")


In [None]:
# Infer the model signature from example data:
#   input  -> train_df (the training rows)
#   output -> df1 (the predictions read back from predictions.csv)
predict_sign = model_signature.infer_signature(input_data=train_df, output_data=df1)

In [None]:
# Display the inferred signature
predict_sign

In [None]:
# So that the model can be deployed correctly we need to identify which additional packages are required.
# Take a second snapshot of the installed packages.
!pip freeze > installed_packages.txt

# Keep only the diff lines that start with "+" followed by a letter, i.e. the package lines
# present in the new snapshot but not in original_packages.txt (the "+++" file header is skipped).
!diff -u0 original_packages.txt installed_packages.txt | grep -e "^+[a-zA-Z]" > new_packages.txt

In [None]:
# Create the list of pip requirement strings for the packages that are required
def _added_requirement(diff_line):
    """Turn one '+name==version' line of the diff output into a pip requirement string."""
    # Drop only the leading diff marker. A '+' can also appear inside the version as a
    # local version label (e.g. "torch==2.5.1+cpu") and must be preserved, otherwise
    # the requirement no longer names an installable version.
    requirement = diff_line[1:] if diff_line.startswith('+') else diff_line
    return requirement.replace(' ', '')


with open('new_packages.txt') as f:
    need = f.read().splitlines()

packages_needed = [_added_requirement(x) for x in need]

In [None]:
# List the models currently in the registry
native_registry.show_models()

In [None]:
# Register the custom model under `model_name`, passing the extra pip packages collected
# in `packages_needed` and the signature inferred for the `predict` method.
model_name='HYDE_PARK_CORNER_HOURLY'

mv = native_registry.log_model(
    autogluon_custom_model,
    model_name=model_name,
    pip_requirements=packages_needed,
    signatures={
        "predict": predict_sign
    },
)

In [None]:
# Re-create the registry handle and look the model up by name
from snowflake.ml.registry import Registry
native_registry = Registry(session=session)
model_name='HYDE_PARK_CORNER_HOURLY'
# Get the model and list its versions as a DataFrame
mr = native_registry.get_model(model_name)
version_df = mr.show_versions()
version_df.head()

In [None]:
# Get the latest version.
# Order the versions by creation time explicitly rather than relying on the row order
# returned by show_versions(), so the last row really is the most recent version.
last_version_name = version_df.sort_values('created_on')['name'].iloc[-1]
latest_version = mr.version(last_version_name)

In [None]:
# Display the selected model version
latest_version

## Snowpark Container Services

In [None]:
# Snowpark Container Services (SPCS) deployment settings.
# The "<db>.<schema>" parts are placeholders for the target database and schema.
compute_pool_name = "LONDON_BIKE_INFERENCE_CP"
spcs_instance_family = "CPU_X64_M"
num_spcs_nodes = "1"  # kept as a string; converted with int() where a number is needed

image_repo_name = "<db>.<schema>.REPOSITORY"

service_name_without_namespace = "LONDON_BIKE_INFERENCE_SERVICE"
service_name = "<db>.<schema>." + service_name_without_namespace
print(service_name)

In [None]:
# Create the compute pool if it does not already exist.
# min_nodes and max_nodes are both set to num_spcs_nodes, so the pool has a fixed size;
# the statement also sets auto_resume=True and auto_suspend_secs=300.
session.sql(f"create compute pool if not exists {compute_pool_name} \
            min_nodes={num_spcs_nodes} \
            max_nodes={num_spcs_nodes} \
            instance_family={spcs_instance_family} \
            auto_resume=True \
            auto_suspend_secs=300").collect()

In [None]:
# Create the inference service from the latest version of the model.
# It runs on the compute pool named above, uses the image repository named above,
# caps instances at num_spcs_nodes and enables ingress.
# NOTE(review): "ALLOW_ALL_INTEGRATION" presumably has to exist in the account already - confirm.
latest_version.create_service(service_name=service_name,
                  service_compute_pool=compute_pool_name,
                  image_repo=image_repo_name,
                  build_external_access_integration="ALLOW_ALL_INTEGRATION",
                  max_instances=int(num_spcs_nodes),
                  ingress_enabled=True)

## Suspend the service and pool

In [None]:
# Suspend the service first, then the compute pool it runs on.
# Snowpark evaluates lazily: session.sql() only builds a DataFrame, and the statement is
# sent to Snowflake once an action such as collect() is called. Without collect() these
# ALTER statements never run and the service and pool stay active.
session.sql("alter service "+service_name_without_namespace+" suspend").collect()
session.sql("alter compute pool "+compute_pool_name+" suspend").collect()