# AutoML end-to-end with OpenFE and AutoGluon
This example notebook uses [AutoGluon](https://auto.gluon.ai/dev/index.html) and the training dataset created from the Feature Store. 

The dataset is based on the [Boston Housing Dataset](https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset) and has features created by [OpenFE](https://openfe-document.readthedocs.io/en/latest/). 

This notebook creates a model to predict the median house value in neighborhoods.

In [None]:
# UNSUPPORTED BY SNOWFLAKE - CUSTOMER SUPPORTED ONLY

# Copyright (c) 2025 Snowflake Inc. All rights reserved.

In [None]:
# Snapshot the packages installed before this notebook adds any. A later cell diffs a
# second `pip freeze` against this file to find the packages the deployed model needs.
!pip freeze > original_packages.txt

In [None]:
# Install the AutoGluon packages
# NOTE(review): a later cell imports `openfe`, which no cell in this notebook installs —
# confirm the runtime already provides it.
!pip install autogluon --quiet

In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
-- List the training tables visible in the current schema.
-- NOTE(review): in a LIKE pattern each `_` matches exactly one character, so this pattern
-- only matches names longer than DEMO_BOSTON_HOUSING_TRAINING and presumably does not
-- list that table itself — confirm this is intended.
show tables like 'DEMO_BOSTON_HOUSING_TRAINING_%';

In [None]:
# --- Notebook settings -------------------------------------------------------
# Snowflake table that holds the training data
table_name = "DEMO_BOSTON_HOUSING_TRAINING"

# column the model learns to predict
target_feature = "medv"

# unique / key column; removed from the data before training
feature_store_join_key = "ID"

# name under which the model is registered and deployed
model_name = "BOSTON_MODEL"

In [None]:
# Install required libraries
!pip install snowflake-connector-python pandas --quiet

# Use the new Container Services keypair authentication method
from generateJWT import JWTGenerator

# other supporting libraries
from datetime import timedelta
import argparse
import logging
import sys
import requests
import json

# Placeholder connection settings for calling the service endpoint with key-pair (JWT)
# authentication. Replace every <...> value before use; account, user and role are upper-cased.
# NOTE(review): no later cell shown in this notebook reads these names — confirm the cell is still needed.
account = '<org>-<account>'.upper()
user = 'username'.upper()
role = 'SPCS_ROLE'.upper()
# NOTE(review): hard-coded absolute local path to a private key file — prefer supplying the location via configuration.
private_key_file_path = '/Users/rsa/rsa_key.p8'
endpoint = '<generated-endpoint-name>.snowflakecomputing.app'
endpoint_path = '/'


In [None]:
# import numpy
import numpy as np

# Import AutoGluon packages
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

# used for feature engineering
from openfe import OpenFE, transform, tree_to_formula

# used to create train and test datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# for plots
import seaborn as sns
import matplotlib.pyplot as plt

#Snowflake feature store
from snowflake.ml.feature_store import FeatureStore, FeatureView, Entity, CreationMode

# helper to set entry details based on Notebook
import os


In [None]:
# Pull the training table out of Snowflake into a pandas DataFrame.
# This is a public dataset.
training_table = session.table(table_name)
data = training_table.to_pandas()
data.head()

## Dataset Details
Each record in the database describes a Boston suburb or town. The data was drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

| **Name**    | **Description**                                                           |
|---------|-----------------------------------------------------------------------|
| CRIM    | per capita crime rate by town                                         |
| ZN      | proportion of residential land zoned for lots over 25000 sq.ft.       |
| INDUS   | proportion of non-retail business acres per town                      |
| CHAS    | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
| NOX     | nitric oxides concentration (parts per 10 million)                    |
| RM      | average number of rooms per dwelling                                  |
| AGE     | proportion of owner-occupied units built prior to 1940                |
| DIS     | weighted distances to five Boston employment centres                  |
| RAD     | index of accessibility to radial highways                             |
| TAX     | full-value property-tax rate per 10000usd                             |
| PTRATIO | pupil-teacher ratio by town                                           |
| LSTAT   | % lower status of the population                                      |

In [None]:
# Remove the join key from the data so it is not used as a model feature.
# `drop(..., errors="ignore")` keeps this cell re-runnable: the previous `data.pop(...)`
# raised KeyError on a second run because the column was already gone.
data = data.drop(columns=[feature_store_join_key], errors="ignore")


In [None]:
# Train an AutoGluon tabular model on the data from the Feature Store.
# The label is taken from the `target_feature` setting instead of a second hard-coded
# copy of the column name, so the target only has to be changed in one place.
# Training is capped at 600 seconds and the fitted predictor is saved under `model_path`.
model_path = 'tmp'
predictor = TabularPredictor(label=target_feature, path=model_path).fit(data, time_limit=600)

In [None]:
# show the model results on the training data
# (these are the same rows the predictor was fitted on, not a held-out set)
predictor.evaluate(data)

In [None]:
# show the model learning curve
# NOTE(review): the fit() call in this notebook does not request learning curves —
# confirm this call returns data without that option enabled.
predictor.learning_curves()

In [None]:
# Compute feature importance and draw it as a horizontal bar chart
from matplotlib import pyplot as plt

# ascending order puts the most important feature at the top of the barh chart
fimportance = predictor.feature_importance(data).sort_values('importance', ascending=True)

fig, ax = plt.subplots(figsize=(4, 3))
ax.barh(fimportance.index, fimportance['importance'])
ax.set_title('Importance')

In [None]:
# Feature importance as a table, most important feature first
fimportance.sort_values(by='importance', ascending=False)

In [None]:
# Predict on the training data so the predictions can be compared with the actual
# values in the cells that follow.
y_pred = predictor.predict(data)

In [None]:
# Merge the prediction with the original data using the index, so each prediction is
# matched with the row it was made for.
# A bare pd.merge(data, y_pred) joins on the shared column name ('medv'), i.e. on equal
# values rather than on row identity, and produces no medv_x / medv_y columns.
# Joining on the index keeps one row per input row; the clashing 'medv' columns receive
# the default suffixes: medv_x (actual) and medv_y (predicted).
results = pd.merge(data, y_pred, left_index=True, right_index=True)

In [None]:
# display the raw predictions returned by the predictor
y_pred

In [None]:
# First rows of actual (medv_x) next to predicted (medv_y) values
results.loc[:, ['medv_x', 'medv_y']].head()

In [None]:
# Scatter of actual prices against the model's predicted prices
fig, ax = plt.subplots()
ax.scatter(results['medv_x'], results['medv_y'])
ax.set_xlabel("Prices")
ax.set_ylabel("Predicted prices")
ax.set_title("Prices vs Predicted prices")
plt.show()


In [None]:
# Checking Normality of errors
# sns.distplot is deprecated in current seaborn releases. histplot(kde=True) draws the
# histogram with a density curve overlaid, and its default count statistic matches the
# "Frequency" axis label used below.
residuals = results['medv_x'] - results['medv_y']
sns.histplot(residuals, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Notebook name taken from the OBJECT_NAME environment variable ('NOTEBOOK' when unset),
# with spaces turned into underscores; used to name the feature store and deployment.
Notebook_name = os.environ.get('OBJECT_NAME', 'NOTEBOOK').replace(' ', '_')
print(Notebook_name)

In [None]:
# display the predictor's best model
predictor.model_best

## Model Registry

In [None]:
from snowflake.ml.registry import Registry
from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
# Model Registry handle bound to the active Snowpark session
native_registry = Registry(session=session)

# NOTE: re-assigns model_name to the same value already set in the configuration cell
model_name='BOSTON_MODEL'

In [None]:
# as we are using a custom model we need to define the input and output schema
class AutoGluonModel(custom_model.CustomModel):
    """Custom-model wrapper that serves the saved AutoGluon predictor.

    The predictor is loaded from the ``model_dir`` artifact of the model context.
    """

    def __init__(self, context: custom_model.ModelContext) -> None:
        """Load the TabularPredictor stored under the ``model_dir`` artifact.

        AutoGluon and Python version checks are disabled on load, so the predictor
        can be opened in an environment whose versions differ from training.
        """
        super().__init__(context)
        context_path = self.context.path("model_dir")
        self.predictor = TabularPredictor.load(context_path, verbosity=4, require_version_match=False, require_py_version_match=False)

    @custom_model.inference_api
    def predict(self, input_pdf: pd.DataFrame) -> pd.DataFrame:
        """Predict for rows supplied as JSON strings.

        Args:
            input_pdf: frame with an ``INPUT_DATA`` column; each value is a JSON
                object string holding one row of feature values.

        Returns:
            The decoded feature columns plus an ``OUTPUT`` column holding the
            prediction, with missing values represented as ``None``.
        """
        import json
        import numpy as np
        # Decode into a separate Series. Writing the decoded dicts back into
        # `input_pdf` would mutate the caller's frame, and a second call on the
        # same frame would then fail inside json.loads.
        decoded_rows = input_pdf['INPUT_DATA'].map(json.loads)
        fixed_input_pdf = pd.json_normalize(decoded_rows)
        input_dataset = TabularDataset(fixed_input_pdf)
        output_np = self.predictor.predict(input_dataset)
        # NaN predictions become None
        output_np = np.where(np.isnan(output_np), None, output_np)
        fixed_input_pdf['OUTPUT'] = output_np
        # normalise every remaining missing value to None
        fixed_input_pdf = fixed_input_pdf.fillna(np.nan).replace([np.nan], [None])
        return fixed_input_pdf

In [None]:
# Build the ModelContext that points at the saved predictor directory
autogluon_mc = custom_model.ModelContext(
    # model objects the Model Registry supports natively — none in this case
    models={},
    # everything the registry does not support natively is passed as an artifact
    artifacts={'model_dir': model_path + "/"},
)

autogluon_custom_model = AutoGluonModel(autogluon_mc)

In [None]:
# call predict on the model, if running using container notebooks this will use the model in the model_path
def local_predict(input_pdf: pd.DataFrame) -> pd.DataFrame:
    """Run the predictor saved under ``model_path`` on JSON-encoded rows.

    Args:
        input_pdf: frame with an ``INPUT_DATA`` column; each value is a JSON
            object string holding one row of feature values.

    Returns:
        The decoded feature columns plus an ``OUTPUT`` column holding the
        prediction, with missing values represented as ``None``.
    """
    import json
    import numpy as np
    # Decode into a separate Series so the caller's frame is left untouched;
    # overwriting INPUT_DATA in place made a repeat call on the same frame fail.
    decoded_rows = input_pdf['INPUT_DATA'].map(json.loads)
    fixed_input_pdf = pd.json_normalize(decoded_rows)
    input_dataset = TabularDataset(fixed_input_pdf)
    # the predictor is reloaded from disk on every call
    predictor = TabularPredictor.load(model_path+"/")
    output_np = predictor.predict(input_dataset)
    # NaN predictions become None
    output_np = np.where(np.isnan(output_np), None, output_np)
    fixed_input_pdf['OUTPUT'] = output_np
    # normalise every remaining missing value to None
    fixed_input_pdf = fixed_input_pdf.fillna(np.nan).replace([np.nan], [None])
    return fixed_input_pdf

In [None]:
# Create a temporary table in Snowflake with the model's input features and infer the
# model signature from a local test prediction.
# `train_f` was never defined in this notebook, so this cell raised NameError on a fresh
# run. The training frame `data` without its label column is used instead.
# NOTE(review): confirm that feature-only rows are the intended contents of this table.
from snowflake.snowpark import functions as F

feature_pdf = data.drop(columns=[target_feature])
test_snowdf = session.write_pandas(feature_pdf, table_name="temp_results", table_type="temporary", auto_create_table=True)

# fill nulls with 0, remove any OUTPUT column and materialise the result
test_snowdf_cached1 = test_snowdf.na.fill(0).drop('OUTPUT').cache_result()
# pack every row into one JSON string column named INPUT_DATA, the format predict() parses
test_snowdf_cached2 = test_snowdf_cached1.with_column('INPUT_DATA', F.to_varchar(F.object_construct_keep_null(F.col("*"))))\
                     .select('INPUT_DATA').cache_result()
# run up to 100 rows through the local predictor to obtain example input and output
# frames for signature inference
test_pdf_for_mr = test_snowdf_cached2.limit(100).to_pandas()
local_test_results_pdf = local_predict(test_pdf_for_mr.copy())
predict_sign = model_signature.infer_signature(input_data=test_pdf_for_mr, output_data=local_test_results_pdf)

In [None]:
# This is the calling signature for the model, inferred from the sample input and
# output frames.
predict_sign

In [None]:
# So the model can be deployed correctly we need to identify what additional packages
# are required: freeze the environment again and diff it against the earlier snapshot.
!pip freeze > installed_packages.txt

# Keep only the added lines of the zero-context unified diff. The "^+[a-zA-Z]" filter
# requires a letter right after the "+", which also excludes the "+++" file header.
!diff -u0 original_packages.txt installed_packages.txt | grep -e "^+[a-zA-Z]" > new_packages.txt

In [None]:
# create a list of the packages that are required
with open('new_packages.txt') as f:
    need = f.read().splitlines()

# Strip only the leading diff marker and skip blank lines. Removing every '+' would also
# corrupt local version labels such as "torch==2.1.0+cu118".
packages_needed = [line.lstrip('+').replace(' ', '') for line in need if line.strip()]

In [None]:
# Register the custom model with its extra pip requirements and its predict signature
from snowflake.ml import version

predict_signatures = {"predict": predict_sign}

mv = native_registry.log_model(
    autogluon_custom_model,
    model_name=model_name,
    pip_requirements=packages_needed,
    signatures=predict_signatures,
)

In [None]:
# Look the model up in the registry by name and list its versions
mr = native_registry.get_model(model_name)
version_df = mr.show_versions()
version_df.head()

In [None]:
# Take the version named in the last row of the listing
# NOTE(review): assumes show_versions() lists versions oldest-first — confirm
last_version_name = version_df.iloc[-1]['name']
latest_version = mr.version(last_version_name)

## Snowpark Container Services

In [None]:
-- Create the image repository if it does not already exist.
-- IF NOT EXISTS follows the object type in Snowflake DDL, and SQL comments start with
-- "--" rather than "#".
create image repository if not exists DB.SERVICES.REPOSITORY;

In [None]:
# SPCS deployment settings — replace every <...> placeholder before running
# NOTE(review): the pool name is later interpolated unquoted into SQL; a hyphenated
# name is presumably not a valid unquoted identifier — confirm.
compute_pool_name = "my-compute-pool"
image_repo_name = "<database>.<schema>.<image-REPOSITORY-name>"

# node count is kept as a string for SQL interpolation and cast to int where needed
num_spcs_nodes = '1'
spcs_instance_family = 'CPU_X64_M'

service_name_without_namespace = 'INFERENCE_SERVICE'
service_name = '<database>.<schema>.' + service_name_without_namespace
print(service_name)

In [None]:
# Create the compute pool if it does not exist yet.
# min_nodes and max_nodes are both set to num_spcs_nodes, so the pool has a fixed size;
# auto_resume is enabled and auto_suspend_secs is 300.
# The backslash continuations sit inside the f-string, so each continued line's
# leading spaces become part of the SQL text.
session.sql(f"create compute pool if not exists {compute_pool_name} \
            min_nodes={num_spcs_nodes} \
            max_nodes={num_spcs_nodes} \
            instance_family={spcs_instance_family} \
            auto_resume=True \
            auto_suspend_secs=300").collect()

In [None]:
# Deploy the selected model version as a service on the compute pool
service_options = dict(
    service_name=service_name,
    service_compute_pool=compute_pool_name,
    image_repo=image_repo_name,
    build_external_access_integration="ALLOW_ALL_INTEGRATION",
    # the node count is stored as a string, so cast it for the API
    max_instances=int(num_spcs_nodes),
    ingress_enabled=True,
)
latest_version.create_service(**service_options)

In [None]:
# check the service is created and running
# NOTE(review): session.sql() has no .collect()/.show() here, so the query presumably
# only runs if the notebook UI renders the returned DataFrame — confirm.
session.sql("show services like '"+service_name_without_namespace+"'")

In [None]:
# list the endpoints exposed by the service
# NOTE(review): uses the unqualified service name, so this presumably relies on the
# session's current database and schema matching the service's — confirm.
session.sql("show endpoints in service "+service_name_without_namespace)

In [None]:
# show the test dataframe: one INPUT_DATA column holding each row as a JSON string
test_snowdf_cached2.show()


In [None]:
# make predictions using model deployed to SPCS
# `time` is not imported anywhere else in this notebook, so the timing calls below
# raised NameError; import it here.
import time

start = time.time()
service_predictions = latest_version.run(test_snowdf_cached2,function_name="predict",
                                         service_name=service_name_without_namespace).drop('INPUT_DATA')
# the timer stops after show(), so the elapsed time includes running the query
service_predictions.show()
finish = time.time()
print("Elapsed Seconds: "+str(finish-start))
print("Rows: "+str(test_snowdf_cached2.count()))
print("Columns: "+str(len(service_predictions.columns)))
test_snowdf_cached2.describe()

## Suspend the service and pool

In [None]:
# Suspend the service first, then the compute pool it runs on.
# session.sql() only builds a lazy DataFrame; without an action such as .collect() the
# statement is never sent to Snowflake, so the service was never actually suspended.
session.sql("alter service "+service_name_without_namespace+" suspend").collect()
session.sql("alter compute pool "+compute_pool_name+" suspend").collect()