# AutoML end-to-end with OpenFE and AutoGluon
This example notebook uses [Snowflake ML](https://docs.snowflake.com/en/developer-guide/snowflake-ml/overview) and the training dataset created from the Feature Store. 

The dataset is based on the [Boston Housing Dataset](https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset) and has features created by [OpenFE](https://openfe-document.readthedocs.io/en/latest/). 

This notebook creates a model to predict the median house value in neighborhoods.

UNSUPPORTED BY SNOWFLAKE - CUSTOMER SUPPORTED ONLY
Copyright (c) 2025 Snowflake Inc. All rights reserved.

In [None]:
# Snapshot the packages installed before this notebook adds anything. A later cell takes a
# second snapshot and diffs it against this file, so the extra packages can be identified
# when deploying.
!pip freeze > original_packages.txt

In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Snowpark session used by the rest of the notebook for table reads, SQL statements
# and the model registry.
session = get_active_session()

In [None]:
-- list the tables whose names match the demo pattern, to pick the training table used below
show tables like 'DEMO_BOSTON_HOUSING_%'

In [None]:
# Source table holding the training data. Note: every column name in it is upper case.
table_name = "DEMO_BOSTON_HOUSING_TRAINING_2025_02_20"
# Column the model learns to predict.
target_feature = "MEDV"
# Unique key column; it identifies a row and is removed before training.
feature_store_join_key = "ID"

In [None]:
# standard library: environment lookup for the notebook name, wall-clock timing of inference calls
import os
import time

# import numpy
import numpy as np

# Snowpark ML
from snowflake.ml.modeling.xgboost import XGBRegressor, XGBClassifier
from snowflake.ml._internal.utils import identifier
from snowflake.ml.registry import Registry

# used to create train and test datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# for plots
import seaborn as sns
import matplotlib.pyplot as plt

# Snowflake feature store
from snowflake.ml.feature_store import FeatureStore, FeatureView, Entity, CreationMode


In [None]:
# get data from Snowflake. This is a public dataset
# The whole table is pulled into a local pandas DataFrame; the cells below work on it in memory.
data = session.table(table_name).to_pandas()
# preview the first rows
data.head()

## Dataset Details
Each record in the database describes a Boston suburb or town. The data was drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

| **Name**    | **Description**                                                           |
|---------|-----------------------------------------------------------------------|
| CRIM    | per capita crime rate by town                                         |
| ZN      | proportion of residential land zoned for lots over 25000 sq.ft.       |
| INDUS   | proportion of non-retail business acres per town                      |
| CHAS    | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
| NOX     | nitric oxides concentration (parts per 10 million)                    |
| RM      | average number of rooms per dwelling                                  |
| AGE     | proportion of owner-occupied units built prior to 1940                |
| DIS     | weighted distances to five Boston employment centres                  |
| RAD     | index of accessibility to radial highways                             |
| TAX     | full-value property-tax rate per 10000usd                             |
| PTRATIO | pupil-teacher ratio by town                                           |
| LSTAT   | % lower status of the population                                      |

In [None]:
# Keep a separate handle on the label series; it is handed to train_test_split further down.
target = data[target_feature]
# Build the modelling frame as a NEW DataFrame without the join key.
# - `data` itself is left untouched, so this cell can be re-run without a KeyError
#   (the previous in-place drop on an alias of `data` removed the column from both).
# - The target column intentionally stays in `inputs`: the estimator below is fitted on a
#   single frame and selects the label from it through `label_cols`.
inputs = data.drop(columns=[feature_store_join_key])

In [None]:
# Remove two of the generated feature columns from the modelling frame, one at a time.
# NOTE(review): the notebook does not say why these two are excluded - confirm the reason.
for unwanted_column in ('AUTOFE_F_5', 'AUTOFE_F_17'):
    inputs.drop(unwanted_column, axis=1, inplace=True)


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)

# Print the shape of each partition: training inputs and labels first, then the test pair.
for partition in (x_train, y_train, x_test, y_test):
    print(partition.shape)

In [None]:
# check the new features: preview the first rows of the training partition
x_train.head()

In [None]:
# Collect the object / category typed columns of the training frame and, when there are any,
# replace them with one-hot (dummy) columns, dropping the first level of each.
# NOTE(review): only x_train is encoded here; x_test is left as-is - confirm that is intended.
categorical_column_names = x_train.select_dtypes(include=['object', 'category']).columns
if not categorical_column_names.empty:
    encoded_names = categorical_column_names.to_list()
    print("One hot encoding for " + str(encoded_names))
    x_train = pd.get_dummies(
        x_train,
        columns=categorical_column_names,
        drop_first=True,
    )

In [None]:
# The label is the configured target column; every other column of the training frame is a feature.
label_column = target_feature
feature_columns = [name for name in x_train.columns if name != target_feature]

In [None]:
# Configure the regressor: it reads its inputs from `feature_columns` and its label from
# `label_column`, with trees limited to a depth of 3.
predictor = XGBRegressor(
    input_cols=feature_columns,
    label_cols=label_column,
    max_depth=3,
)

In [None]:
# Train on the training frame; the label is taken from the same frame via `label_cols`.
# The return value is kept in `a`, which no later cell in this notebook reads.
a = predictor.fit(x_train)

In [None]:
# Score the training rows; later cells read the 'OUTPUT_<target>' column from this result.
result = predictor.predict(x_train)

In [None]:
# display the prediction output
result

In [None]:
# show the model results on the training data
# NOTE(review): this is scored on the rows the model was fitted on, not on the held-out x_test.
predictor.score(x_train)

In [None]:
# Attach the prediction column to the original data. The two pieces are lined up on their
# index so each prediction lands on the row it was computed for.
# NOTE(review): predictions exist only for the training rows, so the remaining rows
# presumably end up with a missing prediction - confirm.
predicted_values = result['OUTPUT_' + target_feature]
results = pd.concat([data, predicted_values], axis=1)

In [None]:
# Preview the actual target next to the model's prediction for the first rows.
comparison_columns = [target_feature, 'OUTPUT_' + target_feature]
results[comparison_columns].head()

In [None]:
# Scatter the actual prices (x axis) against the predicted prices (y axis).
actual_prices = data[target_feature]
predicted_prices = results['OUTPUT_' + target_feature]

plt.scatter(actual_prices, predicted_prices)
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.title("Prices vs Predicted prices")
plt.show()


In [None]:
# Checking normality of errors: plot the distribution of (actual - predicted).
# `sns.distplot` is deprecated in seaborn and scheduled for removal; `histplot` with
# `kde=True` is its documented replacement for a histogram with a density curve.
residuals = data[target_feature] - results['OUTPUT_' + target_feature]
sns.histplot(residuals, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Read the notebook name from the OBJECT_NAME environment variable (falling back to
# 'NOTEBOOK') and swap spaces for underscores. The name is used when registering the model.
notebook_name = os.environ.get('OBJECT_NAME', 'NOTEBOOK').replace(' ', '_')
print(notebook_name)

## Model Registry

In [None]:
from snowflake.ml.registry import Registry
from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
# Registry handle bound to the active session.
# NOTE(review): `native_registry` is not read by any later cell in this notebook; a second
# Registry (`mr`) is created further down and is the one actually used - confirm it is needed.
native_registry = Registry(session=session)
# Earlier naming scheme, left disabled; the fixed name below is used instead.
#model_name = Notebook_name+"_"+predictor.model_best+"_model"

# Name under which the model is logged in the registry.
model_name='BOSTON_MODEL_SNOWFLAKE'

In [None]:
# so the model can be deployed correctly we need to identify what additional packages are required
# Take a second snapshot of the installed packages...
!pip freeze > installed_packages.txt

# ...and keep only the lines that diff marks as added relative to the first snapshot:
# a '+' immediately followed by a letter, which skips the '+++' file header line.
!diff -u0 original_packages.txt installed_packages.txt | grep -e "^+[a-zA-Z]" > new_packages.txt

In [None]:
# Build the list of packages that were added since the first snapshot.
# Every line of new_packages.txt is an "added" line from `diff -u0`: a pip requirement
# prefixed with a single '+' marker.
with open('new_packages.txt') as f:
    need = f.read().splitlines()

# Strip only the leading diff marker. A blanket replace('+', '') would also delete the '+'
# inside local version identifiers (e.g. `torch==2.1.0+cpu`) and corrupt the requirement.
# Spaces are still removed everywhere, as before.
packages_needed = [
    (line[1:] if line.startswith('+') else line).replace(' ', '')
    for line in need
]

In [None]:
# Registry used below to log the model and look up its versions.
# It is created with the 'enable_monitoring' option switched off.
mr = Registry(
        session=session,
        options={'enable_monitoring':False}
)

In [None]:
# register the model
from snowflake.ml import version

# Log the fitted predictor under `model_name`; the comment records which notebook produced it.
# NOTE(review): `packages_needed` is built in an earlier cell but is not passed to this call -
# confirm whether the extra packages should be supplied as model dependencies here.
mv = mr.log_model(
    model_name=model_name,
    model=predictor,
    comment='Model created using notebook '+notebook_name,
)

In [None]:
# get the model versions registered under `model_name`
mr.get_model(model_name).show_versions()

In [None]:
# get the latest version
# NOTE(review): this relies on "LAST" being accepted as a version alias by the installed
# snowflake-ml-python release - confirm. This handle is what gets deployed and called below.
default_version =  mr.get_model(model_name).version("LAST")

## Snowpark Container Services

In [None]:
-- create an image repository for the service images (no-op when it already exists)
create image repository if not exists DB.SERVICES.REPOSITORY;

In [None]:
# Snowpark Container Services deployment settings.
# Compute pool that hosts the inference service.
compute_pool_name = 'INFERENCE_CP'
# Image repository the service image is stored in.
image_repo_name = 'DB.SERVICES.REPOSITORY'
# Node count, kept as a string because it is interpolated into SQL; converted with int() where needed.
num_spcs_nodes = '1'
spcs_instance_family = 'CPU_X64_M'
# Service name on its own, and fully qualified with database and schema.
service_name_without_namespace = 'INFERENCE_SERVICE'
service_name = 'DB.PUBLIC.' + service_name_without_namespace
print(service_name)

In [None]:
# create compute pool
# The pool is created only if it does not exist yet. min_nodes and max_nodes are both set to
# `num_spcs_nodes`, so the pool has a fixed size; it auto-resumes and auto-suspends after
# 300 seconds. The trailing .collect() is what actually sends the statement to Snowflake.
session.sql(f"create compute pool if not exists {compute_pool_name} \
            min_nodes={num_spcs_nodes} \
            max_nodes={num_spcs_nodes} \
            instance_family={spcs_instance_family} \
            auto_resume=True \
            auto_suspend_secs=300").collect()

In [None]:
# create the service with the latest version of the model
# The service is created under the fully qualified `service_name`, on the compute pool and
# image repository configured above, with ingress enabled and at most `num_spcs_nodes` instances.
# NOTE(review): "ALLOW_ALL_INTEGRATION" must already exist in the account - confirm.
default_version.create_service(service_name=service_name,
                  service_compute_pool=compute_pool_name,
                  image_repo=image_repo_name,
                  build_external_access_integration="ALLOW_ALL_INTEGRATION",
                  max_instances=int(num_spcs_nodes),
                  ingress_enabled=True)

In [None]:
# check the service is created and running
# NOTE(review): unlike the compute-pool cell, no .collect()/.show() is called here; the cell
# relies on the notebook rendering the returned object - confirm the statement really runs.
session.sql("show services like '"+service_name_without_namespace+"'")

In [None]:
# list the endpoints exposed by the service
# NOTE(review): the unqualified service name is used, so this presumably depends on the
# session's current database/schema matching where the service was created - confirm.
session.sql("show endpoints in service "+service_name_without_namespace)

In [None]:
# show the Python type of the training frame that is sent to the service below
type(x_train)


In [None]:
# Make predictions using the model deployed to SPCS and time the round trip.
start = time.time()
service_predictions = default_version.run(
    x_train,
    function_name="predict",
    service_name=service_name_without_namespace,
)
finish = time.time()
print("Elapsed Seconds: "+str(finish-start))
print("Rows: "+str(x_train.shape[0]))
print("Columns: "+str(x_train.shape[-1]))
# A bare expression is only rendered when it is the LAST statement of a cell; in the middle
# of the cell it was a silent no-op, so it now sits at the end to actually show the result.
service_predictions


In [None]:
# Request explanations (function_name="explain") from the model deployed to SPCS and time
# the round trip. The result is stored under the same name as the previous cell's
# predictions, replacing them.
start = time.time()
service_predictions = default_version.run(
    x_train,
    function_name="explain",
    service_name=service_name_without_namespace,
)
finish = time.time()
print("Elapsed Seconds: "+str(finish-start))
print("Rows: "+str(x_train.shape[0]))
print("Columns: "+str(x_train.shape[-1]))
# A bare expression is only rendered when it is the LAST statement of a cell; in the middle
# of the cell it was a silent no-op, so it now sits at the end to actually show the result.
service_predictions


## Suspend the service and pool

In [None]:
# Uncomment to suspend the service and its compute pool once you are finished.
# Each statement ends in .collect(), as in the compute-pool cell earlier in this notebook,
# so that it is actually sent to Snowflake when the cell runs.
#session.sql("alter service "+service_name_without_namespace+" suspend").collect()
#session.sql("alter compute pool "+compute_pool_name+" suspend").collect()