# NOTEBOOK 2: END TO END ML USING SNOWPARK AND SCIKIT-LEARN

In this notebook we fit (train) a scikit-learn ML pipeline that includes common feature-engineering steps such as imputation, scaling, and one-hot encoding. The pipeline also includes a `RandomForestRegressor` model that predicts median house values in California.

We fit the pipeline inside a Snowpark Python stored procedure (SPROC) and then save the fitted pipeline to a Snowflake stage. The example concludes by showing how the saved pipeline can be loaded and run at scale on a Snowflake warehouse using Snowpark Python user-defined functions (UDFs).

![Snowpark ML](/images/snowpark_ml.png)

### Create a session with Snowflake

In [None]:
# Snowpark
import snowflake.snowpark
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.session import Session
from snowflake.snowpark import version as v
import json

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import datetime
import io
import joblib

# Read the Snowflake credentials and connection settings from a local JSON file.
with open('creds.json') as f:
    data = json.load(f)

USERNAME = data['username']
PASSWORD = data['password']
SF_ACCOUNT = data['sf_account']
SF_WH = data['sf_wh']
SF_DB = data['sf_db']
SF_SCHEMA = data['sf_schema']

# Keyword form builds the same mapping (same keys, same insertion order).
CONNECTION_PARAMETERS = dict(
    account=SF_ACCOUNT,
    user=USERNAME,
    password=PASSWORD,
    database=SF_DB,
    schema=SF_SCHEMA,
    warehouse=SF_WH,
)

# Open the Snowpark session and declare the packages that the stored
# procedure and UDFs registered through this session rely on.
session = Session.builder.configs(CONNECTION_PARAMETERS).create()
session.add_packages(
    'snowflake-snowpark-python',
    'scikit-learn',
    'pandas',
    'numpy',
    'joblib',
    'cachetools',
)

### Create stages to save the ML model/pipeline and permanent UDFs

In [None]:
# (Re)create the MODELS stage that will hold the serialized pipeline.
query = (
    "create or replace stage models"
    " directory = (enable = true)"
    " copy_options = (on_error='skip_file')"
)

session.sql(query).collect()

In [None]:
# (Re)create the UDF stage used as the stage location of the permanent UDF.
query = (
    "create or replace stage udf"
    " copy_options = (on_error='skip_file')"
)

session.sql(query).collect()

### The stored procedure fits the pipeline (preprocessing and model) and saves it to a Snowflake stage

In [None]:
def save_file(session, model, path):
    """Serialize ``model`` with joblib and upload it to a Snowflake stage.

    Args:
        session: Snowpark session whose connection performs the upload.
        model: Object to serialize (in this notebook, the fitted pipeline).
        path: Destination stage path, e.g. ``"@MODELS/name.joblib"``.

    Returns:
        A confirmation message that contains ``path``.
    """
    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)
    # joblib.dump leaves the stream positioned at its end. Rewind so the
    # upload starts from the first byte even if upload_stream does not rewind
    # the stream itself.
    input_stream.seek(0)
    # NOTE(review): _conn._cursor is a private Snowpark attribute and may
    # change between releases -- confirm against the installed version.
    session._conn._cursor.upload_stream(input_stream, path)
    return "successfully created file: " + path

def train_model(session: snowflake.snowpark.Session) -> float:
    """Train the housing-price pipeline and return its test-set RMSE.

    Reads the ``HOUSING_DATA`` table, splits it 80/20, persists the two splits
    as ``HOUSING_TRAIN`` and ``HOUSING_TEST``, fits a preprocessing +
    ``RandomForestRegressor`` pipeline on the training split, uploads the
    fitted pipeline to ``@MODELS/housing_fores_reg.joblib`` and evaluates it
    on the test split.

    Args:
        session: Snowpark session used to read and write the tables and to
            upload the serialized pipeline.

    Returns:
        Root mean squared error of the predictions on the test split.
    """
    label_col = "MEDIAN_HOUSE_VALUE"
    cat_attribs = ["OCEAN_PROXIMITY"]

    snowdf = session.table("HOUSING_DATA")
    # A fixed seed keeps the 80/20 split repeatable between runs.
    snowdf_train, snowdf_test = snowdf.random_split([0.8, 0.2], seed=82)

    # Persist both splits, overwriting the tables written by any earlier run.
    snowdf_train.write.mode("overwrite").save_as_table("HOUSING_TRAIN")
    snowdf_test.write.mode("overwrite").save_as_table("HOUSING_TEST")

    # Pull each split into pandas exactly once and separate features from
    # labels locally. Fetching them with two independent queries gives no
    # guarantee that the rows come back in the same order, which could pair a
    # feature row with another row's label.
    train_pdf = snowdf_train.to_pandas()
    test_pdf = snowdf_test.to_pandas()
    housing = train_pdf.drop(columns=[label_col])
    housing_labels = train_pdf[label_col]  # 1-D Series rather than a one-column frame
    housing_test = test_pdf.drop(columns=[label_col])
    housing_test_labels = test_pdf[label_col]

    # Every feature except the categorical one is treated as numerical.
    num_attribs = [col for col in housing.columns if col not in cat_attribs]

    # Numerical features: fill missing values with the median, then standardize.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    preprocessor = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
    ])

    # Fit the preprocessing steps and the model together.
    full_pipeline.fit(housing, housing_labels)

    # Save the full pipeline, including the model, to the MODELS stage.
    save_file(session, full_pipeline, "@MODELS/housing_fores_reg.joblib")

    # Evaluate on the held-out split and return the RMSE as a plain float.
    housing_predictions = full_pipeline.predict(housing_test)
    rmse = np.sqrt(mean_squared_error(housing_test_labels, housing_predictions))
    return float(rmse)

# Register train_model as a Snowpark stored procedure; replace=True overwrites
# a procedure that was registered earlier under the same name.
train_model_sp = sproc(func=train_model, replace=True)

### Run the training within the SPROC

In [None]:
# Invoke the registered stored procedure. train_model returns the test-set
# RMSE, which the notebook shows as the result of this cell.
train_model_sp()

### Model/Pipeline Deployment 

Define the UDF that loads the saved pipeline and predicts the median house value for a single row

In [None]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
# Attach the staged pipeline file to the session as an import; read_file below
# opens it from the directory named by the "snowflake_import_directory" option.
session.add_import("@MODELS/housing_fores_reg.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
    """Load a joblib-serialized object from the UDF import directory.

    The result is memoized by ``cachetools.cached``, so repeated calls with
    the same ``filename`` reuse the already loaded object instead of reading
    and deserializing the file again.

    Args:
        filename: Name of the imported file, relative to the directory given
            by the ``snowflake_import_directory`` interpreter option.

    Returns:
        The object stored in the file.

    Raises:
        RuntimeError: If ``snowflake_import_directory`` is not set.
    """
    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        # Fail fast: returning None here would be cached and only surface
        # later as an AttributeError when the caller invokes `.predict`.
        raise RuntimeError(
            "snowflake_import_directory is not set; cannot load " + repr(filename)
        )
    # NOTE: joblib.load unpickles the file, so it must only be pointed at
    # files from a trusted stage.
    with open(os.path.join(import_dir, filename), 'rb') as file:
        return joblib.load(file)

# Model input columns, listed in the same order as the UDF's arguments.
features = [
    'LONGITUDE',
    'LATITUDE',
    'HOUSING_MEDIAN_AGE',
    'TOTAL_ROOMS',
    'TOTAL_BEDROOMS',
    'POPULATION',
    'HOUSEHOLDS',
    'MEDIAN_INCOME',
    'OCEAN_PROXIMITY',
]

@udf(name="predict", is_permanent=True, stage_location="@udf", replace=True)
def predict(LONGITUDE: float, LATITUDE: float, HOUSING_MEDIAN_AGE: float, TOTAL_ROOMS: float,
            TOTAL_BEDROOMS: float, POPULATION: float, HOUSEHOLDS: float, MEDIAN_INCOME: float,
            OCEAN_PROXIMITY: str) -> float:
    """Predict the median house value for a single row of features.

    The arguments are the model's input columns; they are assembled into a
    one-row DataFrame whose columns follow ``features`` and passed to the
    pipeline loaded by ``read_file``.

    Returns:
        The pipeline's prediction for the row.
    """
    pipeline = read_file('housing_fores_reg.joblib')
    record = {
        'LONGITUDE': LONGITUDE,
        'LATITUDE': LATITUDE,
        'HOUSING_MEDIAN_AGE': HOUSING_MEDIAN_AGE,
        'TOTAL_ROOMS': TOTAL_ROOMS,
        'TOTAL_BEDROOMS': TOTAL_BEDROOMS,
        'POPULATION': POPULATION,
        'HOUSEHOLDS': HOUSEHOLDS,
        'MEDIAN_INCOME': MEDIAN_INCOME,
        'OCEAN_PROXIMITY': OCEAN_PROXIMITY,
    }
    # columns=features fixes the column order expected by the pipeline.
    row = pd.DataFrame([record], columns=features)
    return pipeline.predict(row)[0]

#### Run the UDF to make predictions over the test dataset

In [None]:
from snowflake.snowpark import functions as F

# Score rows of the persisted test split with the scalar UDF and show the
# predictions next to the actual labels.
snowdf_test = session.table("HOUSING_TEST")
inputs = snowdf_test.drop("MEDIAN_HOUSE_VALUE")

prediction_col = predict(*inputs).alias('PREDICTION')
actual_col = (F.col('MEDIAN_HOUSE_VALUE')).alias('ACTUAL_LABEL')
snowdf_results = snowdf_test.select(*inputs, prediction_col, actual_col).limit(20)

snowdf_results.to_pandas().head(20)

### Using Vectorized UDFs For Optimal Performance

The UDF above is executed in parallel by Snowflake, but each call predicts a single row. We can improve throughput with a vectorized UDF: Snowpark splits the rows into batches and passes each batch to the UDF as a pandas DataFrame, so the model predicts many rows per call.

In [None]:
import pandas
import sys
import cachetools
import os
from snowflake.snowpark.functions import pandas_udf
from snowflake.snowpark import types as T

# Column names assigned to each incoming batch before it is passed to the model.
features = [
    'LONGITUDE',
    'LATITUDE',
    'HOUSING_MEDIAN_AGE',
    'TOTAL_ROOMS',
    'TOTAL_BEDROOMS',
    'POPULATION',
    'HOUSEHOLDS',
    'MEDIAN_INCOME',
    'OCEAN_PROXIMITY',
]

# Attach the staged pipeline file to the session as an import; read_file below
# opens it from the directory named by the "snowflake_import_directory" option.
session.add_import("@MODELS/housing_fores_reg.joblib")  
@cachetools.cached(cache={})
def read_file(filename):
    """Load a joblib-serialized object from the UDF import directory.

    The result is memoized by ``cachetools.cached``, so repeated calls with
    the same ``filename`` reuse the already loaded object instead of reading
    and deserializing the file again.

    Args:
        filename: Name of the imported file, relative to the directory given
            by the ``snowflake_import_directory`` interpreter option.

    Returns:
        The object stored in the file.

    Raises:
        RuntimeError: If ``snowflake_import_directory`` is not set.
    """
    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        # Fail fast: returning None here would be cached and only surface
        # later as an AttributeError when the caller invokes `.predict`.
        raise RuntimeError(
            "snowflake_import_directory is not set; cannot load " + repr(filename)
        )
    # NOTE: joblib.load unpickles the file, so it must only be pointed at
    # files from a trusted stage.
    with open(os.path.join(import_dir, filename), 'rb') as file:
        return joblib.load(file)

@pandas_udf(max_batch_size=100)
def predict_batch(
    df: T.PandasDataFrame[float, float, float, float,
                          float, float, float, float, str]
) -> T.PandasSeries[float]:
    """Predict median house values for a whole batch of rows.

    The incoming frame's columns are renamed (in place) to ``features`` so
    they match the names the pipeline was fitted on, then the cached pipeline
    predicts all rows of the batch in one call.
    """
    df.columns = features
    pipeline = read_file('housing_fores_reg.joblib')
    return pipeline.predict(df)

Now run the vectorized UDF over the test dataset

In [None]:
from snowflake.snowpark import functions as F

# Score rows of the persisted test split with the vectorized UDF and show the
# predictions next to the actual labels.
snowdf_test = session.table("HOUSING_TEST")
inputs = snowdf_test.drop("MEDIAN_HOUSE_VALUE")

prediction_col = predict_batch(*inputs).alias('PREDICTION')
actual_col = (F.col('MEDIAN_HOUSE_VALUE')).alias('ACTUAL_LABEL')
snowdf_results = snowdf_test.select(*inputs, prediction_col, actual_col).limit(20)

snowdf_results.to_pandas().head(20)