## Imports

In [45]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import snowflake.ml.modeling.preprocessing as snowml
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.snowpark.functions import col
from snowflake.snowpark.functions import udf
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField

warnings.filterwarnings("ignore")

In [46]:
# Read the Snowflake connection parameters from creds.json in the current
# working directory and open a Snowpark session with them.
# os.path.join builds a portable path (the earlier f'/{my_dir}/...' form
# prepended an extra "/" to an already-absolute path), and the context manager
# closes the file handle as soon as the JSON has been parsed.
my_dir = os.getcwd()
with open(os.path.join(my_dir, 'creds.json')) as creds_file:
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

# Train ML Model

In [47]:
# Resize the SSK_RESEARCH warehouse to MEDIUM before the modeling cells below run.
# NOTE(review): no later cell visible here scales the warehouse back down — confirm that is intended.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "MEDIUM"').collect()

[Row(status='Statement executed successfully.')]

In [48]:
# Set the session's current database and schema; the unqualified table names
# used by the session.table(...) calls below are looked up relative to these.
session.use_database('ML_SNOWPARK_CI_CD')
session.use_schema('DATA_PROCESSING')

# ML Modeling

In [6]:
# Training inputs: the label column is named once, and every other column of
# the training table is treated as a model feature.
target_col = 'TARGET'
snowdf_train = session.table('CREDIT_DEFAULT_TRAIN')
feature_cols = snowdf_train.columns
feature_cols.remove(target_col)

In [7]:
# Configure the classifier (features in, TARGET as label, predictions written
# to a PREDICTION column) and train it on the training table.
xgbmodel = XGBClassifier(
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION',
    random_state=123,
)
xgbmodel.fit(snowdf_train)

<snowflake.ml.modeling.xgboost.xgb_classifier.XGBClassifier at 0x16bf4c130>

In [8]:
# Score the test table with the fitted xgbmodel; the result is kept in
# scored_sdf for inspection in the next cell.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
scored_sdf = xgbmodel.predict(snowdf_test)

In [16]:
# Bring at most 5 scored rows into a local pandas DataFrame as a small sample.
test_df = scored_sdf.limit(5).to_pandas()

# Deploying Model for Future Use

Steps to follow:
1. Retrieve the trained model object in your local environment
2. Save the model locally as a `.joblib` file
3. Upload the file to a Snowflake stage
4. Create a UDF that uses the staged model

We can use `to_xgboost()` in order to get the actual xgboost model object which gives us access to all its attributes.

In [35]:
import joblib
import cachetools

### Step 1: Get the model into your local environment

In [36]:
# We can use to_xgboost() in order to get the actual xgboost model object which gives us access to all its attributes.
# Note: despite its name, xgb_file holds that model object, not a file; it is
# written to disk in the next step. The bare name on the last line displays it.
xgb_file = xgbmodel.to_xgboost()
xgb_file

### Step 2: Save the model locally as a .joblib file

In [37]:
# Local file name for the serialized model. The same name is uploaded to the
# stage and is what the scoring UDF loads, so keep the three in sync.
MODEL_FILE = 'model.joblib.gz'
joblib.dump(xgb_file, MODEL_FILE) # serialize the model object to a local file first; it is uploaded in the next step

['model.joblib.gz']

### Step 3: Upload the file to a Snowflake stage

In [39]:
# Create the stage that will hold the serialized model, unless it already exists.
session.sql('CREATE STAGE IF NOT EXISTS ML_SNOWPARK_CI_CD.ML_PROCESSING.ML_MODELS').collect()

[Row(status='Stage area ML_MODELS successfully created.')]

In [41]:
# Upload the local model file to the ML_MODELS stage, replacing any existing
# copy (overwrite=True). auto_compress=False asks for the file to be uploaded
# as-is. The stage is referenced as schema.stage, i.e. relative to the
# session's current database.
session.file.put(MODEL_FILE, "@ML_PROCESSING.ML_MODELS", auto_compress=False, overwrite=True)

[PutResult(source='model.joblib.gz', target='model.joblib.gz', source_size=82544, target_size=82560, source_compression='GZIP', target_compression='GZIP', status='UPLOADED', message='')]

### Step 4: Create a UDF using the model in the stage

In [42]:
# Switch the current schema to ML_PROCESSING so that the unqualified
# '@ML_MODELS' stage references and the UDF name used in the registration
# cell below are resolved in that schema.
session.sql('USE SCHEMA ML_SNOWPARK_CI_CD.ML_PROCESSING').collect()

[Row(status='Statement executed successfully.')]

In [43]:
# Define a simple scoring function
from cachetools import cached

@cached(cache={})
def load_model(model_path: str) -> object:
    """Load the joblib-serialized model stored at ``model_path``.

    The ``cached`` decorator memoizes the result per path, so repeated calls
    with the same path return the already-loaded model instead of reading the
    file again.

    Args:
        model_path: Path of the serialized model file.

    Returns:
        The deserialized model object.
    """
    from joblib import load
    return load(model_path)


def udf_score_xgboost_model_vec_cached(df: pd.DataFrame) -> pd.Series:
    """Score a batch of rows with the staged model.

    Args:
        df: Batch of feature values, one column per entry of ``feature_cols``
            and in the same order (the columns are relabeled positionally).

    Returns:
        A Series holding one prediction per row of ``df``.
    """
    import os
    import sys
    # file-dependencies of UDFs are available in snowflake_import_directory
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    model_name = 'model.joblib.gz'
    # os.path.join (instead of plain string concatenation) yields a valid path
    # whether or not import_dir already ends with a path separator.
    model = load_model(os.path.join(import_dir, model_name))
    # Relabel the columns with the feature names used when the model was trained.
    df.columns = feature_cols
    scored_data = pd.Series(model.predict(df))
    return scored_data

In [44]:
# Register the scoring function as the permanent UDF PREDICT_DEFAULT. The
# staged model file is attached as an import, and every feature column is
# declared as a float input.
udf_clv = session.udf.register(
    func=udf_score_xgboost_model_vec_cached,
    name="PREDICT_DEFAULT",
    session=session,
    is_permanent=True,
    replace=True,
    stage_location='@ML_MODELS',
    imports=['@ML_MODELS/model.joblib.gz'],
    packages=['pandas', 'xgboost', 'joblib', 'cachetools'],
    input_types=[T.FloatType() for _ in feature_cols],
    return_type=T.FloatType(),
)

# Wrap-up

In [55]:
!pip freeze > requirements.txt

In [None]:
# Close the Snowpark session now that the notebook's work is finished.
session.close()