# Environment Setup

In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.version import VERSION
import snowflake.snowpark as snp
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import snowflake.snowpark.window as W

import pandas as pd
import json
import seaborn as sns

from snowflake.ml.modeling.lightgbm import LGBMClassifier
from snowflake.ml.modeling.metrics import *
from snowflake.ml.modeling.impute import SimpleImputer

## Create Snowpark Session

In [2]:
# Make a Snowpark Connection
# Edit the connection.json before creating the session object below
# Create Snowflake Session object

# Read the connection parameters inside a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open('connection.json') as connection_file:
    connection_parameters = json.load(connection_file)
session = Session.builder.configs(connection_parameters).create()
#session.custom_package_usage_config = {"enabled": True}
#session.add_packages(["numpy==1.26.3"])

# One round trip to fetch the connected user and the server version
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))


Connection Established with the following parameters:
User                        : JOHN
Role                        : "ACCOUNTADMIN"
Database                    : "SCORED_MODEL"
Schema                      : "SCORED_MODEL"
Warehouse                   : "QUERY_WH"
Snowflake version           : 8.3.1
Snowpark for Python version : 1.11.1


## Feature Engineering

This notebook is based on the worksheet found here (https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart), with some modifications. The dataset has multiple rows (statements) per customer ID, so we will undertake some feature engineering to compute the mean, min, max and last values of selected features for each customer ID. The result has one row of features per customer, with customer ID as the unique key.

In [3]:
#Create Snowpark df from shared data
training_df = session.table('"CC_DEFAULT_TRAINING_DATA"."DATA_SHARING_DEMO"."CC_DEFAULT_TRAINING_DATA"')

In [4]:
# Check number of rows (5,531,451 rows)
training_df.count()

5531451

In [5]:
# Check first 5 rows
training_df.show(5)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
num_cols = training_df.columns
num_cols.remove('"customer_ID"')
num_cols

['D_50',
 'D_51',
 'B_9',
 'R_3',
 'D_52',
 'P_3',
 'B_10',
 'D_53',
 'S_5',
 'B_11',
 'S_6',
 'D_54',
 'R_4',
 'S_7',
 'B_12',
 'S_8',
 'D_55',
 'D_56',
 'B_13',
 'R_5',
 'D_58',
 'S_9',
 'B_14',
 'D_59',
 'R_27',
 'B_38',
 'D_108',
 'D_109',
 'D_110',
 'D_111',
 'B_39',
 'D_112',
 'B_40',
 'S_27',
 'D_113',
 'D_114',
 'D_115',
 'D_116',
 'D_117',
 'D_118',
 'D_119',
 'D_120',
 'D_121',
 'D_122',
 'D_123',
 'D_124',
 'D_125',
 'D_126',
 'S_2',
 'P_2',
 'D_39',
 'B_1',
 'B_2',
 'R_1',
 'S_3',
 'D_41',
 'B_3',
 'D_42',
 'D_43',
 'D_44',
 'B_4',
 'D_45',
 'B_5',
 'R_2',
 'D_46',
 'D_47',
 'D_48',
 'D_49',
 'B_6',
 'B_7',
 'B_8',
 'S_15',
 'B_23',
 'D_73',
 'P_4',
 'D_74',
 'D_75',
 'D_76',
 'B_24',
 'R_7',
 'D_77',
 'B_25',
 'B_26',
 'D_78',
 'D_79',
 'R_8',
 'R_9',
 'S_16',
 'D_80',
 'R_10',
 'R_11',
 'B_27',
 'D_81',
 'D_82',
 'S_17',
 'R_12',
 'B_28',
 'R_13',
 'D_83',
 'R_14',
 'R_15',
 'D_84',
 'R_16',
 'B_29',
 'B_30',
 'S_18',
 'D_86',
 'D_87',
 'R_17',
 'R_18',
 'D_88',
 'B_31',


In [7]:
# Cast every column listed in `num_cols` to FLOAT in a single call.
# `with_columns` replaces existing columns that share a name with the new ones,
# so the resulting columns match what one `withColumn` call per name produced,
# without stacking a separate projection for each column.
training_df = training_df.with_columns(
    num_cols,
    [F.col(col_name).cast(T.FloatType()) for col_name in num_cols],
)

For the next section, we are going to perform feature engineering to create three groups of features:
- Selected features averaged over all statements of a customer
- The minimum or maximum of selected features over all statements of a customer
- Selected features taken from the last statement of a customer

In [8]:
# The following cells create the average numerical values for selected features per customer
features_avg = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_28', 'B_29', 'B_30', 'B_32', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_53', 'D_54', 'D_55', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', 'D_84', 'D_86', 'D_91', 'D_92', 'D_94', 'D_96', 'D_103', 'D_104', 'D_108', 'D_112', 'D_113', 'D_114', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_129', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_14', 'R_15', 'R_16', 'R_17', 'R_20', 'R_21', 'R_22', 'R_24', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_18', 'S_22', 'S_23', 'S_25', 'S_26']

In [9]:
feat = [F.col(c) for c in features_avg]

In [10]:
exprs = {x: "avg" for x in features_avg}

In [11]:
# Per-customer mean of every feature in `features_avg`. The aggregate columns
# come back named "AVG(<feature>)" and are renamed to "<feature>_avg".
avg_column_names = {F.col(f"AVG({f})"): f"{f}_avg" for f in features_avg}
df_avg = training_df.groupBy('"customer_ID"').agg(exprs).rename(avg_column_names)

In [12]:
# The following cells create the minimum numerical values for selected features per customer
features_min = ['B_2', 'B_4', 'B_5', 'B_9', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_19', 'B_20', 'B_28', 'B_29', 'B_33', 'B_36', 'B_42', 'D_39', 'D_41', 'D_42', 'D_45', 'D_46', 'D_48', 'D_50', 'D_51', 'D_53', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_62', 'D_70', 'D_71', 'D_74', 'D_75', 'D_78', 'D_83', 'D_102', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_128', 'D_132', 'D_140', 'D_141', 'D_144', 'D_145', 'P_2', 'P_3', 'R_1', 'R_27', 'S_3', 'S_5', 'S_7', 'S_9', 'S_11', 'S_12', 'S_23', 'S_25']

In [13]:
exprs_min = {x: "min" for x in features_min}

In [14]:
# Per-customer minimum of every feature in `features_min`. The aggregate
# columns come back named "MIN(<feature>)" and are renamed to "<feature>_min".
min_column_names = {F.col(f"MIN({f})"): f"{f}_min" for f in features_min}
df_min = training_df.groupBy('"customer_ID"').agg(exprs_min).rename(min_column_names)

In [15]:
df_min.count()

458913

In [16]:
# The following cells create the maximum numerical values for selected features per customer
features_max = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_21', 'B_23', 'B_24', 'B_25', 'B_29', 'B_30', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_52', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_63', 'D_64', 'D_65', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', 'D_84', 'D_91', 'D_102', 'D_105', 'D_107', 'D_110', 'D_111', 'D_112', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_138', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_3', 'R_5', 'R_6', 'R_7', 'R_8', 'R_10', 'R_11', 'R_14', 'R_17', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_7', 'S_8', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']

In [17]:
exprs_max = {x: "max" for x in features_max}

In [18]:
# Per-customer maximum of every feature in `features_max`. The aggregate
# columns come back named "MAX(<feature>)" and are renamed to "<feature>_max".
max_column_names = {F.col(f"MAX({f})"): f"{f}_max" for f in features_max}
df_max = training_df.groupBy('"customer_ID"').agg(exprs_max).rename(max_column_names)

In [19]:
df_max.count()

458913

In [20]:
# The following cells create the last numerical values for selected features per customer, with S_2 being the date field.
features_last = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_28', 'B_29', 'B_30', 'B_32', 'B_33', 'B_36', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63', 'D_64', 'D_65', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83', 'D_86', 'D_91', 'D_96', 'D_105', 'D_106', 'D_112', 'D_114', 'D_119', 'D_120', 'D_121', 'D_122', 'D_124', 'D_125', 'D_126', 'D_127', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_138', 'D_140', 'D_141', 'D_142', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_19', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_16', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27', '"customer_ID"', '"target"']

In [21]:
w = snp.Window.partition_by('"customer_ID"').order_by(F.col('S_2').desc())

In [22]:
# Number each customer's rows using window `w` (S_2 descending), keep the row
# numbered 1, and project only the columns listed in `features_last`.
df_last = (
    training_df
    .withColumn("rn", F.row_number().over(w))
    .filter("rn = 1")
    .select(features_last)
)

In [23]:
df_last.count()

458913

In [24]:
# Now we join these dataframes on customer ID
feature_df = df_min.natural_join(df_avg)

In [25]:
feature_df = feature_df.natural_join(df_max)

In [26]:
feature_df = feature_df.natural_join(df_last)

In [27]:
feature_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
feat_cols = feature_df.columns
feat_cols.remove('"customer_ID"')
feat_cols

['B_2_MIN',
 'B_4_MIN',
 'B_5_MIN',
 'B_9_MIN',
 'B_13_MIN',
 'B_14_MIN',
 'B_15_MIN',
 'B_16_MIN',
 'B_17_MIN',
 'B_19_MIN',
 'B_20_MIN',
 'B_28_MIN',
 'B_29_MIN',
 'B_33_MIN',
 'B_36_MIN',
 'B_42_MIN',
 'D_39_MIN',
 'D_41_MIN',
 'D_42_MIN',
 'D_45_MIN',
 'D_46_MIN',
 'D_48_MIN',
 'D_50_MIN',
 'D_51_MIN',
 'D_53_MIN',
 'D_55_MIN',
 'D_56_MIN',
 'D_58_MIN',
 'D_59_MIN',
 'D_60_MIN',
 'D_62_MIN',
 'D_70_MIN',
 'D_71_MIN',
 'D_74_MIN',
 'D_75_MIN',
 'D_78_MIN',
 'D_83_MIN',
 'D_102_MIN',
 'D_112_MIN',
 'D_113_MIN',
 'D_115_MIN',
 'D_118_MIN',
 'D_119_MIN',
 'D_121_MIN',
 'D_122_MIN',
 'D_128_MIN',
 'D_132_MIN',
 'D_140_MIN',
 'D_141_MIN',
 'D_144_MIN',
 'D_145_MIN',
 'P_2_MIN',
 'P_3_MIN',
 'R_1_MIN',
 'R_27_MIN',
 'S_3_MIN',
 'S_5_MIN',
 'S_7_MIN',
 'S_9_MIN',
 'S_11_MIN',
 'S_12_MIN',
 'S_23_MIN',
 'S_25_MIN',
 'B_1_AVG',
 'B_2_AVG',
 'B_3_AVG',
 'B_4_AVG',
 'B_5_AVG',
 'B_6_AVG',
 'B_8_AVG',
 'B_9_AVG',
 'B_10_AVG',
 'B_11_AVG',
 'B_12_AVG',
 'B_13_AVG',
 'B_14_AVG',
 'B_15_AVG',
 'B_

In [29]:
# Imputation of Numeric Cols (maybe use larger WH)
# Missing values are filled in place (output columns == input columns) with each
# column's most frequent value. At this point `feat_cols` still contains
# '"target"', which is only removed from the list in a later cell.
my_imputer = SimpleImputer(
    input_cols=feat_cols,
    output_cols=feat_cols,
    strategy='most_frequent',
)
my_imputer.fit(feature_df)

<snowflake.ml.modeling.impute.simple_imputer.SimpleImputer at 0x1850757d0>

In [30]:
df_prepared = my_imputer.transform(feature_df)

In [31]:
df_prepared.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [32]:
# Split the data into train and test sets
train_df, test_df = df_prepared.random_split(weights=[0.9, 0.1], seed=0)

In [33]:
print((df_prepared.count(), len(df_prepared.columns)))
print((train_df.count(), len(train_df.columns)))
print((test_df.count(), len(test_df.columns)))

(458913, 471)
(413064, 471)
(45849, 471)


In [34]:
# Separate the label from the model inputs. A new list is built instead of
# calling `feat_cols.remove(...)` so the cell can be re-run safely (a second
# `remove` raises ValueError once '"target"' is gone) and so the list object
# already passed to the imputer is not mutated behind its back.
target_col = '"target"'
feat_cols = [c for c in feat_cols if c != target_col]

In [35]:
session.sql("USE DATABASE SCORED_MODEL").collect()

[Row(status='Statement executed successfully.')]

In [36]:
session.sql("USE SCHEMA SCORED_MODEL").collect()

[Row(status='Statement executed successfully.')]

In [37]:
session.sql("USE WAREHOUSE TRAINING_WH").collect()

[Row(status='Statement executed successfully.')]

The cell below trains the model with the Snowpark ML LGBM Classifier. We call the .fit function over the train_df we just created above

In [38]:
# Error TypeError: 'NoneType' object is not callable
# The saved run of this cell failed inside the fit stored procedure, at
# lightgbm's `_LGBMAssertAllFinite(y)` call (see the recorded traceback).
# NOTE(review): `_LGBMAssertAllFinite` being None looks like a package problem
# in the procedure's environment (e.g. scikit-learn missing or incompatible
# with lightgbm) rather than a data problem — confirm before changing features.

# Train on `feat_cols`, using `target_col` as the label; predictions are written
# to a column named PREDICTION.
lgbmodel = LGBMClassifier(
    input_cols=feat_cols, 
    label_cols=target_col, 
    output_cols='PREDICTION'
    )
lgbmodel.fit(train_df)

SnowparkSQLException: (1300) (1304): 01b1fc48-3202-3c58-0001-994e0001275e: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/home/udf/26824256/udf_py_2081044380.zip/udf_py_2081044380.py", line 76, in compute
    return func(session,arg1,arg2,arg3,arg4,arg5,arg6,arg7)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tbuchhorn/opt/anaconda3/envs/snowpark-ml-lightgbm/lib/python3.11/site-packages/snowflake/ml/modeling/_internal/snowpark_trainer.py", line 226, in fit_wrapper_function
  File "/usr/lib/python_udf/a5b0d72e35bafbb9825066cf2ff3d035a6bd4a07b789f6e132f80ad945a8c509/lib/python3.11/site-packages/lightgbm/sklearn.py", line 1096, in fit
    _LGBMAssertAllFinite(y)
TypeError: 'NoneType' object is not callable
 in function SNOWPARK_TEMP_PROCEDURE_6H4CC82XPD with handler udf_py_2081044380.compute

In [None]:
#Fix the fillna - maybe type float will help
scored_df = lgbmodel.predict(test_df)

In [None]:
# Classification metrics computed on the scored test set, comparing the
# "target" label column with the model's PREDICTION column.
print('Accuracy:', accuracy_score(df=scored_df, y_true_col_names='"target"', y_pred_col_names='PREDICTION'))
print('Precision:', precision_score(df=scored_df, y_true_col_names='"target"', y_pred_col_names='PREDICTION'))
print('Recall:', recall_score(df=scored_df, y_true_col_names='"target"', y_pred_col_names='PREDICTION'))
print('F1:', f1_score(df=scored_df, y_true_col_names='"target"', y_pred_col_names='PREDICTION'))

# Obtaining and plotting a simple confusion matrix
cf_matrix = confusion_matrix(df=scored_df, y_true_col_name='"target"', y_pred_col_name='PREDICTION')

sns.heatmap(cf_matrix, annot=True, fmt='.0f', cmap='Blues')

We are happy with the metrics of the model above, so we can persist the scored test dataset (if we want to use it later) and proceed to registering the model.

In [None]:
scored_df.write.save_as_table(table_name='CC_DEFAULT_RISK_TEST_SCORED', mode='overwrite')
session.table('CC_DEFAULT_RISK_TEST_SCORED').show()

In [None]:
# Plot feature importance (only local)
feat_importance = pd.DataFrame(lgbmodel.feature_importances_,lgbmodel.feature_name_,columns=['FeatImportance'])
feat_importance.sort_values('FeatImportance').tail(10).plot.barh(y='FeatImportance', figsize=(5,15))

## Model Registration

Now that we are happy with the model we have trained, we can simply call the predict function on any new data that is sent from our client, the bank. Previously, we would have used UDFs to perform this task, but with ML on Snowpark, this has been greatly simplified. See more here: https://medium.com/snowflake/ml-on-snowflake-at-scale-with-snowpark-python-and-snowpark-ml-part-2-6491d72a9903

In [None]:
#If you do not have a model registry, create one
# Create a registry and log the model
# `model_registry` must be imported before it is used here; without this import
# the cell raises NameError when the notebook is run top to bottom on a fresh kernel.
from snowflake.ml.registry import model_registry

registry = model_registry.ModelRegistry(session=session, database_name="MODEL_REGISTRY", create_if_not_exists=True)

In [None]:
from snowflake.ml.registry import model_registry

# Open the registry stored in the MODEL_REGISTRY database
registry = model_registry.ModelRegistry(
    session=session,
    database_name="MODEL_REGISTRY",
)

# Log the trained classifier as version "1" with descriptive tags
model_id = registry.log_model(
    model=lgbmodel,
    model_name="cc_default_risk_model",
    model_version="1",
    tags={
        "stage": "testing",
        "classifier_type": "lightgbm.LGBMClassifier",
    },
)

# Reference to the model version that was just logged
model = model_registry.ModelReference(
    registry=registry,
    model_name="cc_default_risk_model",
    model_version="1",
)

#model.set_metric(metric_name="mean_squared_error", metric_value=mse)
#model.set_metric(metric_name="mean_absolute_error", metric_value=mab)
#model.set_metric(metric_name="r2_score", metric_value=r2)

In [None]:
# Compute the accuracy on the scored test set first, then store that number.
# Passing the bare `accuracy_score` name would hand the registry the metric
# function itself instead of a metric value.
test_accuracy = accuracy_score(df=scored_df, y_true_col_names='"target"', y_pred_col_names='PREDICTION')
registry.set_metric(model_name="cc_default_risk_model", model_version="1", metric_name="accuracy_score", metric_value=test_accuracy)

In [None]:
# Let's confirm it was added
registry.list_models().to_pandas()

In [None]:
# Pick a deployment name and deploy
model_name = "cc_default_risk_model"
model_version = "1"
# Deployment name is "<model name><version>_UDF"
model_deployment_name = f"{model_name}{model_version}_UDF"

registry.deploy(
    model_name=model_name,
    model_version=model_version,
    deployment_name=model_deployment_name,
    target_method="predict",
    permanent=True,
    options={"relax_version": True},
)

## Alternate Model Deployment
The alternative is to deploy via a vectorized UDF

In [None]:
# Create a stage for the model if it doesnt exist
session.sql("CREATE STAGE IF NOT EXISTS MODEL_ASSETS").collect()

In [None]:
# For this method, we convert the model to a local one, so we can stage it and deploy to Snowflake
lgbmodel_local = lgbmodel.to_lightgbm()

In [None]:
import joblib

# Let's save our model first
# The local (non-Snowpark) model object is serialized to a file in the current
# working directory before being uploaded.
MODEL_FILE = 'model.joblib'
joblib.dump(lgbmodel_local, MODEL_FILE) # we are just pickling it locally first

# You can also save the pickled object into the stage we created earlier
# NOTE(review): the UDF cell further down imports '@MODEL_ASSETS/model.joblib.gz',
# so this upload is presumably expected to land on the stage gzip-compressed —
# confirm against the `session.file.put` defaults.
session.file.put(MODEL_FILE, "@MODEL_ASSETS", overwrite=True)

In [None]:
import cachetools
from snowflake.snowpark.functions import udf
# Memoize the loader so each distinct filename is read from disk only once
@cachetools.cached(cache={})
def load_model(filename):
    """Load a joblib-serialized object from the UDF import directory.

    The directory is read from ``sys._xoptions["snowflake_import_directory"]``.
    Returns the deserialized object, or ``None`` when that entry is empty.
    """
    import joblib
    import os
    import sys

    import_dir = sys._xoptions["snowflake_import_directory"]
    if not import_dir:
        # Nothing to load from; same implicit result as falling off the end.
        return None

    model_path = os.path.join(import_dir, filename)
    with open(model_path, 'rb') as model_file:
        return joblib.load(model_file)

# Register the UDF via decorator
# One FLOAT input per model feature. `feat_cols` is the feature list built
# earlier in the notebook (customer ID and target removed); the types come from
# the `T` (snowflake.snowpark.types) alias.
@udf(name='batch_predict_cc_default', 
     session=session, 
     replace=True, 
     is_permanent=True, 
     stage_location='@MODEL_ASSETS',
     input_types=[T.FloatType()]*len(feat_cols),
     return_type=T.FloatType(),
     imports=['@MODEL_ASSETS/model.joblib.gz'],
     packages=['pandas','joblib','cachetools','lightgbm', 'numpy', 'scikit-learn'])
def batch_predict_cc_default(test_df: pd.DataFrame) -> pd.Series:
    """Score a batch of rows with the staged model.

    Args:
        test_df: Batch of input rows, one positional column per entry in
            ``feat_cols`` and in the same order.

    Returns:
        The model's predictions for the batch, one value per input row.
    """
    # Need to name the columns because column names aren't passed in to this function
    test_df.columns = feat_cols
    model = load_model('model.joblib.gz')
    return model.predict(test_df) # This is the local LightGBM model's predict(), not Snowpark ML's

In [None]:
from snowflake.snowpark.functions import call_udf
# Score the test set through the registered UDF, passing the model's feature
# columns (`feat_cols`, built earlier in the notebook) in order.
scored_data = test_df.select('"customer_ID"', call_udf("batch_predict_cc_default", [F.col(c) for c in feat_cols]).alias('Prediction'))

In [None]:
scored_data.write.save_as_table(table_name='UDF_TEST_SCORED', mode='overwrite')

In [None]:
# Add the UDF's output as a PREDICTION column, feeding it the feature columns
# from `feat_cols` (the feature list built earlier in the notebook).
test_df_w_preds = test_df.with_column('PREDICTION', batch_predict_cc_default(*feat_cols))
test_df_w_preds.show()

In [None]:
# Test for accuracy
print('Accuracy:', accuracy_score(df=test_df_w_preds, y_true_col_names='"target"', y_pred_col_names='PREDICTION'))

Now we have a UDF deployed in our Snowflake account called "batch_predict_cc_default" that we can use on incoming data.

In [None]:
# FEATURE ENGINEERING LOGIC
import snowflake.snowpark
from snowflake.snowpark.functions import sproc

session.add_packages('snowflake-snowpark-python')

def feature_transform(session: snowflake.snowpark.Session, raw_table: str, output_table: str) -> str:
    """Build one feature row per customer from ``raw_table`` and append it to ``output_table``.

    Four per-customer frames are computed and natural-joined:
    averages, minimums and maximums of selected columns, plus selected columns
    taken from the row that sorts first when ordering by ``S_2`` descending.

    Note that, unlike the interactive cells of this notebook, no FLOAT cast or
    imputation is applied here.

    Args:
        session: Snowpark session used to read and write the tables.
        raw_table: Name of the table holding the raw per-statement rows.
        output_table: Name of the table the features are appended to.

    Returns:
        The string ``"Success"`` once the write has been issued.
    """

    def _aggregate_per_customer(df, features, func):
        # One row per customer with `func` applied to each feature. The
        # generated "FUNC(<feature>)" columns are renamed to "<feature>_<func>".
        return (df
                .groupBy('"customer_ID"')
                .agg({f: func for f in features})
                .rename({F.col(f"{func.upper()}({f})"): f"{f}_{func}" for f in features})
               )

    training_df = session.table(raw_table)
    # Feature engineer raw input
    #Average
    features_avg = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_28', 'B_29', 'B_30', 'B_32', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_53', 'D_54', 'D_55', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', 'D_84', 'D_86', 'D_91', 'D_92', 'D_94', 'D_96', 'D_103', 'D_104', 'D_108', 'D_112', 'D_113', 'D_114', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_129', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_14', 'R_15', 'R_16', 'R_17', 'R_20', 'R_21', 'R_22', 'R_24', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_18', 'S_22', 'S_23', 'S_25', 'S_26']
    df_avg = _aggregate_per_customer(training_df, features_avg, "avg")
    
    # Minimum
    features_min = ['B_2', 'B_4', 'B_5', 'B_9', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_19', 'B_20', 'B_28', 'B_29', 'B_33', 'B_36', 'B_42', 'D_39', 'D_41', 'D_42', 'D_45', 'D_46', 'D_48', 'D_50', 'D_51', 'D_53', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_62', 'D_70', 'D_71', 'D_74', 'D_75', 'D_78', 'D_83', 'D_102', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_128', 'D_132', 'D_140', 'D_141', 'D_144', 'D_145', 'P_2', 'P_3', 'R_1', 'R_27', 'S_3', 'S_5', 'S_7', 'S_9', 'S_11', 'S_12', 'S_23', 'S_25']
    df_min = _aggregate_per_customer(training_df, features_min, "min")
    
    # Maximum
    features_max = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_21', 'B_23', 'B_24', 'B_25', 'B_29', 'B_30', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_52', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_63', 'D_64', 'D_65', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', 'D_84', 'D_91', 'D_102', 'D_105', 'D_107', 'D_110', 'D_111', 'D_112', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_138', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_3', 'R_5', 'R_6', 'R_7', 'R_8', 'R_10', 'R_11', 'R_14', 'R_17', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_7', 'S_8', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']
    df_max = _aggregate_per_customer(training_df, features_max, "max")
    
    # Last
    features_last = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_28', 'B_29', 'B_30', 'B_32', 'B_33', 'B_36', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63', 'D_64', 'D_65', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83', 'D_86', 'D_91', 'D_96', 'D_105', 'D_106', 'D_112', 'D_114', 'D_119', 'D_120', 'D_121', 'D_122', 'D_124', 'D_125', 'D_126', 'D_127', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_138', 'D_140', 'D_141', 'D_142', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_19', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_16', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27', '"customer_ID"']
    # Number each customer's rows by S_2 descending and keep the first one
    w = snp.Window.partition_by('"customer_ID"').order_by(F.col('S_2').desc())
    df_last = training_df.withColumn("rn", F.row_number().over(w)).filter("rn = 1").select(features_last)
    
    # Join
    feature_df = df_min.natural_join(df_avg)
    feature_df = feature_df.natural_join(df_max)
    feature_df = feature_df.natural_join(df_last)
    
    # Append, so repeated calls accumulate rows in the output table
    feature_df.write.save_as_table(output_table, mode="append")
    
    return "Success"

In [None]:
# Register `feature_transform` as a permanent stored procedure. The return type
# is taken from the `T` (snowflake.snowpark.types) alias imported at the top of
# the notebook; a bare `StringType` is never imported and raises NameError.
session.sproc.register(feature_transform, name="feature_transform", replace=True, is_permanent=True, stage_location="@MODEL_ASSETS", packages=['snowflake-snowpark-python'], return_type=T.StringType())

In [None]:
session.call("feature_transform", "CC_DEFAULT_TRAINING_DATA", "TRANSFORMED_TABLE2")