In [1]:
# Base dir for this code
import os
base_dir = os.getcwd()
print(base_dir)

/Users/skhara/Documents/GitHub/snowpark-python-demos/tpcds-customer-lifetime-value


In [3]:
import json
import pandas as pd

from snowflake.snowpark import functions as F
from snowflake.snowpark import version as v
from snowflake.snowpark.session import Session

from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OneHotEncoder
from snowflake.ml.modeling.impute import SimpleImputer

In [4]:
import warnings
warnings.filterwarnings("ignore")

# 1.0 Snowflake Setup

In [8]:
# Ensure that your credentials are stored in creds.json
with open('creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    PASSWORD = data['password']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']

CONNECTION_PARAMETERS = {
   "account": SF_ACCOUNT,
   "user": USERNAME,
   "password": PASSWORD,
}

session = Session.builder.configs(CONNECTION_PARAMETERS).create()

#### Ensure that TPC-DS dataset is available in your environment.

In [8]:
session.sql('CREATE DATABASE IF NOT EXISTS tpcds_xgboost').collect()
session.sql('CREATE SCHEMA IF NOT EXISTS tpcds_xgboost.demo').collect()
session.sql("create or replace warehouse FE_AND_INFERENCE_WH with warehouse_size='3X-LARGE'").collect()
session.sql("create or replace warehouse snowpark_opt_wh with warehouse_size = 'MEDIUM' warehouse_type = 'SNOWPARK-OPTIMIZED'").collect()
session.sql("alter warehouse snowpark_opt_wh set max_concurrency_level = 1").collect()
session.sql("CREATE OR REPLACE STAGE TPCDS_XGBOOST.DEMO.ML_MODELS").collect()
session.use_warehouse('FE_AND_INFERENCE_WH')
session.use_database('tpcds_xgboost')
session.use_schema('demo')

Select either 100 or 10 for the TPC-DS Dataset size to use below. See (https://docs.snowflake.com/en/user-guide/sample-data-tpcds.html)[here] for more information If you choose 100, I recommend >= 3XL warehouse. 

In [9]:
TPCDS_SIZE_PARAM = 10
SNOWFLAKE_SAMPLE_DB = 'SFSALESSHARED_SFC_SAMPLES_PROD3_SAMPLE_DATA' # Name of Snowflake Sample Database might be different...

if TPCDS_SIZE_PARAM == 100: 
    TPCDS_SCHEMA = 'TPCDS_SF100TCL'
elif TPCDS_SIZE_PARAM == 10:
    TPCDS_SCHEMA = 'TPCDS_SF10TCL'
else:
    raise ValueError("Invalid TPCDS_SIZE_PARAM selection")
    
store_sales = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.store_sales')
catalog_sales = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.catalog_sales') 
web_sales = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.web_sales') 
date = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.date_dim')
dim_stores = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.store')
customer = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.customer')
address = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.customer_address')
demo = session.table(f'{SNOWFLAKE_SAMPLE_DB}.{TPCDS_SCHEMA}.customer_demographics')

# 2.0 Data Engineering
We will aggregate sales by customer across all channels(web, store, catalogue) and join that to customer demographic data. 

In [10]:
store_sales_agged = store_sales.group_by('ss_customer_sk').agg(F.sum('ss_sales_price').as_('total_sales'))
web_sales_agged = web_sales.group_by('ws_bill_customer_sk').agg(F.sum('ws_sales_price').as_('total_sales'))
catalog_sales_agged = catalog_sales.group_by('cs_bill_customer_sk').agg(F.sum('cs_sales_price').as_('total_sales'))
store_sales_agged = store_sales_agged.rename('ss_customer_sk', 'customer_sk')
web_sales_agged = web_sales_agged.rename('ws_bill_customer_sk', 'customer_sk')
catalog_sales_agged = catalog_sales_agged.rename('cs_bill_customer_sk', 'customer_sk')

In [11]:
total_sales = store_sales_agged.union_all(web_sales_agged)
total_sales = total_sales.union_all(catalog_sales_agged)

In [12]:
total_sales = total_sales.group_by('customer_sk').agg(F.sum('total_sales').as_('total_sales'))

In [13]:
customer = customer.select('c_customer_sk','c_current_hdemo_sk', 'c_current_addr_sk', 'c_customer_id', 'c_birth_year')

In [14]:
customer = customer.join(address.select('ca_address_sk', 'ca_zip'), customer['c_current_addr_sk'] == address['ca_address_sk'] )
customer = customer.join(demo.select('cd_demo_sk', 'cd_gender', 'cd_marital_status', 'cd_credit_rating', 'cd_education_status', 'cd_dep_count'),
                                customer['c_current_hdemo_sk'] == demo['cd_demo_sk'] )
customer = customer.rename('c_customer_sk', 'customer_sk')

In [15]:
customer.limit(5).to_pandas()

Unnamed: 0,CUSTOMER_SK,C_CURRENT_HDEMO_SK,C_CURRENT_ADDR_SK,C_CUSTOMER_ID,C_BIRTH_YEAR,CA_ADDRESS_SK,CA_ZIP,CD_DEMO_SK,CD_GENDER,CD_MARITAL_STATUS,CD_CREDIT_RATING,CD_EDUCATION_STATUS,CD_DEP_COUNT
0,47565134,2903,11432972,AAAAAAAAOEJMFNCA,1990,11432972,68371,2903,M,S,High Risk,2 yr Degree,0
1,47565135,2457,29478386,AAAAAAAAPEJMFNCA,1966,29478386,70499,2457,M,W,Low Risk,Primary,0
2,47565136,450,23602579,AAAAAAAAAFJMFNCA,1934,23602579,11952,450,F,U,Good,College,0
3,47565137,1315,616770,AAAAAAAABFJMFNCA,1969,616770,54593,1315,M,D,Good,Advanced Degree,0
4,47565138,2064,3437061,AAAAAAAACFJMFNCA,1974,3437061,99310,2064,F,S,Low Risk,2 yr Degree,0


In [16]:
final_df = total_sales.join(customer, on='customer_sk')

In [17]:
# Size of the final DF is around 95 Million.
final_df.count()

62726989

In [18]:
session.use_database('tpcds_xgboost')
session.use_schema('demo')
final_df.write.mode('overwrite').save_as_table('feature_store')

# 3.0 Feature Engineering

In [19]:
session.use_warehouse('snowpark_opt_wh')
session.use_database('tpcds_xgboost')
session.use_schema('demo')

In [20]:
snowdf = session.table("feature_store")
snowdf = snowdf.drop(['CA_ZIP','CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

In [21]:
snowdf.limit(5).to_pandas()

Unnamed: 0,TOTAL_SALES,C_BIRTH_YEAR,CD_GENDER,CD_MARITAL_STATUS,CD_CREDIT_RATING,CD_EDUCATION_STATUS,CD_DEP_COUNT
0,30105.64,1951,F,D,Good,2 yr Degree,0
1,30212.63,1964,F,D,Good,2 yr Degree,0
2,32083.35,1970,F,D,Good,2 yr Degree,0
3,30116.84,1989,F,D,Good,2 yr Degree,0
4,27830.58,1981,F,D,Good,2 yr Degree,0


In [22]:
cat_cols = ['CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

### 3.1 Missing Value Imputation

We can use the SimpleImputer in snowflake.ml.preprocessing to replace missing values with the most frequent.

```python
# SimpleImputer in snowflake.ml.preprocessing
from snowflake.ml.modeling.impute import SimpleImputer
my_imputer = sfml.preprocessing.SimpleImputer(input_cols=['your_column'],
                                output_cols=['your_column'],
                                strategy='constant',
                                fill_value='OTHER')
my_imputer.fit(my_sdf)
my_sdf = my_imputer.transform(my_sdf)
```

In [23]:
# Imputation of Numeric Cols
my_imputer = SimpleImputer(input_cols= num_cols,
                           output_cols= num_cols,
                           strategy='median')
sdf_prepared = my_imputer.fit(snowdf).transform(snowdf)

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "C_BIRTH_YEAR", Type: LongType(), Input Value: 1958.0, Type: <class 'float'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "CD_DEP_COUNT", Type: LongType(), Input Value: 0.0, Type: <class 'float'>


### 3.2 One-Hot Encoding of Categorical Cols

In [24]:
# OHE of Categorical Cols
my_ohe_encoder = OneHotEncoder(input_cols=cat_cols, output_cols=cat_cols, drop_input_cols=True)
sdf_prepared = my_ohe_encoder.fit(sdf_prepared).transform(sdf_prepared)

In [25]:
sdf_prepared.limit(5).to_pandas()

Unnamed: 0,CD_GENDER_F,CD_GENDER_M,CD_MARITAL_STATUS_D,CD_MARITAL_STATUS_M,CD_MARITAL_STATUS_S,CD_MARITAL_STATUS_U,CD_MARITAL_STATUS_W,CD_CREDIT_RATING_Good,CD_CREDIT_RATING_High Risk,CD_CREDIT_RATING_Low Risk,...,CD_EDUCATION_STATUS_2 yr Degree,CD_EDUCATION_STATUS_4 yr Degree,CD_EDUCATION_STATUS_Advanced Degree,CD_EDUCATION_STATUS_College,CD_EDUCATION_STATUS_Primary,CD_EDUCATION_STATUS_Secondary,CD_EDUCATION_STATUS_Unknown,C_BIRTH_YEAR,CD_DEP_COUNT,TOTAL_SALES
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1951,0,30105.64
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1964,0,30212.63
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1970,0,32083.35
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1989,0,30116.84
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1981,0,27830.58


### 3.3 Clean column names

In [27]:
# Cleaning column names to make it easier for future referencing
import re

cols = sdf_prepared.columns
for old_col in cols:
    new_col = re.sub(r'[^a-zA-Z0-9_]', '', old_col)
    new_col = new_col.upper()
    sdf_prepared = sdf_prepared.rename(F.col(old_col), new_col)

In [30]:
# Save the train and test sets as time stamped tables in Snowflake
snowdf_train, snowdf_test = sdf_prepared.random_split([0.8, 0.2], seed=82)
snowdf_train.fillna(0).write.mode("overwrite").save_as_table("tpcds_xgboost.demo.TPC_TRAIN")
snowdf_test.fillna(0).write.mode("overwrite").save_as_table("tpcds_xgboost.demo.TPC_TEST")

# 4.0 ML Modeling
1. Train model using SnowparkML API: https://docs.snowflake.com/developer-guide/snowpark-ml/snowpark-ml-modeling
2. Log model in Snowflake Model Registry: https://docs.snowflake.com/en/developer-guide/snowpark-ml/snowpark-ml-mlops-model-registry

In [28]:
# Since model training is memory extensive, use Snowpark Optimized Warehouse for model training - you get 16x the memory.
session.use_warehouse('snowpark_opt_wh')

In [34]:
# Initialize model registry object in Snowflake
from snowflake.ml.registry import registry
native_registry = registry.Registry(session=session, database_name='tpcds_xgboost', schema_name='demo')

### 4.1 Get data
Using snowpark dataframes for zero memory footprint on local machine

In [31]:
snowdf_train = session.table('tpcds_xgboost.demo.TPC_TRAIN')

In [32]:
# Prepare Data for modeling
feature_cols = snowdf_train.columns
feature_cols.remove('TOTAL_SALES')
target_col = 'TOTAL_SALES'

### 4.2 Initialize Model and Fit

In [33]:
# Define the XGBRegressor and fit the model
xgbmodel = XGBRegressor(n_estimators = 100, random_state=123, max_depth = 3, input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')
xgbmodel.fit(snowdf_train)

The version of package 'snowflake-snowpark-python' in the local environment is 1.12.1, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


<snowflake.ml.modeling.xgboost.xgb_regressor.XGBRegressor at 0x28e79e8e0>

### 4.3 Predict on a small test set

In [35]:
# Score the data using the fitted xgbmodel
snowdf_test = session.table('tpcds_xgboost.demo.TPC_TEST').limit(1000)
sdf_scored = xgbmodel.predict(snowdf_test)

In [36]:
sdf_scored.limit(5).to_pandas()

Unnamed: 0,CD_MARITAL_STATUS_S,CD_CREDIT_RATING_GOOD,CD_EDUCATION_STATUS_COLLEGE,TOTAL_SALES,CD_DEP_COUNT,CD_CREDIT_RATING_LOWRISK,CD_EDUCATION_STATUS_SECONDARY,CD_MARITAL_STATUS_W,CD_GENDER_M,CD_MARITAL_STATUS_U,...,CD_EDUCATION_STATUS_4YRDEGREE,CD_GENDER_F,CD_EDUCATION_STATUS_2YRDEGREE,CD_EDUCATION_STATUS_PRIMARY,CD_MARITAL_STATUS_M,C_BIRTH_YEAR,CD_CREDIT_RATING_HIGHRISK,CD_EDUCATION_STATUS_ADVANCEDDEGREE,CD_CREDIT_RATING_UNKNOWN,PREDICTION
0,0.0,1.0,0.0,33145.34,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1969.0,0.0,0.0,0.0,32331.027344
1,0.0,1.0,0.0,30997.89,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,,0.0,0.0,0.0,32311.972656
2,0.0,1.0,0.0,29971.8,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1987.0,0.0,0.0,0.0,32345.794922
3,0.0,1.0,0.0,31843.34,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1933.0,0.0,0.0,0.0,32341.648438
4,0.0,1.0,0.0,40826.21,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1927.0,0.0,0.0,0.0,32347.556641


### 4.4 Log Model in Snowflake Model Registry
You can reference this model in a separate workflow without training it again.

In [None]:
# Define model name
model_name = "DEMO_TPCDS"
model_version = f"V1_{pd.datetime.now().strftime('%Y_%m_%d')}"

# Let's log the best model trained
model_ver = native_registry.log_model(
    model_name= model_name,
    version_name= model_version,
    model= xgbmodel,
    comment= "Wasn't this super easy?"
)

In [42]:
native_registry.show_models()

Unnamed: 0,created_on,name,database_name,schema_name,comment,owner,default_version_name,versions
0,2024-03-04 14:06:15.643000-08:00,DEMO_TPCDS,TPCDS_XGBOOST,DEMO,,ACCOUNTADMIN,V4_2024_03_04,"[""V1_2024_03_04"",""V4_2024_03_04""]"


# 5.0 Using Model Registry for Inference
The Snowpark Model Registry stores machine learning models as first-class schema-level objects in Snowflake so they can easily be found and used by others in your organization.

Once you have stored a model, you can invoke its methods (equivalent to functions or stored procedures) to perform model operations, such as inference, in a Snowflake virtual warehouse.

### 5.1 View all model versions

In [44]:
model_name = "DEMO_TPCDS"
native_registry.get_model(model_name).show_versions()

Unnamed: 0,created_on,name,comment,database_name,schema_name,module_name,is_default_version,functions,metadata,user_data
0,2024-03-04 14:06:15.841000-08:00,V4_2024_03_04,,TPCDS_XGBOOST,DEMO,DEMO_TPCDS,True,"[""PREDICT""]",{},"{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
1,2024-03-04 14:07:00.605000-08:00,V1_2024_03_04,Wasn't this super easy?,TPCDS_XGBOOST,DEMO,DEMO_TPCDS,False,"[""PREDICT""]",{},"{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."


### 5.2 Get default version

In [45]:
model = native_registry.get_model(model_name).default

### 5.3 Load Data and Run Inference

In [48]:
snowdf_test = session.table('tpcds_xgboost.demo.TPC_TEST')
result_sdf = model_ver.run(snowdf_test, function_name="predict")



### 5.4 Save predictions in Snowflake

In [49]:
session.use_database('tpcds_xgboost')
session.use_schema('demo')
result_sdf.write.mode('overwrite').save_as_table('predictions')

# 6.0 Wrap up

In [7]:
session.close()