In [1]:
# Import Python packages
import pandas as pd
import plotly.express as px
import json
import sys
import cachetools

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window

In [2]:
# Get account credentials from a json file
with open("data_scientist_auth.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [4]:
# Create stage
session.sql("CREATE STAGE IF NOT EXISTS model_stage").collect()

[Row(status='Stage area MODEL_STAGE successfully created.')]

In [23]:
train_table = session.table(name="train_table")

In [3]:
def train_xgb(
    session: Session,
    training_table: str,
    feature_cols: list,
    target_col: str,
    model_name: str,
) -> T.Variant:

    # Import packages
    from xgboost import XGBClassifier
    from joblib import dump
     # Get training data
    df = session.table(training_table).to_pandas()

    # Set inputs X and outputs y
    X = df[feature_cols]
    y = df[target_col]

    # Train model
    xgb_improved = XGBClassifier(learning_rate= 0.1, max_depth= 4, n_estimators= 20, 
                                 n_jobs= -1, objective= 'binary:logistic')
    # Fit the model
    model = xgb_improved.fit(X,y)

    # Get feature weights
    feature_weights = pd.DataFrame({"Feature": model.feature_names_in_, "Weight": model.coef_}).to_dict()

    # Save model
    dump(model, "/tmp/" + model_name)
    session.file.put(
        "/tmp/" + model_name,
        "@MODEL_STAGE",
        auto_compress=False,
        overwrite=True
    )

    # Return feature contributions
    return feature_weights


In [13]:
train_linreg_snowflake = session.sproc.register(
    func=train_xgb,
    name="sproc_train_xgb",
    is_permanent=True,
    replace=True,
    stage_location="@MODEL_STAGE",
    packages=["snowflake-snowpark-python", "xgboost", "joblib"]
)


The version of package xgboost in the local environment is 1.7.5, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment
The version of package joblib in the local environment is 1.2.0, which does not fit the criteria for the requirement joblib. Your UDF might not work when the package version is different between the server and your local environment


In [25]:
# Specify inputs
training_table = "train_table"
feature_cols = train_table.drop("NextPurchaseDayRange").columns
target_col = "NextPurchaseDayRange"
model_name = "xgb_churn_model.sav"

# Call the training stored procedure
feature_contributions = train_linreg_snowflake(
    session,
    training_table,
    feature_cols,
    target_col,
    model_name
)   

SnowparkSQLException: (1304): 01ac73c6-3200-bd9b-0003-f86a00086102: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "_udf_code.py", line 49, in compute
  File "C:\Users\Ryan Liam\AppData\Local\Temp\ipykernel_25756\238998895.py", line 16, in train_xgb
  File "/usr/lib/python_udf/63cfd480848581e15297c24bba788a10ff504907aa8cc549860afcce53796090/lib/python3.8/site-packages/pandas/core/frame.py", line 3813, in __getitem__
    indexer = self.columns._get_indexer_strict(key, "columns")[1]
  File "/usr/lib/python_udf/63cfd480848581e15297c24bba788a10ff504907aa8cc549860afcce53796090/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 6070, in _get_indexer_strict
    self._raise_if_missing(keyarr, indexer, axis_name)
  File "/usr/lib/python_udf/63cfd480848581e15297c24bba788a10ff504907aa8cc549860afcce53796090/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 6133, in _raise_if_missing
    raise KeyError(f"{not_found} not in index")
KeyError: '[\'"GENDER_Male"\', \'"GENDER_Female"\', \'"MARITAL_STATUS_Single"\', \'"MARITAL_STATUS_Divorced/Seperated"\', \'"MARITAL_STATUS_Married"\', \'"NextPurchaseDayRange"\'] not in index'
 in function SPROC_TRAIN_XGB with handler compute

In [None]:
pd.DataFrame(session.sql("LIST @MODEL_STAGE").collect())