### The training is assumed to be done outside the scope of the app
We'll used the trained model object in the native app

In [6]:
import pandas as pd
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf,  sproc
from snowflake.snowpark.row import Row
from snowflake.snowpark.types import PandasDataFrame, PandasSeries
import os
import pickle

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


import configparser

In [7]:
config = configparser.ConfigParser()
config.read('../sf_account.config')
session = Session.builder.configs(dict(config['DEFAULT'])).create()  


In [8]:
session.sql("use database CUSTOMER_DB").collect()
session.sql("use schema CUSTOMER_DB.CHURN").collect()
session.sql("create stage if not exists CUSTOMER_DB.CHURN.MODELS").collect()

[Row(status='MODELS already exists, statement succeeded.')]

In [9]:
training_table = "CUSTOMER_DB.CHURN.TELCO_LABELED"

In [10]:
parameters = {"features":
             [
                "MONTHLY CHARGES",
                 "TOTAL CHARGES",
             ],
             "target": "CHURN VALUE",
             "test_size": 0.2,
             "random_state": 666}

trained_model_path = "@MODELS/regressor.pkl"

In [62]:
# Train model in Snowflake with a sproc




@sproc(name = "simple_churn_training",
       is_permanent=True, 
       stage_location = "@MODELS",
       replace = True,
       packages = ['pandas',
                   'scikit-learn',
                   'snowflake-snowpark-python']
      )
def handler(session: Session, 
          input_table: str,
          trained_model_path: str,
          parameters: dict) -> dict:
    """
    All in one training function for simple churn prediction
    This will create a model in a stage that can be used later
    """
    
    # parse out names
    db_name = input_table.split(".")[0]
    schema_name = input_table.split(".")[1]
    
    
    #data = session.table(input_table).collect()
    row_objs = session.table(input_table).collect()
    dict_array = list(map(Row.as_dict, row_objs))
    data = pd.DataFrame(dict_array)
    
    
    X = data[parameters["features"]]
    #X = data[["MONTHLY CHARGES", "TOTAL CHARGES"]]
    y = data[parameters["target"]]
    #y = data[["CHURN VALUE"]]
    
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size = parameters['test_size'], random_state = parameters['random_state']
    )
    
    
    # ---  Encode features  ---
    # TODO - to use more features
    
    
    # ---  train  ---
    
    regressor = sklearn.ensemble.RandomForestRegressor()
    regressor.fit(X_train, y_train)
    
    
    # ---  save trained model  ---
    trained_model_name = trained_model_path.split("/")[-1].replace(" ", "")
    trained_model_stage = trained_model_path.rsplit("/", 1)[0]    
    
    model_path = './tmp'
    
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        
    model_file = os.path.join(model_path, trained_model_name)
    with open(model_file, 'wb') as pickle_file:
        pickle.dump(regressor, pickle_file)
        
    put_result = session.file.put(model_file, 
                                  trained_model_stage,
                                  overwrite=True,
                                  auto_compress=False)
    
    print(put_result)
    
    y_pred = regressor.predict(X_test)
    
    score = sklearn.metrics.r2_score(y_test, y_pred)
    
    return {
        "put_status": put_result,
        "r2_score": score
    }
    

    



In [63]:
# test proc

call_proc = f"""call simple_churn_training('{training_table}', '{trained_model_path}', {parameters}) """
print(call_proc)
session.sql(call_proc).collect()

call simple_churn_training('CUSTOMER_DB.CHURN.TELCO_LABELED', '@MODELS/regressor.pkl', {'features': ['MONTHLY CHARGES', 'TOTAL CHARGES'], 'target': 'CHURN VALUE', 'test_size': 0.2, 'random_state': 666}) 


[Row(SIMPLE_CHURN_TRAINING='{\n  "put_status": [\n    [\n      "regressor.pkl",\n      "regressor.pkl",\n      24038455,\n      24038464,\n      "NONE",\n      "NONE",\n      "UPLOADED",\n      ""\n    ]\n  ],\n  "r2_score": 0.9648660683409334\n}')]

In [14]:
# Test the model udf
# (to be created in the app)
model_filename = trained_model_path.split("/")[-1].replace(" ", "")
model_get_results = session.file.get(trained_model_path, "./tmp")

with open(f'./tmp/{model_filename}', 'rb') as model_file:
    regressor = pickle.load(model_file)

    
@udf(name = 'predict_churn', 
    is_permanent = True,
    stage_location="@MODELS",
    replace=True,
    packages = ['pandas',
                   'scikit-learn',
                   'snowflake-snowpark-python'],
     session = session
    )
def handler(df: PandasDataFrame[float, float]) -> PandasSeries[float]:
    # assumes monthly charges first, total chargest second
    df.columns = ["MONTHLY CHARGES", "TOTAL CHARGES"]
    
    return regressor.predict(df)
    



In [15]:
call_udf = """select predict_churn("MONTHLY CHARGES", "TOTAL CHARGES")
from customer_db.churn.telco_unlabeled"""

print(call_udf)
session.sql(call_udf).collect()

select predict_churn("MONTHLY CHARGES", "TOTAL CHARGES")
from customer_db.churn.telco_unlabeled


[Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONTHLY CHARGES", "TOTAL CHARGES")=0.0),
 Row(PREDICT_CHURN("MONT