In [18]:
{    "tags": [
        "hide-output",
        "hide-input"
    ]
}

VERTA_DOCKER=0
SPARK_UDF=1
PG_UDF=2

def export_as(self, pkg_format=VERTA_DOCKER, **kwargs):
    print(kwargs)
    if pkg_format == SPARK_UDF:
        # Create a spark UDF > load UDF into SparkSession initiatied above. UDF named "xsell_udf"
        import mlflow.sklearn
        import mlflow.pyfunc
        import secrets
        random_dir = secrets.token_hex(4)
        # Construct and save the model
        model_path = "/opt/models/" + random_dir
        mlflow.sklearn.save_model(self.get_model(), model_path)
        # Create udf based on saved model location and load into cluster. UDF named xsell_udf
        spark_udf = mlflow.pyfunc.spark_udf(kwargs["spark_session"], model_path)
        return spark_udf
    elif pkg_format == VERTA_DOCKER:
        pass
    elif pkg_format == PG_UDF:
        pass

In [19]:
from verta.registry._entities import RegisteredModelVersion

RegisteredModelVersion.export_as = export_as

<h1><center>Health Insurance Cross Sell Prediction</center></h1>
We are an Insurance company that provides Health Insurance to our customers. We need to build a model to predict whether the policyholders (customers) from last year will also be interested in Vehicle Insurance that we provide and sell too. 


In [20]:
# Stop spark session if needed
sc.stop()

In [21]:
# Instantiate spark session, code subject to change based on your spark env
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.9'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3.9'
conf = SparkConf().setAppName("VertaSpark").setMaster("spark://ip-172-31-8-213.us-west-2.compute.internal:7077")
sc = SparkContext(conf=conf)
sc.getConf().getAll()
sess = SparkSession(sc)

# Basic Verta setup

In [22]:
import verta, os
HOST = "XXXXX.verta.ai"

PROJECT_NAME = "Insurance Life2Auto Cross-sell"
EXPERIMENT_NAME = "Logistic Regression"
WORKSPACE = "XXXXX"
os.environ['VERTA_EMAIL'] = "XXXXXX"
os.environ['VERTA_DEV_KEY'] = "XXXXXXX"

# Instantiate client and libs

In [23]:
from verta import Client
from verta.utils import ModelAPI

client = Client(HOST)
proj = client.set_project(PROJECT_NAME, workspace=WORKSPACE)
expt = client.set_experiment(EXPERIMENT_NAME)

set email from environment
set developer key from environment
connection successfully established
got existing Project: Insurance Life2Auto Cross-sell
got existing Experiment: Logistic Regression


## Spark UDF

In [24]:
registered_model = client.get_registered_model(name="Life2AutoCrossSell", workspace=WORKSPACE)
model_version=registered_model.get_version(name="v3")

In [25]:
model_as_spark_udf = model_version.export_as(pkg_format=SPARK_UDF, spark_session=sess)

{'spark_session': <pyspark.sql.session.SparkSession object at 0x7f29b8265a90>}


## Grab some data

In [26]:
# grab some test data
import wget
import pandas as pd

test_data_url = "https://verta-demo.s3-us-west-2.amazonaws.com/xselltest.csv"
test_data_filename = wget.detect_filename(test_data_url)
if not os.path.isfile(test_data_filename):
    wget.download(test_data_url)
test = pd.read_csv(test_data_filename)
test = test.drop(['id'], axis=1)

test.loc[test['Gender'] == 'Male', 'Gender'] = 1
test.loc[test['Gender'] == 'Female', 'Gender'] = 0

test.loc[test['Vehicle_Age'] == '> 2 Years', 'Vehicle_Age'] = 2
test.loc[test['Vehicle_Age'] == '1-2 Year', 'Vehicle_Age'] = 1
test.loc[test['Vehicle_Age'] == '< 1 Year', 'Vehicle_Age'] = 0

test.loc[test['Vehicle_Damage'] == 'Yes', 'Vehicle_Damage'] = 1
test.loc[test['Vehicle_Damage'] == 'No', 'Vehicle_Damage'] = 0

In [None]:
# Run UDF on dataset, display as df
cols = test.columns
dfs = sess.createDataFrame(test)
dfs = dfs.withColumn('prediction', model_as_spark_udf(*cols.values))
pandas_df = dfs.toPandas()
pandas_df.sample(25)

In [None]:
test.columns