# Deploying a scikit-learn model on Verta

Within Verta, a "Model" can be any arbitrary function: a traditional ML model (e.g., sklearn, PyTorch, TF, etc); a function (e.g., squaring a number, making a DB function etc.); or a mixture of the above (e.g., pre-processing code, a DB call, and then a model application.) See more [here](https://docs.verta.ai/verta/registry/concepts).

This notebook provides an example of how to deploy a scikit-learn model on Verta as a Verta Standard Model either via  convenience functions or by extending [VertaModelBase](https://verta.readthedocs.io/en/master/_autogen/verta.registry.VertaModelBase.html?highlight=VertaModelBase#verta.registry.VertaModelBase).

## 0. Imports

In [None]:
# restart your notebook if prompted on Colab
!python -m pip install verta
!python -m pip install wget

In [1]:
from __future__ import print_function

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

import itertools
import os
import time

import numpy as np
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

### 0.1 Verta import and setup

In [None]:
import os

# Ensure credentials are set up, if not, use below
# os.environ['VERTA_EMAIL'] = 
# os.environ['VERTA_DEV_KEY'] = 
# os.environ['VERTA_HOST'] = 

from verta import Client

PROJECT_NAME = "Census-example"
EXPERIMENT_NAME = "sklearn-example"
client = Client()
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

## 1. Model Training

### 1.1 Load training data

In [3]:
import wget

train_data_url = "http://s3.amazonaws.com/verta-starter/census-train.csv"
train_data_filename = wget.detect_filename(train_data_url)
if not os.path.isfile(train_data_filename):
    wget.download(train_data_url)

test_data_url = "http://s3.amazonaws.com/verta-starter/census-test.csv"
test_data_filename = wget.detect_filename(test_data_url)
if not os.path.isfile(test_data_filename):
    wget.download(test_data_url)

In [None]:
from verta.dataset import Path

dataset = client.set_dataset(name="Census Income")
dataset_version = dataset.create_version(
    Path(train_data_filename),  # could also be S3("s3://verta-starter")
)

In [5]:
df_train = pd.read_csv(train_data_filename)
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv(test_data_filename)
X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:, -1]


df_train.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,>50k
0,44,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,53,7298,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,49,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,53,0,1485,40,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


#### Define hyperparams

In [6]:
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

### 1.3 Train/test code

In [None]:
def run_experiment(hyperparams):
    # create object to track experiment run
    run = client.set_experiment_run()
    
    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')
    
    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)
    
    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))
    
    run.log_dataset_version("train", dataset_version)
    
    run.log_code()
    
# NOTE: run_experiment() could also be defined in a module, and executed in parallel
for hyperparams in hyperparam_sets:
    run_experiment(hyperparams)

In [8]:
best_run = expt.expt_runs.sort("metrics.val_acc", descending=True)[0]
print("Validation Accuracy: {:.4f}".format(best_run.get_metric("val_acc")))

best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

Validation Accuracy: 0.8014
Hyperparameters: {'C': 0.0001, 'max_iter': 15, 'solver': 'lbfgs'}


In [9]:
model = linear_model.LogisticRegression(multi_class='auto', **best_hyperparams)
model.fit(X_train, y_train)
train_acc = model.score(X_train, y_train)
print("Training accuracy: {:.4f}".format(train_acc))

Training accuracy: 0.7902


## 2. Register Model for deployment

In [None]:
registered_model = client.get_or_create_registered_model(
    name="census-sklearn-example", labels=["tabular", "sklearn", "pandas"])

### Register a serialized version of the model using the VertaModelBase

In [11]:
import cloudpickle
with open("model.pkl", "wb") as f:
    cloudpickle.dump(model, f)

From verta>=0.22.2, it is possible to pass pandas DataFrames as an argument to batch_predict and get a DataFrame in return.

**NOTE:** *batch_predict* function should take a pandas DataFrame as a parameter and return a DataFrame

For more details, visit https://docs.verta.ai/verta/deployment/guides/batch-predictions

In [12]:
from verta.registry import VertaModelBase, verify_io

class CensusIncomeClassifier(VertaModelBase):
    def __init__(self, artifacts):
        self.logreg = cloudpickle.load(open(artifacts["serialized_model"], "rb"))
        
    @verify_io
    def predict(self, input):
        if len(input) == 1:
            return list(map(float,self.logreg.predict([input])))
        return list(map(float,self.logreg.predict(input)))

    def batch_predict(self, df):
        df['preds>50k'] = list(map(float,self.logreg.predict(df)))
        return df
    
    # Optional: populates the model playground
    def describe(self):
        """Return a description of the service."""
        return {
            "method": "predict",
            "args": ",".join(list(X_train.columns)),
            "returns": "income_label",
            "description": """
                Predicts whether a person has >50k income based on census data.
            """,
            "input_description": """
                Batch of census information, one sample per entry.
            """,
            "output_description": """
                Binary classification, with 1 representing the prediction that the
                person earns more than 50k a year.
            """
        }
    
    # Optional: populates the model playground
    def example(self):
        """Return example input json-serializable data."""
        return [
            [71.67822567370767, 0.0, 0.0, 99.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [6.901547652701675, 0.0, 1887.0, 50.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [72.84132724180968, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        ]

In [13]:
artifacts_dict = {"serialized_model" : "model.pkl"}
clf = CensusIncomeClassifier(artifacts_dict)
print(f"predict_fn: {clf.predict(clf.example())}")
print(f"batch_predict_fn: {clf.batch_predict(X_test.iloc[:50])['preds>50k'].to_list()}")

predict_fn: [0.0, 1.0, 0.0]
batch_predict_fn: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preds>50k'] = list(map(float,self.logreg.predict(df)))


In [None]:
from verta.environment import Python
from verta.utils import ModelAPI

model_version = registered_model.create_standard_model(
    model_cls=CensusIncomeClassifier,
    environment=Python(requirements=["scikit-learn", "pandas"]),
    code_dependencies=[],
    artifacts=artifacts_dict,
    model_api=ModelAPI(X_train, y_train),
    name="v1",
)

## 3. Deploy model to endpoint

In [16]:
census_endpoint = client.get_or_create_endpoint("census-sklearn-example-v1")
census_endpoint.update(model_version, wait=True)

waiting for update...............................


{'components': [{'build_id': 14390, 'ratio': 1, 'status': 'running'}],
 'creator_request': {'enable_prediction_authz': False, 'name': 'production'},
 'date_created': '2023-03-29T22:20:20.000Z',
 'date_updated': '2023-03-29T22:22:48.000Z',
 'status': 'active',
 'stage_id': 14657}

In [17]:
deployed_model = census_endpoint.get_deployed_model()

*Predict* function only allows inputs that is JSON serializable thus usually lists. It doesn't accept DataFrames

In [18]:
deployed_model.predict(X_test.values.tolist()[:5])

[0.0, 0.0, 0.0, 0.0, 0.0]

*batch_predict* allows DataFrame inputs and is a better way to efficiently generate predictions for larger volumes of data compared to *predict* 

In [19]:
deployed_model.batch_predict(X_test)

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,preds>50k
0,10.991875,0.0,0.0,40.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,4.235904,0.0,0.0,50.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,35.806474,0.0,0.0,43.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,65.316701,0.0,0.0,50.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,87.053437,0.0,0.0,50.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9040,59.172154,0.0,0.0,55.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9041,27.202254,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9042,89.191800,0.0,0.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9043,64.818989,0.0,0.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#delete endpoint
#census_endpoint.delete()

---