# Deploying Models with Multiple ML Frameworks on Verta

Within Verta, a "Model" can be any arbitrary function: a traditional ML model (e.g., sklearn, PyTorch, TF, etc.); a function (e.g., squaring a number, making a DB call, etc.); or a mixture of the above (e.g., pre-processing code, a DB call, and then a model application). See more [here](https://docs.verta.ai/verta/registry/concepts).

This notebook provides an example of how to deploy an XGBoost + scikit-learn model on Verta as a Verta Standard Model by extending [VertaModelBase](https://verta.readthedocs.io/en/master/_autogen/verta.registry.VertaModelBase.html?highlight=VertaModelBase#verta.registry.VertaModelBase). The same pattern can be used to deploy models made up of any number of ML frameworks.

## 0. Imports

In [1]:
from __future__ import print_function

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import os
import time

import six

import numpy as np
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
import xgboost as xgb

### 0.1 Verta import and setup

In [2]:
# restart your notebook if prompted on Colab
# Install the Verta client only when it cannot already be imported.
try:
    import verta
except ImportError:
    # IPython shell escape: runs pip as a shell command from the notebook.
    !pip install verta

In [3]:
import os

# Ensure credentials are set up in the environment; if they are not,
# uncomment the lines below and fill in your own values.
# os.environ['VERTA_EMAIL'] = 
# os.environ['VERTA_DEV_KEY'] = 
# os.environ['VERTA_HOST'] = 

from verta import Client
PROJECT_NAME = "Census"
EXPERIMENT_NAME = "sklearn + xgboost"
# Indexing os.environ raises KeyError when VERTA_HOST is not set.
client = Client(os.environ['VERTA_HOST'])
# Keep the returned handles: `expt` is queried later to find the best run.
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

## 1. Model training

### 1.1 Prepare Data

In [4]:
# Import wget, installing it first when it is not available yet.
try:
    import wget
except ImportError:
    # IPython shell escape: runs pip as a shell command from the notebook.
    !pip install wget  # you may need pip3
    # Retry the import now that the package has been installed.
    import wget

In [5]:
def _download_if_missing(url):
    """Download `url` into the working directory unless the file already exists.

    Returns the local filename that wget derives from the URL.
    """
    filename = wget.detect_filename(url)
    if not os.path.isfile(filename):
        wget.download(url)
    return filename


train_data_url = "http://s3.amazonaws.com/verta-starter/census-train.csv"
train_data_filename = _download_if_missing(train_data_url)

test_data_url = "http://s3.amazonaws.com/verta-starter/census-test.csv"
test_data_filename = _download_if_missing(test_data_url)

In [6]:
# Load the census training set from the downloaded CSV.
df_train = pd.read_csv(train_data_filename)

# Both tasks share the same features: every column except the two targets.
target_columns = ["hours-per-week", ">50k"]

# Task 1: predict hours per week.
X_train_hpw = df_train.drop(columns=target_columns)
y_train_hpw = df_train["hours-per-week"]

# Task 2: predict the ">50k" column.
X_train_income = df_train.drop(columns=target_columns)
y_train_income = df_train[">50k"]

# Preview the first rows of the feature frame.
X_train_income.head()

### 1.2 Define hyperparams

In [7]:
# Candidate values for each LogisticRegression hyperparameter.
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}

# Expand the candidates into one dict per combination (Cartesian product).
# Iterating the dict yields its keys in the same order as .values().
hyperparam_sets = [
    dict(zip(hyperparam_candidates, combination))
    for combination in itertools.product(*hyperparam_candidates.values())
]

### 1.3 Train/test code

In [8]:
def run_experiment(hyperparams, X_train, y_train):
    """Train one LogisticRegression configuration and log it as a Verta run.

    Args:
        hyperparams: keyword arguments forwarded to
            ``linear_model.LogisticRegression``.
        X_train: training features; 20% of the rows are held out for
            validation.
        y_train: training labels aligned with ``X_train``.

    Side effects:
        Creates a new experiment run on the module-level ``client``, logs
        the hyperparameters and the ``val_acc`` metric to it, and prints both.
    """
    # create object to track experiment run
    run = client.set_experiment_run()

    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')

    # create and train model on the training portion of the split only;
    # fitting on the full X_train would leak the held-out rows into training
    # and inflate the validation accuracy
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))


# NOTE: run_experiment() could also be defined in a module, and executed in parallel
for hyperparams in hyperparam_sets:
    run_experiment(hyperparams, X_train_hpw, y_train_hpw)

In [9]:
# Rank the logged runs by validation accuracy, best first, and keep the winner.
runs_by_val_acc = expt.expt_runs.sort("metrics.val_acc", descending=True)
best_run = runs_by_val_acc[0]
best_val_acc = best_run.get_metric("val_acc")
print("Validation Accuracy: {:.4f}".format(best_val_acc))

# The winning hyperparameters are reused to fit the final model.
best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

In [10]:
# Refit on the full training set using the best hyperparameters.
model = linear_model.LogisticRegression(multi_class='auto', **best_hyperparams)
model.fit(X_train_hpw, y_train_hpw)

# Predicted hours-per-week for every training row.
predicted_hpw = model.predict(X_train_hpw)

# Accuracy measured on the same data the model was fitted on.
train_acc = model.score(X_train_hpw, y_train_hpw)
print("Training accuracy: {:.4f}".format(train_acc))

In [11]:
# Display the predicted hours-per-week as a frame that shares the index of
# the training features.
pd.DataFrame(predicted_hpw, index=X_train_hpw.index)

In [12]:
# Wrap the predicted hours-per-week in a single-column frame that shares the
# index of the income features, so the column-wise concat lines up row by row.
predicted_hpw_column = pd.DataFrame(
    predicted_hpw,
    columns=["predicted_hpw"],
    index=X_train_income.index,
)

# Income features plus the predicted hours-per-week as an extra column.
X_train_income_prediction = pd.concat(
    [X_train_income, predicted_hpw_column], axis=1)

In [13]:
# Pack the augmented features and the ">50k" labels into an XGBoost DMatrix.
dtrain = xgb.DMatrix(X_train_income_prediction, label=y_train_income)

In [14]:
# Candidate XGBoost parameters; ParameterGrid yields one dict per combination.
# NOTE(review): num_class is fixed at 10 while the label used for this model is
# the ">50k" column - confirm it matches the number of label classes.
xgb_param_candidates = {
    'eta': [0.5, 0.7],
    'max_depth': [1, 2, 3],
    'num_class': [10],
}
grid = model_selection.ParameterGrid(xgb_param_candidates)

In [15]:
# Switch the client to a separate experiment for the XGBoost runs.
# The returned experiment handle is not stored here.
EXPERIMENT_NAME = "XGBoost"
client.set_experiment(EXPERIMENT_NAME)

In [16]:
def run_experiment(hyperparams, X_train, y_train):
    """Cross-validate one XGBoost configuration and log it as a Verta run.

    Args:
        hyperparams: booster parameters passed to ``xgb.cv``.
        X_train: training features for cross validation.
        y_train: training labels aligned with ``X_train``.

    Side effects:
        Creates a new experiment run on the module-level ``client``. Logs the
        hyperparameters, one observation per CV metric per boosting round,
        and the final ``val_error`` metric, then prints a summary line.
    """
    run = client.set_experiment_run()

    # log hyperparameters
    run.log_hyperparameters(hyperparams)

    # build the DMatrix from the arguments, so the function evaluates the
    # data it is given instead of silently relying on a notebook global
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # run cross validation on hyperparameters
    cv_history = xgb.cv(hyperparams, dtrain,
                        nfold=5,
                        metrics=("merror", "mlogloss"))

    # log observations from each iteration
    for _, iteration in cv_history.iterrows():
        # Series.items() replaces iteritems(), which was removed in pandas 2.0
        for obs, val in iteration.items():
            run.log_observation(obs, val)

    # log error from final iteration, read from the history directly rather
    # than from the loop variable left over after the loop
    final_val_error = cv_history['test-merror-mean'].iloc[-1]
    run.log_metric("val_error", final_val_error)
    print("{} Mean error: {:.4f}".format(hyperparams, final_val_error))


# NOTE: run_experiment() could also be defined in a module, and executed in parallel
for hyperparams in grid:
    run_experiment(
        hyperparams, X_train_income_prediction.to_numpy(), y_train_income.to_numpy())

In [17]:
# Pick the XGBoost run with the lowest cross-validation error. The earlier
# `best_hyperparams` still holds the LogisticRegression settings
# (C / solver / max_iter), which are not XGBoost parameters.
xgb_expt = client.set_experiment(EXPERIMENT_NAME)
best_xgb_run = xgb_expt.expt_runs.sort("metrics.val_error", descending=False)[0]
best_xgb_hyperparams = dict(best_xgb_run.get_hyperparameters())

# NOTE(review): num_class is dropped on the assumption that XGBClassifier
# derives the class count from the labels passed to fit() - confirm against
# the installed xgboost version.
best_xgb_hyperparams.pop("num_class", None)

income_model = xgb.XGBClassifier(**best_xgb_hyperparams)
income_model.fit(X_train_income_prediction.to_numpy(), y_train_income.to_numpy())

## 2. Register Model for deployment

In [18]:
# Look up the registered model named "census", creating it if it does not
# exist yet (per the method name), tagged with both framework labels.
registered_model = client.get_or_create_registered_model(
    name="census", labels=["xgboost", "sklearn"])

In [19]:
# Give the fitted LogisticRegression a descriptive name to sit alongside
# `income_model`.
hpw_model = model

In [20]:
from verta.registry import VertaModelBase

class CensusTwoStep(VertaModelBase):
    """Two-stage census model.

    Stage one predicts hours-per-week from the input features. Stage two
    predicts income from those features plus the stage-one prediction
    appended as an extra column.
    """

    def __init__(self, artifacts):
        """Load both pickled models.

        Args:
            artifacts: mapping with the keys ``"hpw_model"`` and
                ``"income_model"``, each holding the path of a
                cloudpickle file.
        """
        import cloudpickle

        # NOTE: unpickling executes arbitrary code; only load trusted artifacts.
        # The context managers close each file as soon as it has been read.
        with open(artifacts["hpw_model"], "rb") as model_file:
            self.hpw_model = cloudpickle.load(model_file)
        with open(artifacts["income_model"], "rb") as model_file:
            self.income_model = cloudpickle.load(model_file)

    def predict(self, batch_input):
        """Run the two-stage prediction for every entry of ``batch_input``.

        Args:
            batch_input: iterable whose entries are each a 2-D collection
                of feature rows (rows x features).

        Returns:
            A list with one income-model prediction array per entry.
        """
        import numpy as np

        results = []
        for one_input in batch_input:
            predicted_hpw = self.hpw_model.predict(one_input)
            # Append the stage-one prediction as a trailing feature column.
            augmented = np.concatenate(
                (np.array(one_input), np.reshape(predicted_hpw, (-1, 1))),
                axis=1)
            results.append(self.income_model.predict(augmented))
        return results

In [21]:
import cloudpickle

# Serialize both models. The with-blocks flush and close each file before
# CensusTwoStep reads it back, rather than relying on garbage collection of
# the unreferenced file objects.
with open("income_model.pkl", "wb") as model_file:
    cloudpickle.dump(income_model, model_file)
with open("hpw_model.pkl", "wb") as model_file:
    cloudpickle.dump(hpw_model, model_file)

# Build the model locally from the pickles, using the same artifact keys as
# the registered version.
my_model = CensusTwoStep(
    {
        "hpw_model": "hpw_model.pkl",
        "income_model": "income_model.pkl",
    })

In [22]:
# Smoke-test the local model on a single batch made of the first five
# training rows.
my_model.predict([X_train_hpw.values.tolist()[:5]])

In [23]:
from verta.environment import Python

# Register the two-stage model as a new version. Requirements are pip
# specifiers, so use the PyPI distribution name "scikit-learn" rather than
# the import name "sklearn".
model_version = registered_model.create_standard_model(
    model_cls=CensusTwoStep,
    environment=Python(requirements=["scikit-learn", "xgboost"]),
    artifacts={
        "hpw_model": hpw_model,
        "income_model": income_model,
    },
    name="v6",
)

In [24]:
# Display the registered model version as the cell output.
model_version

## 3. Deploy model to endpoint

In [25]:
# Fetch the "census-multiple" endpoint, creating it if needed (per the method
# name), and point it at the registered model version.
# NOTE(review): wait=True presumably blocks until the update finishes - confirm.
census_multiple_endpoint = client.get_or_create_endpoint("census-multiple")
census_multiple_endpoint.update(model_version, wait=True)

In [26]:
# Query the deployed endpoint with the same five-row batch used for the
# local check.
deployed_model = census_multiple_endpoint.get_deployed_model()
deployed_model.predict([X_train_hpw.values.tolist()[:5]])

---