# Monitoring Regression Model - Integrated Registry, Endpoint, Monitoring


Verta can automatically monitor any model deployed via the Verta deployment system. 

This notebook shows how a regression model on tabular data can be monitored in Verta.

## 0. Imports

In [None]:
# restart your notebook if prompted on Colab
#!python -m pip install verta
#!python -m pip install wget

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

### 0.1 Verta import and setup

In [2]:
import os

# Verta credentials are read from the environment. Never commit real values to
# a notebook: export them in your shell, or fill them in locally (and clear
# them again before sharing) using the lines below.
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''
# os.environ['VERTA_HOST'] = ''

# Fail early with a readable message instead of a bare KeyError or an
# authentication error further down the notebook.
missing_vars = [
    name for name in ('VERTA_EMAIL', 'VERTA_DEV_KEY', 'VERTA_HOST')
    if not os.environ.get(name)
]
if missing_vars:
    raise EnvironmentError(
        "Missing Verta credentials in environment: " + ", ".join(missing_vars)
    )

from verta import Client
client = Client(os.environ['VERTA_HOST'])

got VERTA_EMAIL from environment
got VERTA_DEV_KEY from environment
connection successfully established


## 1. Model Training

### 1.1 Load training data

In [3]:
# Fetch the Melbourne housing CSV once, then always read the local copy
import wget

melbourne_file_path = "melb-data.csv"
data_url = "http://s3.amazonaws.com/verta-starter/" + melbourne_file_path

# Skip the download when the file is already present in the working directory
if not os.path.isfile(melbourne_file_path):
    wget.download(data_url)

melbourne_data = pd.read_csv(melbourne_file_path)

In [4]:
# Keep only rows with no missing values
melbourne_data = melbourne_data.dropna(axis=0)

# Feature columns used by the model; the target is the sale price
melbourne_features = [
    'Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
    'Distance', 'YearBuilt', 'Car', 'Propertycount',
]
X = melbourne_data[melbourne_features]
y = melbourne_data["Price"]

# Hold out a validation set for both features and target. The split is random;
# fixing random_state makes it identical on every run of this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [5]:
# Express prices in units of $1M.
# NOTE: this rebinds y_train / y_test, so running the cell a second time
# without re-running the split above rescales the targets again.
y_train, y_test = y_train / 1e6, y_test / 1e6

### 1.2 Train/test code

In [6]:
# Fit a random forest with a fixed seed; fit() returns the estimator itself
forest_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)

# Mean absolute error on the held-out rows, in the same $1M units as the targets
melb_preds = forest_model.predict(X_test)
print(mean_absolute_error(y_test, melb_preds))

0.2233983346137293


## 2. Register Model for deployment

In [7]:
import cloudpickle

# Serialize the trained forest to disk; this file is the artifact the model class loads
with open("model.pkl", "wb") as model_file:
    cloudpickle.dump(forest_model, model_file)

In [8]:
from verta.registry import VertaModelBase, verify_io

class HousingPriceRegressor(VertaModelBase):
    """Wrapper around a pickled regressor, in the form Verta's registry expects.

    The wrapped object is loaded from the file path stored under the
    ``"serialized_model"`` key of ``artifacts``.
    """

    def __init__(self, artifacts):
        """Load the serialized model.

        Args:
            artifacts: mapping with a ``"serialized_model"`` entry holding the
                path of a cloudpickle file.
        """
        # Context manager guarantees the file handle is closed after loading
        # (the bare open() call it replaces leaked the handle).
        with open(artifacts["serialized_model"], "rb") as f:
            self.model = cloudpickle.load(f)

    @verify_io  # presumably validates input/output types for serving; confirm in Verta docs
    def predict(self, batch_input):
        """Return the wrapped model's predictions for ``batch_input`` as a plain list."""
        return self.model.predict(batch_input).tolist()

In [9]:
# Map the artifact key the model class reads to the pickle written earlier
artifacts_dict = {"serialized_model" : "model.pkl"}
# Instantiate the wrapper locally to check it loads the artifact
clf = HousingPriceRegressor(artifacts_dict)
# Predict on the first five held-out rows, passed as plain nested lists
clf.predict(X_test.values.tolist()[:5])

[0.9889150000000002,
 0.5177234399999994,
 0.8662029999999997,
 1.5177249999999995,
 0.732605]

In [10]:
# Look up the registered model by name, creating it if needed
# (the recorded run created it).
registered_model = client.get_or_create_registered_model(
    name="melbourne-housing-data")

created new RegisteredModel: melbourne-housing-data in workspace: Test-Mar7


In [None]:
from verta.environment import Python
from verta.utils import ModelAPI

# Create version "v1" from the model class and its pickle artifact.
# NOTE(review): "scikit-learn" is unpinned; consider pinning it to the version
# used for training so the pickle loads under the same library at serving time.
# ModelAPI is built from the training features and targets — presumably to
# describe the input/output schema; confirm against the Verta docs.
model_version = registered_model.create_standard_model(
    model_cls=HousingPriceRegressor,
    environment=Python(requirements=["scikit-learn"]),
    artifacts=artifacts_dict,
    name="v1",
    model_api=ModelAPI(X_train, y_train)
)

created new ModelVersion: v1
uploading serialized_model to Registry
uploading part 1

In [None]:
# Tag the version as a regression model so the system can compute appropriate
# model performance metrics
model_version.add_attributes({'model_type': "regression"})

## 3. Log reference data

Upload your reference data as an artifact in your Registered Model Version. This is your training dataset, and it facilitates downstream drift monitoring against this reference set. You don't need to upload your entire training set, only a statistically significant sample that mirrors your training/reference data distribution.

In [None]:
# Attach the training features and targets (targets are in $1M units) to the
# model version as the reference set.
model_version.log_reference_data(X_train, y_train)

## 4. Deploy model to endpoint

In [None]:
# Fetch or create the endpoint, then point it at the registered model version.
# wait=True presumably blocks until the update finishes — confirm in Verta docs.
endpoint = client.get_or_create_endpoint("melbourne-housing-data")
endpoint.update(model_version, wait=True)

## 5. Run predictions and log ground truth

In [None]:
def simulate_predictions(endpoint, deployed_model, input_data, ground_truth, col_name, ground_truth_delay):
    """Score each row of ``input_data`` on the deployed model, then log its ground truth.

    Args:
        endpoint: object exposing ``log_ground_truth(id, values, col_name)``.
        deployed_model: object exposing ``predict_with_id(rows)`` that returns
            an ``(id, prediction)`` pair.
        input_data: rows to score, either a sequence of feature rows or a
            DataFrame-like object with ``iterrows()``.
        ground_truth: true values, one per row of ``input_data``, in the same order.
        col_name: prediction column name the ground truth is logged under.
        ground_truth_delay: delay in seconds between the predictions and the
            ground truth becoming available.
    """
    import time

    # Iterate the rows that were passed in (not a notebook global), so the
    # prediction ids line up one-to-one with `ground_truth`.
    if hasattr(input_data, "iterrows"):
        rows = (row for _, row in input_data.iterrows())
    else:
        rows = input_data

    ids = []
    for row in rows:
        # Series / ndarray rows become plain lists; plain sequences are copied
        features = row.tolist() if hasattr(row, "tolist") else list(row)
        _id, _ = deployed_model.predict_with_id([features])
        ids.append(_id)

    time.sleep(ground_truth_delay)

    # zip stops at the shorter of the two sequences
    for _id, gt in zip(ids, ground_truth):
        endpoint.log_ground_truth(_id, [gt], col_name)  # id, gt, prediction_col_name

In [None]:
# Handle used to send prediction requests to the model behind the endpoint
deployed_model = endpoint.get_deployed_model()

In [None]:
# Show the deployed model handle as plain text
print(deployed_model)

In [None]:
# Pass the first two held-out rows and their true prices (in $1M units), with a
# 10-second ground-truth delay and "Price" as the prediction column name.
simulate_predictions(endpoint, deployed_model, X_test.values.tolist()[0:2], y_test.values.tolist()[0:2], "Price", 10)