# Logistic Regression (scikit-learn) Experiment Versioning & Registry

<a href="https://colab.research.google.com/github/VertaAI/modeldb/blob/master/client/workflows/demos/census-experiment-versioning-registry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic Verta Setup

In [1]:
# Install the verta client only when it cannot already be imported.
# restart your notebook if prompted on Colab
try:
    import verta
except ImportError:
    !pip install verta

In [2]:
#!pip install -e ~/modeldb/client/verta

This example features:
- **scikit-learn**'s `LogisticRegression` model
- **verta** model versioning and experiment tracking
- **verta** model staging and registry

In [3]:
# Connection and naming settings consumed by the "Instantiate client" cell.
HOST = "demo.dev.verta.ai"  # passed to Client(...)
PROJECT_NAME = "Census Income Classification"  # passed to client.set_project(...)
EXPERIMENT_NAME = "Logistic Regression"  # passed to client.set_experiment(...)
#WORKSPACE = "stage-testing"

In [4]:
import os

# Credentials for the Verta backend are supplied through environment variables.
# setdefault() only fills in the values below when the variable is not already
# set, so credentials exported in the shell (or by a secrets manager) are not
# overwritten by the 'XXXXXX' placeholder. Do not commit a real dev key here.
# NOTE(review): if a stale value is already set in the kernel's environment,
# editing the literals below will not replace it -- unset the variable first.
os.environ.setdefault('VERTA_EMAIL', 'meeta@verta.ai')
os.environ.setdefault('VERTA_DEV_KEY', 'XXXXXX')

In [5]:
from __future__ import print_function

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import os
import time

import six

import numpy as np
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

In [6]:
# wget is used below to fetch the census CSV files; install it if the import fails,
# then import it again so the name is available in this kernel.
try:
    import wget
except ImportError:
    !pip install wget  # you may need pip3
    import wget

---

# Phase 1: Model Development

This section demonstrates logging model metadata and training artifacts to ModelDB.

## Instantiate client

In [7]:
from verta import Client
from verta.utils import ModelAPI

# Create the client for the backend at HOST.
client = Client(HOST)
# Set the project and experiment by name; `expt` is queried later for its runs.
proj = client.set_project(PROJECT_NAME, public_within_org=True)
expt = client.set_experiment(EXPERIMENT_NAME)

## Prepare data

In [8]:
def _download_if_missing(url):
    """Fetch `url` with wget unless a file with the detected name already exists.

    Returns the filename wget derives from the URL, which is also the path
    checked with os.path.isfile() before downloading.
    """
    filename = wget.detect_filename(url)
    if not os.path.isfile(filename):
        wget.download(url)
    return filename


train_data_url = "http://s3.amazonaws.com/verta-starter/census-train.csv"
train_data_filename = _download_if_missing(train_data_url)

test_data_url = "http://s3.amazonaws.com/verta-starter/census-test.csv"
test_data_filename = _download_if_missing(test_data_url)

In [9]:
from verta.dataset import Path

# Set the named dataset, then create a version of it from the local training CSV.
# `dataset_version` is attached to every run inside run_experiment().
dataset = client.set_dataset(name="Census Income Local-new", public_within_org=True)
dataset_version = dataset.create_version(Path(train_data_filename))

In [10]:
df_train = pd.read_csv(train_data_filename)
# Features are every column except the last; the last column is the target.
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:, -1]

# Show the first rows as the cell output.
df_train.head()

## Prepare hyperparameters

In [11]:
# Candidate values for each LogisticRegression keyword argument.
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [115, 228],
}

# Expand the candidates into one dict per combination (Cartesian product),
# keeping the key order of `hyperparam_candidates`.
hyperparam_names = list(hyperparam_candidates.keys())
hyperparam_sets = []
for combination in itertools.product(*(hyperparam_candidates[name]
                                       for name in hyperparam_names)):
    hyperparam_sets.append(dict(zip(hyperparam_names, combination)))

## Train models

In [12]:
def _log_sample_attributes(run):
    """Log one hard-coded sample of each verta.data_types value as a run attribute."""
    # Imported here so the cell stays self-contained; Line is needed only once.
    from verta.data_types import (
        ConfusionMatrix,
        DiscreteHistogram,
        FloatHistogram,
        Line,
        Matrix,
        Table,
    )

    # log confusion matrix
    data = ConfusionMatrix(
        value=[
            [650000, 100000],
            [24000, 3330000],
        ],
        labels=["spam", "not spam"],
    )
    run.log_attribute("Spam_Confusion_Matrix", data)

    # log discrete histogram
    data = DiscreteHistogram(
        buckets=["yes", "no", "dont know"],
        data=[1100, 22200, 15000],
    )
    run.log_attribute("Response_Histogram", data)

    # log float histogram (8 bucket limits delimit the 7 counts)
    data = FloatHistogram(
        bucket_limits=[1, 13, 25, 37, 49, 61, 72, 89],
        data=[15, 53, 91, 34, 7, 17, 27],
    )
    run.log_attribute("Age_Histogram", data)

    # log line chart
    data = Line(
        x=[1, 2, 3, 17, 18, 24, 33, 44, 58, 67],
        y=[1, 4, 9, 90, 45, 34, 34, 78, 14, 45],
    )
    run.log_attribute("Price_Over_Time", data)

    # log line chart built from (x, y) tuples
    data = Line.from_tuples(
        [(1, 1), (2, 4), (3, 9), (7, 10), (40, 10), (500, 20)],
    )
    run.log_attribute("Custom_Price_Chart", data)

    # log matrix
    data = Matrix([
        [1000, 200, 35],
        [4400, 550, 60],
        [7890, 85, 9000],
    ])
    run.log_attribute("Confusion_Matrix", data)

    # log table
    data = Table(
        data=[[1, 24, "blue"], [2, 36, "red"]],
        columns=["id", "height", "color"],
    )
    run.log_attribute("Measurements", data)


def run_experiment(hyperparams):
    """Train one logistic regression model and log it as a new experiment run.

    Reads the notebook-level names `client`, `X_train`, `y_train` and
    `dataset_version`.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments for `linear_model.LogisticRegression`; also logged
        to the run as its hyperparameters.

    Returns
    -------
    None
        Results are logged to the run: attributes, hyperparameters, the
        `val_acc` metric, the model and its requirements, the dataset version
        and the code reference.
    """
    # create object to track experiment run
    run = client.set_experiment_run()

    # log attributes
    run.log_attributes({
        'library': "scikit-learn",
        'model_type': "logistic regression",
    })
    _log_sample_attributes(run)

    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')

    # create and train model
    # Fit on the training portion of the split only: X_val_test is drawn from
    # X_train, so fitting on all of X_train would score the model on rows it
    # has already seen.
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))

    # create deployment artifacts
    model_api = ModelAPI(X_train, model.predict(X_train))
    requirements = ["scikit-learn"]

    # save and log model
    run.log_model(model, model_api=model_api, custom_modules=[])
    run.log_requirements(requirements)

    # log training data
    run.log_dataset_version("census_data", dataset_version)  # log dataset metadata

    # log git information
    run.log_code(
        repo_url="git@github.com:VertaAI/modeldb.git",
        commit_hash="d412a0d9",
        autocapture=False,
    )


# NOTE: run_experiment() could also be defined in a module, and executed in parallel
for hyperparams in hyperparam_sets:
    run_experiment(hyperparams)

---

# Revisit Workflow

This section demonstrates querying and retrieving runs via the Client.

## Retrieve best run

In [13]:
# Order the experiment's runs by their logged "val_acc" metric, highest first,
# and keep the top one.
runs_by_val_acc = expt.expt_runs.sort("metrics.val_acc", descending=True)
best_run = runs_by_val_acc[0]
best_val_acc = best_run.get_metric("val_acc")
print("Validation Accuracy: {:.4f}".format(best_val_acc))

# These hyperparameters are reused to refit the model in the next section.
best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

## Train on full dataset

In [14]:
# Refit with the best run's hyperparameters on every training row.
model = linear_model.LogisticRegression(multi_class='auto', **best_hyperparams)
model.fit(X_train, y_train)

## Calculate accuracy on full training set

In [15]:
# Accuracy on the same rows the model was fitted on, so this is not a held-out estimate.
train_acc = model.score(X_train, y_train)
print("Training accuracy: {:.4f}".format(train_acc))

---

# Phase 2: Staging

## Register the best-performing model, for use downstream

In [16]:
# Get (or create) the registered model by name, then create version "v3" from the best run.
# NOTE(review): the version name is fixed, so re-running this cell presumably
# conflicts with the existing "v3" -- confirm and bump the name if needed.
registered_model = client.get_or_create_registered_model(name="Regsitry-demo-model-1", public_within_org=True)
model_version = registered_model.create_version_from_run(best_run.id, name="v3")

Fetch the latest model version ready for staging

In [17]:
# Sort the registered model's versions by "time_updated", newest first, and take the first.
versions_by_update_time = registered_model.versions.sort("time_updated", descending=True)
latest_model_version = versions_by_update_time[0]
print(latest_model_version)

Get artifacts of the specific version

In [18]:
# Alternative ways to pick a version, kept for reference:
#dev_model_version = registered_model.versions.sort("time_updated", descending=True)[0]
#dev_model_version = registered_model.versions.find("stage == development").sort("time_created")
#dev_model_version = registered_model.versions.find("stage == Staging")[0]
#dev_model_version = registered_model.versions.find("stage == Archived")[0]

# The stage filter can match nothing (for example, before any version has been
# moved to production), and indexing an empty result would abort Run All.
# NOTE(review): assumes the result raises IndexError on an out-of-range index
# like a regular sequence -- confirm against the verta client.
production_versions = registered_model.versions.find("stage == production")
try:
    dev_model_version = production_versions[0]
except IndexError:
    dev_model_version = None

if dev_model_version is None:
    print("No model version is in the production stage yet.")
else:
    print(dev_model_version)

In [19]:
# Fetch and print the artifact keys of the most recently updated model version.
model_artifact = latest_model_version.get_artifact_keys()
print(model_artifact)