## Initialize Verta's client

In [1]:
from verta import Client
from verta.utils import ModelAPI
client = Client("https://demo.dev.verta.ai")

set email from environment
set developer key from environment
connection successfully established


## Download dataset

Load the latest version of the dataset.

In [2]:
dataset_version = client.set_dataset("Webinar 3 - Census", workspace="Demos").get_latest_version()

set existing Dataset: Webinar 3 - Census from workspace: Demos


Download just the ones we'll need this time.

In [3]:
import os
import wget

files = {file.path: file for file in dataset_version.list_components()}

desired_files = ["census-train.csv", "census-test.csv"]

for filename in desired_files:
    file = files[filename]
    if not os.path.exists(filename):
        # This example isn't using Verta's managed datasets, so it's much harder to know from where to download it!
        url = "http://s3.amazonaws.com/" + file.base_path + "/" + file.path
        wget.download(url)

Prepare the dataset locations and content.

In [4]:
DATASET_PATH = "./"
train_data_filename = DATASET_PATH + "census-train.csv"
test_data_filename = DATASET_PATH + "census-test.csv"

In [5]:
import pandas as pd

df_train = pd.read_csv(train_data_filename)
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:, -1]

df_train.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,>50k
0,44,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,53,7298,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,49,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,53,0,1485,40,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


## Train a few models

Prepare hyperparameters.

In [6]:
import itertools

hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

Train and log along the way.

In [7]:
import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

client.set_project("Webinar 3 - Census")
client.set_experiment("Logistic Regression")

def run_experiment(hyperparams):
    # create object to track experiment run
    run = client.set_experiment_run()
    
    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')
    
    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)
    
    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))
    
    # create deployment artifacts
    model_api = ModelAPI(X_train, y_train)
    requirements = ["scikit-learn"]
    
    # save and log model
    run.log_model(model, model_api=model_api)
    run.log_requirements(requirements)
    
    # log dataset snapshot as version
    run.log_dataset_version("train", dataset_version)
    
    # log reference data for monitoring
    run.log_training_data(X_train, y_train)
    
    # log Git information as code version
    run.log_code()
    
for hyperparams in hyperparam_sets:
    run_experiment(hyperparams)

got existing Project: Webinar 3 - Census
got existing Experiment: Logistic Regression
created new ExperimentRun: Run 535941603465370816938
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 15} 



Validation accuracy: 0.7913
uploading part 1
upload complete (custom_modules)
uploading part 1
upload complete (model.pkl)
uploading part 1
upload complete (model_api.json)
uploading part 1
upload complete (requirements.txt)
Git repository successfully located at /Users/conrado/workspace/modeldb/
created new ExperimentRun: Run 535941603465403861834
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 28} 



Validation accuracy: 0.7891
uploading part 1
upload complete (custom_modules)
uploading part 1
upload complete (model.pkl)
uploading part 1
upload complete (model_api.json)
uploading part 1
upload complete (requirements.txt)
Git repository successfully located at /Users/conrado/workspace/modeldb/
created new ExperimentRun: Run 5359416034654272444081
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 15} 



Validation accuracy: 0.7964
uploading part 1
upload complete (custom_modules)
uploading part 1
upload complete (model.pkl)
uploading part 1
upload complete (model_api.json)
uploading part 1
upload complete (requirements.txt)
Git repository successfully located at /Users/conrado/workspace/modeldb/
created new ExperimentRun: Run 535941603465456957668
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 28} 



Validation accuracy: 0.7915
uploading part 1
upload complete (custom_modules)
uploading part 1
upload complete (model.pkl)
uploading part 1
upload complete (model_api.json)
uploading part 1
upload complete (requirements.txt)
Git repository successfully located at /Users/conrado/workspace/modeldb/


## Run predictions against the model

In [6]:
endpoint = client.get_endpoint(path="/d409eec2-74c6-44ac-ae20-f1b041b3b06e")
deployed_model = endpoint.get_deployed_model()

Run the test dataset against the model

In [13]:
import time
import itertools

df_test = pd.read_csv(test_data_filename)
X_test = df_test.iloc[:,:-1]

for x in itertools.cycle(X_test.values.tolist()):
    print(deployed_model.predict([x]))
    time.sleep(.2)

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]


KeyboardInterrupt: 

In [15]:
X_test_pred = X_test.copy()
X_test_pred['prediction'] = deployed_model.batch_predict(X_test_pred)
X_test_pred.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,prediction
0,10.991875,0.0,0.0,40.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,4.235904,0.0,0.0,50.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,35.806474,0.0,0.0,43.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,65.316701,0.0,0.0,50.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,87.053437,0.0,0.0,50.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
