# Logistic Regression with Grid Search (scikit-learn)

<a href="https://colab.research.google.com/github/VertaAI/modeldb/blob/master/client/workflows/demos/census-dataset-versioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# restart your notebook if prompted on Colab
try:
    import verta
except ImportError:
    !pip install verta
    import verta

print("Using Verta version", verta.__version__)

In [2]:
# Address of the Verta instance the client connects to (placeholder; replace before running).
HOST = "XXXXX.app.verta.ai"

# Names passed to the client when the project and experiment are selected.
PROJECT_NAME = "Census Income Classification"
EXPERIMENT_NAME = "Logistic Regression"
# Workspace passed to the project/dataset/model/endpoint calls (placeholder; replace before running).
WORKSPACE = "XXXXX"

In [3]:
import os
# Export placeholder Verta credentials as environment variables for this kernel.
# NOTE(review): presumably the Verta client reads these for authentication — confirm.
# Replace the placeholders locally and avoid committing real values with the notebook.
os.environ['VERTA_EMAIL'] = 'XXXXXX@XXXX.XXX'
os.environ['VERTA_DEV_KEY'] = 'XXXXXXXXXXXXXXXXXXXXX'

In [4]:
# This is an example of creating an AWS credentials file to enable S3 dataset versioning on Colab.
# It is NOT RECOMMENDED to store secrets, such as AWS or Verta credentials, in code.
import os

AWS_ACCESS_KEY_ID = "XXXXX"
AWS_SECRET_ACCESS_KEY = "XXXXX"

aws_config_dir = os.path.expanduser("~/.aws")
if not os.path.exists(aws_config_dir):
    os.makedirs(aws_config_dir)

aws_credentials_filepath = os.path.join(aws_config_dir, "credentials")
# Only write the file when it does not exist yet, so that real credentials
# already configured on this machine are never overwritten by the placeholders.
if not os.path.exists(aws_credentials_filepath):
    # Create the file with owner-only permissions (0600) from the start: it
    # holds a secret key and should not be readable by other users.
    credentials_fd = os.open(
        aws_credentials_filepath,
        os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        0o600,
    )
    with os.fdopen(credentials_fd, 'w') as f:
        f.write('\n'.join([
            "[default]",
            "aws_access_key_id={}".format(AWS_ACCESS_KEY_ID),
            "aws_secret_access_key={}".format(AWS_SECRET_ACCESS_KEY),
            "",  # makes the file end with a newline
        ]))

## Imports

In [5]:
# install additional packages
# NOTE: you may need pip3 instead of pip

!pip install boto3
!pip install wget

In [6]:
from __future__ import print_function

import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress ConvergenceWarning and FutureWarning for the rest of the session so
# they do not clutter cell output.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from datetime import datetime
import itertools
import os
import time

import numpy as np
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

import boto3
import wget

---

# Log Workflow

## Instantiate Client

In [7]:
from verta import Client
from verta.utils import ModelAPI

# Connect to the Verta instance at HOST, then select the project and the
# experiment by name. `client` and `expt` are used by later cells.
client = Client(HOST)
proj = client.set_project(PROJECT_NAME, workspace=WORKSPACE, public_within_org=True)
expt = client.set_experiment(EXPERIMENT_NAME)

## Prepare Data

In [8]:
from verta.dataset import S3

dataset = client.set_dataset(name="Census Income", workspace=WORKSPACE, public_within_org=True)


def create_dataset_version(url):
    """Create a version of ``dataset`` from an S3 location, labelled from its name.

    The last path component of ``url`` must follow the convention
    ``<prefix>-<mmddyyyy>_<mmddyyyy>-<customername>``, for example
    ``Demo-01012021_01022021-acme``. That component becomes the version's
    description, the customer name becomes a ``customer:<name>`` tag, and the
    two dates are stored as the ``start_date`` and ``end_date`` attributes.

    Parameters
    ----------
    url : str
        S3 URL of the folder to version.

    Returns
    -------
    The object returned by ``dataset.create_version`` for the new version.

    Raises
    ------
    ValueError
        If the last path component does not split into exactly three
        ``-``-separated parts, or its date range into exactly two
        ``_``-separated parts.
    """
    desc = url.strip("/").split("/")[-1]
    _, date_range, customer_name = desc.split("-")
    tags = "customer:" + customer_name
    start_date, end_date = date_range.split("_")

    # Return the new version so callers can use it; previously it was assigned
    # to a local variable and discarded.
    return dataset.create_version(
        S3(url), desc=desc, tags=tags,
        attrs={"start_date": start_date, "end_date": end_date},
    )


# expected naming convention is s3 base path - <mmddyyyy>_<mmddyyyy> - <customername>
DATASET_URLS = [
    "s3://verta-starter/Demo-01012021_01022021-acme/",
    "s3://verta-starter/Demo-01012021_01022021-abc/",
    "s3://verta-starter/Demo-01022021_01032021-acme/",
    "s3://verta-starter/Demo-01022021_01032021-abc/",
    "s3://verta-starter/Demo-01032021_01042021-acme/",
    "s3://verta-starter/Demo-01032021_01042021-abc/",
]
for dataset_url in DATASET_URLS:
    create_dataset_version(dataset_url)

## Merge Data 

In [9]:
# get files for specific customers and date range
def get_dataset_versions_for_customer_daterange(customer, start_date, end_date):
    """Return the versions of ``dataset`` for one customer that lie inside a date range.

    Versions are first looked up by their ``customer:<customer>`` tag. A
    version is kept when its ``start_date`` attribute is on or after
    ``start_date`` and its ``end_date`` attribute is on or before
    ``end_date``; both attributes are parsed as ``mmddyyyy`` strings.

    Parameters
    ----------
    customer : str
        Customer name as used in the ``customer:<name>`` tag.
    start_date, end_date : datetime
        Inclusive bounds of the date range.

    Returns
    -------
    list
        The matching dataset versions.
    """
    date_format = "%m%d%Y"
    matching_versions = []

    for candidate in dataset.versions.find("tags == customer:" + customer):
        begins = datetime.strptime(candidate.get_attribute("start_date"), date_format)
        if begins >= start_date:
            # the end date is only read once the start date has qualified
            finishes = datetime.strptime(candidate.get_attribute("end_date"), date_format)
            if finishes <= end_date:
                matching_versions.append(candidate)

    return matching_versions


# Date range and customers whose dataset versions are merged into one version.
start_date = datetime(2021, 1, 3)
end_date   = datetime(2021, 1, 4)
customers = ["acme", "abc"]
final_tags = "customer:" + ";".join(customers)
final_filtered_versions = []
for customer in customers:
    final_filtered_versions.extend(get_dataset_versions_for_customer_daterange(customer, start_date, end_date))

# Fail early with a clear message: without any matching version the merged
# content below would stay None and be passed on to create_version().
if not final_filtered_versions:
    raise ValueError(
        "no dataset versions found for customers {} between {} and {}".format(
            customers, start_date.strftime("%m%d%Y"), end_date.strftime("%m%d%Y")
        )
    )

# merge content
final_content = None
for datasetv in final_filtered_versions:
    if final_content is None:
        final_content = datasetv.get_content()
    else:
        final_content += datasetv.get_content()
print(final_content)

# create new dataset version using merged content
final_version = dataset.create_version(
    final_content, tags=final_tags,
    attrs={"start_date": start_date.strftime("%m%d%Y"), "end_date": end_date.strftime("%m%d%Y")},
)
final_version

In [10]:
DATASET_PATH = "./"


def download_dataset(s3_url, local_path):
    """Download the object at ``s3_url`` to ``local_path`` unless that file already exists.

    Parameters
    ----------
    s3_url : str
        ``s3://`` URL; it is rewritten to an ``http://s3.amazonaws.com/`` URL
        before downloading.
    local_path : str
        Destination file. An existing file is kept and nothing is downloaded.
    """
    s3_url = s3_url.replace("s3://", "http://s3.amazonaws.com/")
    print(s3_url)
    if not os.path.isfile(local_path):
        # Pass the destination explicitly. Without `out`, wget chooses its own
        # file name in the working directory, which matches `local_path` only
        # by coincidence.
        wget.download(s3_url, out=local_path)


def _download_all(s3_urls):
    """Download every URL in ``s3_urls`` and return the local file paths, in order."""
    local_paths = []
    for s3_url in s3_urls:
        # The merged version combines several S3 folders, which may contain
        # files with the same base name. Prefixing the folder name keeps the
        # local files distinct, so an earlier download is not reused for another folder.
        folder_name = os.path.basename(os.path.dirname(s3_url))
        local_path = os.path.join(DATASET_PATH, folder_name + "-" + os.path.basename(s3_url))
        download_dataset(s3_url, local_path)
        local_paths.append(local_path)
    return local_paths


# Split the merged version's paths into train and test files by file name.
train_urls = []
test_urls = []
for s3_url in final_version.get_content().list_paths():
    if os.path.basename(s3_url).startswith("census-train"):
        train_urls.append(s3_url)
    elif os.path.basename(s3_url).startswith("census-test"):
        test_urls.append(s3_url)

# Download the two lists independently, so that an unequal number of train and
# test files cannot silently drop the unpaired ones.
local_train_paths = _download_all(train_urls)
local_test_paths = _download_all(test_urls)

# Merge train and test datasets
merged_train_csv = pd.concat([pd.read_csv(f) for f in local_train_paths])
merged_train_csv.to_csv("./merged_train.csv", index=False, encoding='utf-8-sig')

merged_test_csv = pd.concat([pd.read_csv(f) for f in local_test_paths])
merged_test_csv.to_csv("./merged_test.csv", index=False, encoding='utf-8-sig')

In [11]:
# Load the merged training CSV written by the previous cell.
df_train = pd.read_csv("./merged_train.csv")
# The last column is used as the target, all preceding columns as features.
X_train = df_train.iloc[:,:-1]
Y_train = df_train.iloc[:, -1]

df_train.head()

## Prepare Hyperparameters

In [12]:
import itertools

# Candidate values for each LogisticRegression hyperparameter.
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}

# One dict per point of the grid, i.e. per element of the Cartesian product
# of the candidate value lists.
hyperparam_sets = [
    dict(zip(hyperparam_candidates, combination))
    for combination in itertools.product(*hyperparam_candidates.values())
]

## Train Models

In [13]:
def run_experiment(hyperparams):
    """Train one logistic-regression model and log it as an experiment run.

    A new run is created on ``client``. The training data is split 80/20, the
    model is fit on the 80% portion, and its accuracy on the held-out 20% is
    logged as the ``val_acc`` metric. The hyperparameters, the model with its
    API and requirements, and the training dataset version are logged too.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments passed to ``linear_model.LogisticRegression``.
    """
    # create object to track experiment run
    run = client.set_experiment_run()

    # create validation split
    (X_val_train, X_val_test,
     Y_val_train, Y_val_test) = model_selection.train_test_split(X_train, Y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')

    # create and train model
    # Fit on the training portion of the split only. Fitting on the full
    # X_train would include the held-out rows and inflate the validation
    # accuracy computed below.
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, Y_val_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val_test, Y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))

    # create deployment artifacts
    model_api = ModelAPI(X_train, model.predict(X_train))
    requirements = ["scikit-learn"]

    # save and log model
    run.log_model(model, model_api=model_api, custom_modules=[])
    run.log_requirements(requirements)

    # log training data
    run.log_dataset_version("train", final_version)


# NOTE: run_experiment() could also be defined in a module, and executed in parallel
for hyperparams in hyperparam_sets:
    run_experiment(hyperparams)

# Revisit Workflow
This section demonstrates querying and retrieving runs via the Client.

## Retrieve Best Run

In [14]:
# Sort the experiment's runs by the logged "val_acc" metric in descending
# order and take the first one.
best_run = expt.expt_runs.sort("metrics.val_acc", descending=True)[0]
print("Validation Accuracy: {:.4f}".format(best_run.get_metric("val_acc")))

# Hyperparameters that were logged for that run.
best_hyperparams = best_run.get_hyperparameters()
print("Hyperparameters: {}".format(best_hyperparams))

## Staging
The best-performing model can be staged as a registered model, for use downstream.

In [15]:
# Look up the registered model named "Census" in WORKSPACE.
# NOTE(review): per the method name, it is created when missing — confirm against the Verta client docs.
registered_model = client.get_or_create_registered_model(name="Census", workspace=WORKSPACE, public_within_org=True)

In [16]:
# Register the best run as model version "v0".
# NOTE(review): the version name is fixed, so re-running this cell presumably fails once "v0" exists — confirm.
registered_model.create_version_from_run(best_run.id, name="v0")

## Deploy models
This registered model version can be deployed to an endpoint, whereupon predictions can be made via a REST endpoint or through the client.

In [17]:
# Fetch the registered model and its version "v0" again by name.
registered_model = client.get_registered_model(name="Census", workspace=WORKSPACE)
model_version = registered_model.get_version(name="v0")
print(model_version)

## Create and update an endpoint

In [18]:
# Look up the endpoint at path "/Census" in WORKSPACE.
# NOTE(review): per the method name, it is created when missing — confirm against the Verta client docs.
endpoint = client.get_or_create_endpoint(path="/Census", workspace=WORKSPACE, public_within_org=True)

In [19]:
# Point the endpoint at the registered model version.
# NOTE(review): wait=True presumably blocks until the update has finished — confirm.
endpoint.update(model_version, wait=True)

## Prepare live data

In [20]:
# Load the merged test CSV and keep every column except the last as features,
# mirroring how X_train was built from the training CSV.
df_test = pd.read_csv("./merged_test.csv")
X_test = df_test.iloc[:,:-1]

## Query deployed model

In [21]:
# Display the model version that was deployed to the endpoint.
model_version

In [22]:
# Obtain a handle for sending prediction requests to the endpoint.
deployed_model = endpoint.get_deployed_model()

# Send the test rows one at a time, pausing half a second between requests.
# itertools.cycle repeats the rows endlessly, so for a non-empty X_test this
# loop never finishes on its own: interrupt the kernel to stop it.
for x in itertools.cycle(X_test.values.tolist()):
    print(deployed_model.predict([x]))
    time.sleep(.5)

---