In [1]:
from verta._internal_utils._utils import proto_to_json
from verta.monitoring.profiler import ContinuousHistogramProfiler, BinaryHistogramProfiler
from verta.data_types import _VertaDataType
from verta.monitoring import profiler

from verta._protos.public.monitoring.Summary_pb2 import CreateSummarySample
from verta._protos.public.monitoring.DeploymentIntegration_pb2 import FeatureDataInModelVersion

In [2]:
import pandas as pd
import numpy as np
import scipy as sp

In [3]:
import json
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path

import verta

from verta.data_types import (
    DiscreteHistogram,
    FloatHistogram,
    NumericValue,
)

from verta.monitoring.profiler import (
    MissingValuesProfiler,
    BinaryHistogramProfiler,
    ContinuousHistogramProfiler,
)

In [4]:
'''
Notes:
- Requirement: the object passed to `profile` must be the same input that `predict` accepts.
- Aside: missing values could be encoded as numeric values.
- Histogram classes are not JSON-serializable, so it is unclear what "contents" should hold.
'''

In [5]:
# Load the census training data. The location is resolved from the current
# user's home directory instead of one developer's absolute path, so the cell
# also works for other users who keep the file in ~/Downloads.
CENSUS_TRAIN_CSV = Path.home() / "Downloads" / "census-train.csv"
df = pd.read_csv(CENSUS_TRAIN_CSV)

In [6]:
def add_time_attrs_to_feature_data(feature_data):
    """Stamp ``feature_data`` with the current wall-clock time in milliseconds.

    One timestamp is taken and written to ``created_at_millis``,
    ``time_window_start_at_millis`` and ``time_window_end_at_millis``.
    The object is modified in place and nothing is returned.
    """
    now_ms = int(time.time() * 1000)
    for attr_name in (
        "created_at_millis",
        "time_window_start_at_millis",
        "time_window_end_at_millis",
    ):
        setattr(feature_data, attr_name, now_ms)
    
def add_labels_to_feature_data(feature_data, labels):
    """Copy every key/value pair of ``labels`` into ``feature_data.labels``.

    Entries are assigned one by one, so an existing entry with the same key
    is overwritten. The update happens in place; nothing is returned.
    """
    for label_name, label_value in labels.items():
        feature_data.labels[label_name] = label_value

def create_missing_value_summary(df, col, labels):
    """Build a ``FeatureDataInModelVersion`` describing missing values in ``col``.

    Args:
        df: frame handed to ``MissingValuesProfiler.profile``.
        col: name of the column to profile; also used as the feature name.
        labels: mapping copied into the message's ``labels`` field.

    Returns:
        The populated ``FeatureDataInModelVersion``. ``content`` holds the JSON
        form of a ``DiscreteHistogram`` built from the first profiled
        histogram; it is left unset if the profiler returns nothing.
    """
    feature_data = FeatureDataInModelVersion()
    feature_data.feature_name = col
    # Record the profiler that is actually run below. This previously said
    # "ContinuousHistogramProfiler", copied from the continuous-histogram helper.
    feature_data.profiler_name = "MissingValuesProfiler"
    feature_data.summary_name = col + "--" + "MissingValues"
    feature_data.summary_type_name = "DiscreteHistogram"

    sample = MissingValuesProfiler(columns=[col]).profile(df)
    # Only one column was requested, so only the first entry is used.
    histogram = next(iter(sample.values()), None)
    if histogram is not None:
        feature_data.content = json.dumps(
            DiscreteHistogram(
                buckets=histogram._buckets,
                data=histogram._data,
            )._as_dict()
        )

    add_time_attrs_to_feature_data(feature_data)
    add_labels_to_feature_data(feature_data, labels)
    return feature_data

def create_continuous_histogram_summary(df, col, labels):
    """Build a ``FeatureDataInModelVersion`` holding a float histogram of ``col``.

    The column is profiled with ``ContinuousHistogramProfiler``. The first
    histogram in the result is re-wrapped as a ``FloatHistogram`` and stored as
    JSON in ``content``; its bucket limits are stored as JSON under ``"bins"``
    in ``profiler_parameters``. Timestamps and ``labels`` are then attached.

    Returns:
        The populated ``FeatureDataInModelVersion`` message.
    """
    summary = FeatureDataInModelVersion()
    summary.feature_name = col
    summary.profiler_name = "ContinuousHistogramProfiler"
    summary.summary_name = col + "--" + "Distribution"
    summary.summary_type_name = "FloatHistogram"

    profiled = ContinuousHistogramProfiler(columns=[col]).profile(df)
    histograms = list(profiled.values())
    if histograms:
        # Only the first histogram is used; later entries are ignored.
        first = histograms[0]
        float_histogram = FloatHistogram(
            bucket_limits=first._bucket_limits,
            data=first._data,
        )
        summary.content = json.dumps(float_histogram._as_dict())
        summary.profiler_parameters = json.dumps({"bins": first._bucket_limits})

    add_time_attrs_to_feature_data(summary)
    add_labels_to_feature_data(summary, labels)
    return summary

def create_discrete_histogram_summary(df, col, labels):
    """Build a ``FeatureDataInModelVersion`` holding a discrete histogram of ``col``.

    The column is profiled with ``BinaryHistogramProfiler``. The first
    histogram in the result is re-wrapped as a ``DiscreteHistogram`` and stored
    as JSON in ``content``; its buckets are stored as JSON under ``"bins"`` in
    ``profiler_parameters``. Timestamps and ``labels`` are then attached.

    Returns:
        The populated ``FeatureDataInModelVersion`` message.
    """
    summary = FeatureDataInModelVersion()
    summary.feature_name = col
    summary.profiler_name = "BinaryHistogramProfiler"
    summary.summary_name = col + "--" + "Distribution"
    summary.summary_type_name = "DiscreteHistogram"

    profiled = BinaryHistogramProfiler(columns=[col]).profile(df)
    histograms = list(profiled.values())
    if histograms:
        # Only the first histogram is used; later entries are ignored.
        first = histograms[0]
        discrete_histogram = DiscreteHistogram(
            buckets=first._buckets,
            data=first._data,
        )
        summary.content = json.dumps(discrete_histogram._as_dict())
        summary.profiler_parameters = json.dumps({"bins": first._buckets})

    add_time_attrs_to_feature_data(summary)
    add_labels_to_feature_data(summary, labels)
    return summary

def get_metadata_for_df(df):
    """Summarise each column of ``df``.

    Returns:
        A dict keyed by column name. Each value is a dict with
        ``"num_unique"`` (number of entries in the column's ``value_counts()``)
        and ``"type"`` (the column dtype rendered as a string).
    """
    return {
        column: {
            "num_unique": df[column].value_counts().size,
            "type": str(df[column].dtypes),
        }
        for column in df
    }

def profile_training_data(in_df, out_df, max_discrete_values=20):
    """Profile the input and output frames of a training set.

    For every ``float64``/``int64`` column a missing-value summary is created,
    followed by a continuous histogram when the column has more than
    ``max_discrete_values`` distinct values, or a discrete histogram otherwise.
    Columns of any other dtype are skipped.

    Args:
        in_df: frame of model input columns; summaries are labelled
            ``{"col_type": "input"}``.
        out_df: frame of model output columns; summaries are labelled
            ``{"col_type": "output"}``.
        max_discrete_values: largest distinct-value count still treated as
            discrete. Defaults to 20, the previously hard-coded threshold.

    Returns:
        A list of feature-data messages: all input-column summaries first,
        then all output-column summaries.
    """
    feature_data_list = []

    # we assume no overlap in names of input and output cols
    labelled_frames = (
        (in_df, {"col_type": "input"}),
        (out_df, {"col_type": "output"}),
    )
    for frame, labels in labelled_frames:
        # get metadata; currently getting extra info than necessary. May change later
        metadata = get_metadata_for_df(frame)
        for column, column_meta in metadata.items():
            # ignore unsupported types. currently only numeric
            if column_meta["type"] not in ("float64", "int64"):
                continue

            # Profile the frame that was passed in. The earlier version read the
            # notebook-level `df` here and silently ignored both arguments.
            feature_data_list.append(
                create_missing_value_summary(frame, column, labels)
            )
            if column_meta["num_unique"] > max_discrete_values:
                feature_data_list.append(
                    create_continuous_histogram_summary(frame, column, labels)
                )
            else:
                feature_data_list.append(
                    create_discrete_histogram_summary(frame, column, labels)
                )
    return feature_data_list

def log_feature_data_and_vis_attributes(feature_data_list, rmv, er):
    """Attach every feature-data message to ``rmv`` as attributes.

    Two attributes are added per message:

    * ``__verta_feature_data_<idx>``: the message converted with ``proto_to_json``.
    * ``__verta_tdp_<summary_name>`` cut to 50 characters: the parsed JSON
      ``content`` of the message.

    Args:
        feature_data_list: sequence of messages exposing ``summary_name`` and
            ``content``.
        rmv: object exposing ``add_attribute(key, value)``.
        er: currently unused; kept so existing callers do not break.
    """
    for idx, feature_data in enumerate(feature_data_list):
        # Use the `proto_to_json` imported in the first cell: the `_utils`
        # module alias is only imported further down, so the previous
        # `_utils.proto_to_json` raised NameError on a fresh kernel.
        rmv.add_attribute(
            "__verta_feature_data_" + str(idx),
            proto_to_json(feature_data, False),
        )
        # NOTE(review): the key is cut to 50 characters, so long summary names
        # sharing a prefix would map to the same key. Confirm this is acceptable.
        rmv.add_attribute(
            ("__verta_tdp_" + feature_data.summary_name)[:50],
            json.loads(feature_data.content),
        )

In [7]:
# Profile the census data: every column except ">50k" is treated as model
# input, and ">50k" alone (as a one-column frame) as model output.
profile = profile_training_data(
    df.drop(columns=[">50k"]),
    df[[">50k"]],
)

In [8]:
# Display the list of feature-data messages returned by profile_training_data.
profile

In [9]:
# Attach the profiled summaries to the registered model version.
# NOTE(review): `registered_model_version` and `run` are first assigned in later
# cells (In [13] and In [16]), so this cell fails on a fresh top-to-bottom run.
log_feature_data_and_vis_attributes(profile, registered_model_version, run)

In [10]:
from verta.registry.entities import RegisteredModel

In [11]:
# Create the Verta client, passing the host "demo.dev.verta.ai".
client = verta.Client("demo.dev.verta.ai")

In [12]:
# Create a registered model called "test_model3" through the client.
registered_model = client.create_registered_model("test_model3")

In [13]:
# Create a version called "dummy model8" under the registered model.
registered_model_version = registered_model.create_version("dummy model8")

In [14]:
# Attach the first feature-data message, converted to JSON, to the model version.
# `proto_to_json` is the function imported in the first cell; the `_utils`
# module alias used here before is not imported until a later cell.
registered_model_version.add_attributes(
    {"__verta_feature_data_0": proto_to_json(profile[0], False)}
)

In [15]:
# Show the JSON string stored in the first summary's `content` field.
profile[0].content

In [16]:
# Set up a project, an experiment and a new experiment run for the logging tests below.
proj = client.get_or_create_project("monitoring_testing")
expt = client.get_or_create_experiment("test")
run = client.create_experiment_run()

In [17]:
# Log the first summary's raw JSON content string as a run attribute.
# NOTE(review): the key ends in "_1" but the value comes from profile[0]; confirm this is intended.
run.log_attributes({"__verta_feature_data_1" : profile[0].content})

In [18]:
# Display the experiment run after the attribute was logged.
run

In [19]:
# Rebuild a DiscreteHistogram from the first summary's JSON content and log it.
# `profile[0]` is a FeatureDataInModelVersion message, so the payload is read
# through its `.content` attribute (as every other cell does), not by subscript.
# NOTE(review): In [46] reads this JSON as {"discreteHistogram": {"buckets": ...}};
# confirm the constructor accepts the top-level keys as keyword arguments.
run.log_attribute("test2", DiscreteHistogram(**json.loads(profile[0].content)))

In [20]:
# NOTE(review): `log_training_data_profile` is not defined or imported anywhere
# in the visible notebook; this cell raises NameError unless it comes from elsewhere.
log_training_data_profile()

In [21]:
from verta._internal_utils import _utils

# Preview the JSON form of the first feature-data message. The stray trailing
# comma was dropped so the cell shows the converted value, not a 1-tuple around it.
_utils.proto_to_json(profile[0], False)

In [22]:
# NOTE(review): `feature_data_to_attribute_for_vis` is not defined or imported
# anywhere in the visible notebook; this cell raises NameError unless it comes from elsewhere.
feature_data_to_attribute_for_vis(profile[0])

In [23]:
# Log the parsed JSON content of the second summary as run attribute "test6".
run.log_attribute("test6", json.loads(profile[1].content))

In [24]:
# Display the experiment run again after logging "test6".
run

In [25]:
# Keep a handle on the project's `expt_runs` collection for inspection.
ers = proj.expt_runs

In [26]:
# Display the entry at index 1 of the project's experiment runs.
ers[1]

In [27]:
import numpy as np
import uuid


# Synthetic 100-row columns, one per kind of data the profilers may encounter.
# No random seed is set, so the values differ on every run.
cont_col = np.random.random(size=100)
discrete_col = np.random.choice(5, size=100)
strs = list("abcde")
# Strings drawn from the five letters above.
string_discrete_col = [strs[idx] for idx in np.random.choice(5, size=100)]
# Ten-character upper-case hex strings, effectively unique per row.
string_free_form = [uuid.uuid4().hex.upper()[:10] for _ in range(100)]
# Timestamps stand in for a column type the profilers do not handle.
unsupported_col = [datetime.now() for _ in range(100)]

In [28]:
# Quick check of what np.random.choice(5, 10) returns.
np.random.choice(5, 10)

In [29]:
# Assemble the synthetic columns row-wise into a frame with default integer column labels.
df2 = pd.DataFrame(
    list(
        zip(
            cont_col,
            discrete_col,
            string_discrete_col,
            string_free_form,
            unsupported_col,
        )
    )
)

In [30]:
# Display the synthetic frame.
df2

In [31]:
# Check which dtype pandas inferred for each synthetic column.
df2.dtypes

In [32]:
# Show pandas' describe() summary for the synthetic frame.
df2.describe()

In [33]:
import wget, os
# Location of the spam training CSV.
train_data_url = "http://s3.amazonaws.com/verta-starter/spam.csv"
# Filename that wget derives from the URL.
train_data_filename = wget.detect_filename(train_data_url)
# Download only when no file exists at that path, so re-running the cell does not fetch it again.
if not os.path.isfile(train_data_filename):
    wget.download(train_data_url)

In [34]:
# Read the downloaded CSV as comma-delimited text, decoding it as latin-1.
raw_data = pd.read_csv(train_data_filename, delimiter=',', encoding='latin-1')

In [35]:
# Display the raw frame read from the downloaded CSV.
raw_data

In [36]:
# Re-check the dtypes of the synthetic frame (same expression as In [31]).
df2.dtypes

In [37]:
[str.isalnum(x) for x in ">50k"]

In [38]:
"".join([x for x in name if str.isalnum(x)])

In [39]:
name = ">50_k"

In [40]:
"".join([x for x in name if (str.isalnum(x) or x == '_')])

In [41]:
# Build and display a frame from the synthetic columns, zipped row-wise.
pd.DataFrame(
            list(zip(cont_col, discrete_col, string_discrete_col, string_free_form, unsupported_col))
        )

In [42]:
# Build a two-column frame ("Name", "val") from two parallel sequences.
# NOTE(review): `lst` and `lst2` are not defined anywhere in the visible notebook,
# and this rebinds `df`, which previously held the census data.
df = pd.DataFrame(list(zip(lst, lst2)),
               columns =['Name', 'val'])

In [43]:
# Synthetic 100-row columns covering each kind of data plus a binary output.
# No random seed is set, so the values differ on every run.
cont_col = np.random.random(size=100)
discrete_col = np.random.choice(5, size=100)
strs = list("abcde")
string_discrete_col = [strs[idx] for idx in np.random.choice(5, size=100)]
string_freeform_col = [uuid.uuid4().hex.upper()[:10] for _ in range(100)]
other_col = [datetime.now() for _ in range(100)]
output_col = np.random.choice(2, size=100)

# Column labels, in the same order as the columns are zipped below.
col_names = [
    'Continuous_Numeric',
    'Discrete_Numeric',
    'Discrete_String',
    "Freeform_String",
    "Other",
    "Output_Col",
]
supported_col_names = [
    'Continuous_Numeric',
    'Discrete_Numeric',
    "Output_Col",
]

# create dataframes
df = pd.DataFrame(
    list(
        zip(
            cont_col,
            discrete_col,
            string_discrete_col,
            string_freeform_col,
            other_col,
            output_col,
        )
    ),
    columns=col_names,
)
print(df.columns)

   

In [44]:
# Display every column except "Output_Col" (the input side of the synthetic data).
df.loc[:, df.columns != "Output_Col"]

In [45]:
# Display the (inputs, outputs) pair in the form profile_training_data takes its two arguments.
df.loc[:, df.columns != "Output_Col"], pd.DataFrame(df["Output_Col"])

In [46]:
# Pull the bucket list out of the first summary's JSON content.
json.loads(profile[0].content)["discreteHistogram"]["buckets"]

In [47]:
# Display the first feature-data message in full.
profile[0]

In [48]:
# Display a one-column frame built from a plain list.
pd.DataFrame([1,2,3])