In [8]:
import os, pandas as pd
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
from tensorflow_metadata.proto.v0 import anomalies_pb2
from google.protobuf import text_format



### Generate statistics (train)

In [4]:
# Training data location in GCS; the CSV uses ';' as its field separator.
TRAIN_CSV = "gs://uci-bank-ml-bucket/bank-additional/bank-additional-full.csv"

# Compute TFDV statistics directly from the CSV. `train_stats` is reused by the
# schema-inference and validation cells further down.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_CSV, delimiter=';')

# Render the statistics inline for visual inspection.
tfdv.visualize_statistics(train_stats)




Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [None]:
# Destination in GCS for the text-format copy of the training statistics.
GCS_STATS_PATH = "gs://uci-bank-ml-bucket/bank-additional/bank-additional-full_stats.stats"

# Persist the statistics, then read them straight back from the same path.
# NOTE(review): `loaded_stats` is not used by any later cell visible here —
# presumably a manual round-trip check; confirm it is still needed.
tfdv.write_stats_text(stats=train_stats, output_path=GCS_STATS_PATH)
loaded_stats = tfdv.load_stats_text(input_path=GCS_STATS_PATH)

### Infer schema

In [5]:
# Derive an initial schema from the training statistics and show it as tables
# (per-feature type/presence/valency/domain, plus the values of each domain).
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,-
'job',STRING,required,,'job'
'marital',STRING,required,,'marital'
'education',STRING,required,,'education'
'default',STRING,required,,'default'
'housing',STRING,required,,'housing'
'loan',STRING,required,,'loan'
'contact',STRING,required,,'contact'
'month',STRING,required,,'month'
'day_of_week',STRING,required,,'day_of_week'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'job',"'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown'"
'marital',"'divorced', 'married', 'single', 'unknown'"
'education',"'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown'"
'default',"'no', 'unknown', 'yes'"
'housing',"'no', 'unknown', 'yes'"
'loan',"'no', 'unknown', 'yes'"
'contact',"'cellular', 'telephone'"
'month',"'apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep'"
'day_of_week',"'fri', 'mon', 'thu', 'tue', 'wed'"
'poutcome',"'failure', 'nonexistent', 'success'"


In [6]:
# Save the inferred schema to GCS in text format.
GCS_SCHEMA_PATH = "gs://uci-bank-ml-bucket/bank-additional/bank-additional-full_schema.textproto"
tfdv.write_schema_text(schema=schema, output_path=GCS_SCHEMA_PATH)

### Validate data against schema (anomaly detection)

In [7]:
# Check the training statistics against the schema and display any anomalies.
# NOTE(review): `schema` was inferred from these same `train_stats`, so this
# check is expected to come back clean; validating statistics from a separate
# eval/serving dataset would be the meaningful test — confirm intent.
anomalies = tfdv.validate_statistics(train_stats, schema)
tfdv.display_anomalies(anomalies)


In [9]:
# Save the validation result to GCS in text format, next to the stats and schema files.
anomalies_path = "gs://uci-bank-ml-bucket/bank-additional/anomalies.pbtxt"
tfdv.write_anomalies_text(anomalies=anomalies, output_path=anomalies_path)