In [30]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv

from tensorflow_metadata.proto.v0 import schema_pb2

# Report the library versions this notebook was run with (one per line).
print(
    'TFDV Version: {}'.format(tfdv.__version__),
    'Tensorflow Version: {}'.format(tf.__version__),
    sep='\n',
)

TFDV Version: 1.14.0
Tensorflow Version: 2.15.0


In [20]:
# Load the cleaned train and test splits from the local data/ directory.
train_df = pd.read_csv(filepath_or_buffer="data/train_cleaned.csv")
test_df = pd.read_csv(filepath_or_buffer="data/test_cleaned.csv")

In [21]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [22]:
# Generate statistics from the training data
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=train_df)

In [23]:
# Visualize the training statistics generated by TFDV
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [24]:
# Infer a schema from the training statistics; it is the reference that
# later cells validate against.
schema = tfdv.infer_schema(train_stats)

# Render the inferred schema (features and their domains)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,-
'workclass',STRING,required,,'workclass'
'fnlwgt',INT,required,,-
'education',STRING,required,,'education'
'education_num',INT,required,,-
'marital_status',STRING,required,,'marital_status'
'occupation',STRING,required,,'occupation'
'relationship',STRING,required,,'relationship'
'race',STRING,required,,'race'
'sex',STRING,required,,'sex'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'marital_status',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'race',"'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'"
'sex',"'Female', 'Male'"
'native_country',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'income',"' <=50K', ' >50K'"


In [25]:
# Generate statistics for the test data
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_df)

In [26]:
# Visualize the test statistics on their own
tfdv.visualize_statistics(lhs_statistics=test_stats)

In [27]:
# Compare the test (left-hand side) and train (right-hand side) statistics
# in a single view.
tfdv.visualize_statistics(
    lhs_statistics=test_stats,
    lhs_name="TEST_DATASET",
    rhs_statistics=train_stats,
    rhs_name="TRAIN_DATASET",
)

From this view, the two datasets appear to come from the same distribution, so there is nothing to address here. Next, let's calculate and visualize any anomalies in our evaluation set so we can fix them.

In [28]:
# Check the test statistics against the reference schema inferred from the
# training data.
anomalies = tfdv.validate_statistics(test_stats, schema)

In [29]:
# Show any anomalies found in the test statistics.
tfdv.display_anomalies(anomalies=anomalies)

In [32]:
# Restrict the 'age' feature to the range we know to be valid (17 to 90).
# NOTE: this updates `schema` in place, so the schema displayed earlier in
# the notebook no longer reflects the current 'age' domain.
tfdv.set_domain(
    schema,
    'age',
    schema_pb2.IntDomain(name='age', min=17, max=90),
)

# Display the schema with the updated 'age' domain
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,min: 17; max: 90
'workclass',STRING,required,,'workclass'
'fnlwgt',INT,required,,-
'education',STRING,required,,'education'
'education_num',INT,required,,-
'marital_status',STRING,required,,'marital_status'
'occupation',STRING,required,,'occupation'
'relationship',STRING,required,,'relationship'
'race',STRING,required,,'race'
'sex',STRING,required,,'sex'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'marital_status',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'race',"'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'"
'sex',"'Female', 'Male'"
'native_country',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'income',"' <=50K', ' >50K'"


In [34]:
# Re-validate the test statistics now that the schema carries the 'age'
# range, then show the result. This rebinds `anomalies`.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

tfdv.display_anomalies(anomalies=anomalies)

### Examine Dataset Slices

In [36]:
from tensorflow_data_validation.utils import slicing_util

# One slicer per feature of interest. The `None` value places no restriction
# on which feature values to slice on (the race slicing output later in the
# notebook shows one dataset per observed value).
race_slice_fn = slicing_util.get_feature_value_slicer({'race': None})
sex_slice_fn = slicing_util.get_feature_value_slicer({'sex': None})

In [38]:
# Statistics options for each slicing: both take feature types from the
# curated schema and differ only in the slice function they apply.
race_slice_stats = tfdv.StatsOptions(
    slice_functions=[race_slice_fn],
    schema=schema,
    infer_type_from_schema=True,
)

sex_slice_stats = tfdv.StatsOptions(
    slice_functions=[sex_slice_fn],
    schema=schema,
    infer_type_from_schema=True,
)

In [39]:
# CSV files that the sliced-statistics cells read from: one for the 'sex'
# slicing and one for the 'race' slicing.
SEX_SLICE_CSV_PATH, RACE_SLICE_CSV_PATH = (
    "data/sex_slice.csv",
    "data/race_slice.csv",
)

In [40]:
# Write the training data to the CSV files used for sliced statistics.
# index=False keeps the DataFrame's row index out of the files; by default
# pandas would write it as an extra, unnamed first column that is not one of
# the schema's features.
train_df.to_csv(SEX_SLICE_CSV_PATH, index=False)
train_df.to_csv(RACE_SLICE_CSV_PATH, index=False)

In [41]:
# Calculate statistics for the data sliced on the 'sex' feature.
# NOTE: despite the `_stat_option` suffix, these variables hold the computed
# statistics; the options objects are `sex_slice_stats` / `race_slice_stats`.
sex_slice_stat_option = tfdv.generate_statistics_from_csv(
    data_location=SEX_SLICE_CSV_PATH,
    stats_options=sex_slice_stats,
)

# Calculate statistics for the data sliced on the 'race' feature.
race_slice_stat_option = tfdv.generate_statistics_from_csv(
    data_location=RACE_SLICE_CSV_PATH,
    stats_options=race_slice_stats,
)





Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [42]:
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

# Index the sliced statistics by slice name so each slice is selected by its
# label instead of by its position in `datasets`. Positional access silently
# picks the wrong slice if the order ever differs; a name lookup raises
# KeyError instead.
sex_slices_by_name = {
    sliced.name: sliced for sliced in sex_slice_stat_option.datasets
}

# Slice names follow the '<feature>_<value>' pattern.
male_stats_name = 'sex_Male'
female_stats_name = 'sex_Female'

# Wrap the `Male` slice in its own DatasetFeatureStatisticsList, the type
# passed to tfdv.visualize_statistics in the comparison cell.
male_stats_list = DatasetFeatureStatisticsList()
male_stats_list.datasets.extend([sex_slices_by_name[male_stats_name]])

# Wrap the `Female` slice the same way.
female_stats_list = DatasetFeatureStatisticsList()
female_stats_list.datasets.extend([sex_slices_by_name[female_stats_name]])

In [44]:
# Compare the Male slice (left-hand side) with the Female slice (right-hand
# side).
tfdv.visualize_statistics(
    lhs_statistics=male_stats_list,
    lhs_name=male_stats_name,
    rhs_statistics=female_stats_list,
    rhs_name=female_stats_name,
)

In [46]:
# List the names of the datasets produced by slicing on 'race', then report
# the type of a single element of `datasets`.
race_slice_names = [sliced.name for sliced in race_slice_stat_option.datasets]
print(f'Datasets generated: {race_slice_names}')

first_slice_type = type(race_slice_stat_option.datasets[0])
print(f'Type of sliced_stats elements: {first_slice_type}')

Datasets generated: ['All Examples', 'race_White', 'race_Black', 'race_Asian-Pac-Islander', 'race_Amer-Indian-Eskimo', 'race_Other']
Type of sliced_stats elements: <class 'tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatistics'>


In [48]:
# Index the sliced statistics by slice name so each race is selected by its
# label instead of by its position in `datasets`. Positional access silently
# picks the wrong slice if the order ever differs; a name lookup raises
# KeyError instead.
race_slices_by_name = {
    sliced.name: sliced for sliced in race_slice_stat_option.datasets
}


def _race_slice_as_stats_list(slice_name):
    """Return a DatasetFeatureStatisticsList holding only the named race slice.

    Args:
        slice_name: Name of a dataset in `race_slice_stat_option.datasets`,
            e.g. 'race_White'.

    Returns:
        A new DatasetFeatureStatisticsList whose only dataset is that slice.

    Raises:
        KeyError: If no sliced dataset has the given name.
    """
    stats_list = DatasetFeatureStatisticsList()
    stats_list.datasets.extend([race_slices_by_name[slice_name]])
    return stats_list


# `White` slice
white_stats_name = 'race_White'
white_stats_list = _race_slice_as_stats_list(white_stats_name)

# `Black` slice
black_stats_name = 'race_Black'
black_stats_list = _race_slice_as_stats_list(black_stats_name)

# `Asian-Pac-Islander` slice
asian_stats_name = 'race_Asian-Pac-Islander'
asian_stats_list = _race_slice_as_stats_list(asian_stats_name)

# `Amer-Indian-Eskimo` slice
indian_stats_name = 'race_Amer-Indian-Eskimo'
indian_stats_list = _race_slice_as_stats_list(indian_stats_name)

# `Other` slice
other_stats_name = 'race_Other'
other_stats_list = _race_slice_as_stats_list(other_stats_name)

In [50]:
# Compare the White slice (left-hand side) with the Black slice (right-hand
# side).
tfdv.visualize_statistics(
    lhs_statistics=white_stats_list,
    lhs_name=white_stats_name,
    rhs_statistics=black_stats_list,
    rhs_name=black_stats_name,
)