# Data Validation - Diabetic Data

[Tensorflow Data Validation (TFDV)](https://cloud.google.com/solutions/machine-learning/analyzing-and-validating-data-at-scale-for-ml-using-tfx) 

Download files here: https://drive.google.com/file/d/1n8x0UhaadEfyixUCBoyXrgkiwniIm3ZM/view?usp=sharing

[Diabetes 130-US hospitals for years 1999-2008 Data Set](https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008)

In [None]:
!pip install -q tensorflow_data_validation

In [None]:
# Import packages
import os
import pandas as pd
import tensorflow as tf
import tempfile, urllib, zipfile
import tensorflow_data_validation as tfdv


from tensorflow.python.lib.io import file_io
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList, DatasetFeatureStatistics

# Set TF's logger to only display errors to avoid internal warnings being shown
tf.get_logger().setLevel('ERROR')

In [None]:
# Read CSV data into a dataframe and recognize the missing data that is encoded with '?' string as NaN
df = pd.read_csv('./diabetic_data.csv', header=0, na_values = '?')

# Preview the dataset
df.head()

In [None]:
def prepare_data_splits_from_dataframe(df):
    '''
    Partition a Pandas Dataframe, in row order, into training, evaluation and serving sets.

    The first 70% of the rows (rounded down) become the training set. The
    remaining rows are halved: the first half (rounded down) is the
    evaluation set and whatever is left is the serving set.

    Parameters:
            df : pandas dataframe to split; it must contain a 'readmitted' column

    Returns:
            train_df: Training dataframe (about 70% of the rows)
            eval_df: Evaluation dataframe (about 15% of the rows)
            serving_df: Serving dataframe (about 15% of the rows, 'readmitted' column dropped)
    '''
    total_rows = len(df)

    # Row positions where one subset ends and the next one begins
    train_end = int(total_rows * 0.7)
    eval_end = train_end + (total_rows - train_end) // 2

    def _rows(start, stop):
        # Positional slice with a fresh 0-based index
        return df.iloc[start:stop].reset_index(drop=True)

    train_df = _rows(0, train_end)
    eval_df = _rows(train_end, eval_end)

    # Serving data stands in for prediction requests, so the label column is removed
    serving_df = _rows(eval_end, total_rows).drop(columns=['readmitted'])

    return train_df, eval_df, serving_df

In [None]:
# Partition the full dataframe into the three working sets
train_df, eval_df, serving_df = prepare_data_splits_from_dataframe(df)

# Report how many records ended up in each set
print(
    f'Training dataset has {len(train_df)} records\n'
    f'Validation dataset has {len(eval_df)} records\n'
    f'Serving dataset has {len(serving_df)} records'
)

In [None]:
# Define features to remove
features_to_remove = {'encounter_id', 'patient_nbr'}

# Collect features to include while computing the statistics
approved_cols = [col for col in df.columns if (col not in features_to_remove)]

# Instantiate a StatsOptions class and define the feature_allowlist property
stats_options = tfdv.StatsOptions(feature_allowlist=approved_cols)

# Review the features to generate the statistics
for feature in stats_options.feature_allowlist:
    print(feature)

In [None]:
# Generate training dataset statistics using the stats options defined above
train_stats = tfdv.generate_statistics_from_dataframe(train_df, stats_options)

In [None]:
# TEST CODE

# get the number of features used to compute statistics
print(f"Number of features used: {len(train_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples used: {train_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {train_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {train_stats.datasets[0].features[-1].path.step[0]}")

In [None]:
# Visualize the training dataset statistics
tfdv.visualize_statistics(train_stats)

In [None]:
# Infer the data schema by using the training statistics that you generated
schema = tfdv.infer_schema(train_stats)

# Display the data schema
tfdv.display_schema(schema)

In [None]:
# TEST CODE

# Check number of features
print(f"Number of features in schema: {len(schema.feature)}")

# Check domain name of 2nd feature
print(f"Second feature in schema: {list(schema.feature)[1].domain}")

In [None]:
# Generate evaluation dataset statistics
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df, stats_options=stats_options)

# Compare evaluation data with training data 
tfdv.visualize_statistics(lhs_statistics=eval_stats, rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')

In [None]:
# TEST CODE

# get the number of features used to compute statistics
print(f"Number of features: {len(eval_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples: {eval_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {eval_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {eval_stats.datasets[0].features[-1].path.step[0]}")

In [None]:
# Summarize the 'glimepiride-pioglitazone' column of the training set
train_df["glimepiride-pioglitazone"].describe()

In [None]:
# Summarize the 'glimepiride-pioglitazone' column of the evaluation set
eval_df["glimepiride-pioglitazone"].describe()

It is possible but highly inefficient to visually inspect and determine all the anomalies. So, let's instead use TFDV functions to detect and display these.

You can use the function [`tfdv.validate_statistics()`](https://www.tensorflow.org/tfx/data_validation/api_docs/python/tfdv/validate_statistics) for detecting anomalies and [`tfdv.display_anomalies()`](https://www.tensorflow.org/tfx/data_validation/api_docs/python/tfdv/display_anomalies) for displaying them.

The `validate_statistics()` method has two required arguments:
- an instance of `DatasetFeatureStatisticsList`
- an instance of `Schema`

The following function, given the statistics and schema, calculates and displays the anomalies found.

In [None]:
def calculate_and_display_anomalies(statistics, schema):
    '''
    Validate statistics against a schema and show the anomalies that are found.

            Parameters:
                    statistics : Data statistics in statistics_pb2.DatasetFeatureStatisticsList format
                    schema : Data schema in schema_pb2.Schema format

            Returns:
                    None; the anomalies are rendered through tfdv.display_anomalies
    '''
    # Validation result is passed straight to the display helper
    tfdv.display_anomalies(tfdv.validate_statistics(statistics, schema))

You should see detected anomalies in the `medical_specialty` and `glimepiride-pioglitazone` features by running the cell below.

In [None]:
# Check evaluation data for errors by validating the evaluation data statistics using the previously inferred schema
calculate_and_display_anomalies(eval_stats, schema=schema)

In [None]:
# Get the domain associated with the input feature, glimepiride-pioglitazone, from the schema
glimepiride_pioglitazone_domain = tfdv.get_domain(schema, 'glimepiride-pioglitazone') 

# HINT: Append the missing value 'Steady' to the domain
glimepiride_pioglitazone_domain.value.append('Steady')

# Get the domain associated with the input feature, medical_specialty, from the schema
medical_specialty_domain = tfdv.get_domain(schema, 'medical_specialty') 

# HINT: Append the missing value 'Neurophysiology' to the domain
medical_specialty_domain.value.append('Neurophysiology')

# HINT: Re-calculate and re-display anomalies with the new schema
calculate_and_display_anomalies(eval_stats, schema=schema)

In [None]:
# Define a new statistics options by the tfdv.StatsOptions class for the serving data by passing the previously inferred schema
options = tfdv.StatsOptions(schema=schema, 
                            infer_type_from_schema=True, 
                            feature_allowlist=approved_cols)

In [None]:
# Generate serving dataset statistics with the schema-aware stats options
serving_stats = tfdv.generate_statistics_from_dataframe(serving_df, stats_options=options)

# Validate the serving statistics against the schema and display any anomalies
calculate_and_display_anomalies(serving_stats, schema=schema)

In [None]:
# This relaxes the minimum fraction of values that must come from the domain for the feature.

# Get the feature and relax to match 90% of the domain
payer_code = tfdv.get_feature(schema, 'payer_code')
payer_code.distribution_constraints.min_domain_mass = 0.9 

# Get the feature and relax to match 90% of the domain
medical_specialty = tfdv.get_feature(schema, 'medical_specialty')
medical_specialty.distribution_constraints.min_domain_mass = 0.9 

# Detect anomalies with the updated constraints
calculate_and_display_anomalies(serving_stats, schema=schema)

In [None]:
# Display the current schema
tfdv.display_schema(schema)

In [None]:
def modify_domain_of_features(features_list, schema, to_domain_name):
    '''
    Point every listed feature at the same domain.

    The schema passed in is modified in place through tfdv.set_domain and
    that same object is returned.

            Parameters:
                    features_list : Names of the features whose domain is replaced
                    schema: Schema holding those features
                    to_domain_name : Domain assigned to each listed feature

            Returns:
                    schema: the schema that was passed in, after modification
    '''
    for feature_name in features_list:
        tfdv.set_domain(schema, feature_name, to_domain_name)
    return schema

In [None]:
# Features whose domain will be replaced by the 'metformin' domain
domain_change_features = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
                          'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 
                          'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
                          'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 
                          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']


# Update the schema by using the modify_domain_of_features function
# and the defined domain_change_features feature list
schema = modify_domain_of_features(domain_change_features, schema, 'metformin')

# Display the updated schema
tfdv.display_schema(schema)

In [None]:
# TEST CODE

# check that the domain of some features are now switched to `metformin`
print(f"Domain name of 'chlorpropamide': {tfdv.get_feature(schema, 'chlorpropamide').domain}")
print(f"Domain values of 'chlorpropamide': {tfdv.get_domain(schema, 'chlorpropamide').value}")
print(f"Domain name of 'repaglinide': {tfdv.get_feature(schema, 'repaglinide').domain}")
print(f"Domain values of 'repaglinide': {tfdv.get_domain(schema, 'repaglinide').value}")
print(f"Domain name of 'nateglinide': {tfdv.get_feature(schema, 'nateglinide').domain}")
print(f"Domain values of 'nateglinide': {tfdv.get_domain(schema, 'nateglinide').value}")

In [None]:
# Validate the serving statistics against the current schema and display any anomalies
calculate_and_display_anomalies(serving_stats, schema=schema)

In [None]:
# All features are by default in both TRAINING and SERVING environments.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

In [None]:
# Specify that 'readmitted' feature is not in SERVING environment.
tfdv.get_feature(schema, 'readmitted').not_in_environment.append('SERVING')

# Calculate anomalies by validating the serving statistics with the
# schema and the SERVING environment parameter.
serving_anomalies_with_env = tfdv.validate_statistics(serving_stats, schema, environment='SERVING')

In [None]:
# Display anomalies
tfdv.display_anomalies(serving_anomalies_with_env)

In [None]:
# Set the skew comparator threshold (infinity norm) for the diabetesMed feature
diabetes_med = tfdv.get_feature(schema, 'diabetesMed')
diabetes_med.skew_comparator.infinity_norm.threshold = 0.03 # domain knowledge helps to determine this threshold

# Set the drift comparator threshold (infinity norm) for the payer_code feature
payer_code = tfdv.get_feature(schema, 'payer_code')
payer_code.drift_comparator.infinity_norm.threshold = 0.03 # domain knowledge helps to determine this threshold

# Calculate anomalies: the training statistics are validated with the evaluation
# statistics supplied as `previous_statistics` and the serving statistics
# supplied as `serving_statistics`
skew_drift_anomalies = tfdv.validate_statistics(train_stats, schema,
                                          previous_statistics=eval_stats,
                                          serving_statistics=serving_stats)

# Display anomalies
tfdv.display_anomalies(skew_drift_anomalies)

In [None]:
def split_datasets(dataset_list):
    '''
    Wrap each dataset of a statistics list in its own single-dataset list.

            Parameters:
                    dataset_list: object whose `datasets` attribute holds the datasets to split

            Returns:
                    datasets: list of DatasetFeatureStatisticsList objects, one per
                              input dataset and in the same order
    '''
    def _as_single_list(dataset):
        # One fresh container holding only this dataset
        container = DatasetFeatureStatisticsList()
        container.datasets.extend([dataset])
        return container

    return [_as_single_list(entry) for entry in dataset_list.datasets]


def display_stats_at_index(index, datasets):
    '''
    Print the name of, and visualize, the statistics stored at a given position.

    Nothing happens when the index is not smaller than the number of datasets.

            Parameters:
                    index : position of the statistics to show
                    datasets: list of split statistics (see split_datasets)

            Returns:
                    None; the slice name is printed and its statistics are visualized
    '''
    # Out-of-range request: nothing to show
    if index >= len(datasets):
        return

    selected = datasets[index]
    print(selected.datasets[0].name)
    tfdv.visualize_statistics(selected)

In [None]:
def sliced_stats_for_slice_fn(slice_fn, approved_cols, dataframe, schema):
    '''
    Generate statistics for the sliced data.

    The dataframe is written to 'slice_sample.csv' in the current working
    directory (overwriting any existing file) and the statistics are then
    computed from that CSV file.

            Parameters:
                    slice_fn : slicing definition
                    approved_cols: list of features to pass to the statistics options
                    dataframe: pandas dataframe to slice
                    schema: the schema

            Returns:
                    slice_info_datasets: list with one statistics object per generated slice
    '''
    # Set the StatsOptions
    slice_stats_options = tfdv.StatsOptions(schema=schema,
                                            slice_functions=[slice_fn],
                                            infer_type_from_schema=True,
                                            feature_allowlist=approved_cols)

    # Convert Dataframe to CSV since `slice_functions` works only with `tfdv.generate_statistics_from_csv`
    CSV_PATH = 'slice_sample.csv'

    # index=False keeps the dataframe's row index from being written as an
    # extra, unnamed CSV column that is not part of the data
    dataframe.to_csv(CSV_PATH, index=False)

    # Calculate statistics for the sliced dataset
    sliced_stats = tfdv.generate_statistics_from_csv(CSV_PATH, stats_options=slice_stats_options)

    # Split the dataset using the previously defined split_datasets function
    slice_info_datasets = split_datasets(sliced_stats)

    return slice_info_datasets

In [None]:
# Generate slice function for the `medical_specialty` feature
slice_fn = slicing_util.get_feature_value_slicer(features={'medical_specialty': None})

# Generate stats for the sliced dataset
slice_datasets = sliced_stats_for_slice_fn(slice_fn, approved_cols, dataframe=train_df, schema=schema)

# Print name of slices for reference
print(f'Statistics generated for:\n')
print('\n'.join([sliced.datasets[0].name for sliced in slice_datasets]))

# Display at index 10, which corresponds to the slice named `medical_specialty_Gastroenterology`
# NOTE(review): the index is hard-coded and assumes the slice order printed above - confirm if the data changes
display_stats_at_index(10, slice_datasets) 

In [None]:
# Create output directory
OUTPUT_DIR = "output"
file_io.recursive_create_dir(OUTPUT_DIR)

# Use TensorFlow text output format pbtxt to store the schema
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')

# The write_schema_text function expects the schema and the output path as parameters
tfdv.write_schema_text(schema, schema_file) 

# Tensorflow Data Validation (TFDV) - Income Data

[Census Income Dataset](http://archive.ics.uci.edu/ml/datasets/Census+Income)

A description of the dataset's features is available [in this data description file.](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)


In [None]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow_metadata.proto.v0 import schema_pb2

print('TFDV Version: {}'.format(tfdv.__version__))
print('Tensorflow Version: {}'.format(tf.__version__))

In [None]:
# Read in the training and evaluation datasets
df = pd.read_csv('./adult.data', skipinitialspace=True)

# Split the dataset. Do not shuffle for this demo notebook.
train_df, eval_df = train_test_split(df, test_size=0.2, shuffle=False)

In [None]:
# Preview the train set
train_df.head()

In [None]:
# Preview the eval set
eval_df.head()

In [None]:
#@title helper function

def add_extra_rows(df):
    '''
    Return a new dataframe made of `df` followed by four hand-crafted rows.

    The input dataframe is not modified and the result gets a fresh 0-based
    index. The first extra row has no 'workclass' entry, so that cell is
    missing (NaN) in the result. The remaining rows carry values such as an
    age of 0 and of 1000, a '?' workclass and occupations like 'gamer' -
    presumably chosen to trigger anomalies during validation.

            Parameters:
                    df : pandas dataframe to extend

            Returns:
                    df : new dataframe with the four rows added at the end
    '''
    rows = [
        {
            'age': 46, 
            'fnlwgt': 257473, 
            'education': 'Bachelors', 
            'education-num': 8,
            'marital-status': 'Married-civ-spouse', 
            'occupation': 'Plumber', 
            'relationship': 'Husband', 
            'race': 'Other', 
            'sex': 'Male',
            'capital-gain': 1000, 
            'capital-loss': 0, 
            'hours-per-week': 41, 
            'native-country': 'Australia',
            'label': '>50K'
        },
        {
            'age': 0, 
            'workclass': 'Private', 
            'fnlwgt': 257473, 
            'education': 'Masters', 
            'education-num': 8,
            'marital-status': 'Married-civ-spouse', 
            'occupation': 'Adm-clerical', 
            'relationship': 'Wife', 
            'race': 'Asian', 
            'sex': 'Female',
            'capital-gain': 0, 
            'capital-loss': 0, 
            'hours-per-week': 40, 
            'native-country': 'Pakistan',
            'label': '>50K'
        },
        {
            'age': 1000, 
            'workclass': 'Private', 
            'fnlwgt': 257473, 
            'education': 'Masters', 
            'education-num': 8,
            'marital-status': 'Married-civ-spouse', 
            'occupation': 'Prof-specialty', 
            'relationship': 'Husband', 
            'race': 'Black', 
            'sex': 'Male',
            'capital-gain': 0, 
            'capital-loss': 0, 
            'hours-per-week': 20, 
            'native-country': 'Cameroon',
            'label': '<=50K'
        },
        {
            'age': 25, 
            'workclass': '?', 
            'fnlwgt': 257473, 
            'education': 'Masters', 
            'education-num': 8,
            'marital-status': 'Married-civ-spouse', 
            'occupation': 'gamer', 
            'relationship': 'Husband', 
            'race': 'Asian', 
            'sex': 'Female',
            'capital-gain': 0, 
            'capital-loss': 0, 
            'hours-per-week': 50, 
            'native-country': 'Mongolia',
            'label': '<=50K'
        }
    ]

    # DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
    # pd.concat is the supported way to add rows and keeps df's column order.
    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    return df

In [None]:
# add extra rows
eval_df = add_extra_rows(eval_df)

# preview the added rows
eval_df.tail(4)

In [None]:
# Generate training dataset statistics
train_stats = tfdv.generate_statistics_from_dataframe(train_df)

In [None]:
# Visualize training dataset statistics
tfdv.visualize_statistics(train_stats)

In [None]:
# Infer schema from the computed statistics.
schema = tfdv.infer_schema(statistics=train_stats)

# Display the inferred schema
tfdv.display_schema(schema)

In [None]:
# Generate evaluation dataset statistics
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df)

# Compare training with evaluation
tfdv.visualize_statistics(
    lhs_statistics=eval_stats, 
    rhs_statistics=train_stats, 
    lhs_name='EVAL_DATASET', 
    rhs_name='TRAIN_DATASET'
)

In [None]:
# Keep only the rows whose age is above 16 and below 91, then drop rows with
# missing values. Building a new dataframe in one expression (instead of calling
# dropna(inplace=True) on a filtered selection) avoids pandas'
# SettingWithCopyWarning.
eval_df = eval_df[(eval_df['age'] > 16) & (eval_df['age'] < 91)].dropna()

In [None]:
# Generate evaluation dataset statistics
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df)

# Compare training with evaluation
tfdv.visualize_statistics(
    lhs_statistics=eval_stats, 
    rhs_statistics=train_stats, 
    lhs_name='EVAL_DATASET', 
    rhs_name='TRAIN_DATASET'
)

In [None]:
# Check evaluation data for errors by validating the evaluation dataset statistics using the reference schema
anomalies =  tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Visualize anomalies
tfdv.display_anomalies(anomalies)

In [None]:
# Relax the minimum fraction of values that must come from the domain for the feature `native-country`
country_feature = tfdv.get_feature(schema, 'native-country')
country_feature.distribution_constraints.min_domain_mass = 0.9

# Relax the minimum fraction of values that must come from the domain for the feature `occupation`
occupation_feature = tfdv.get_feature(schema, 'occupation')
occupation_feature.distribution_constraints.min_domain_mass = 0.9

In [None]:
# Add new value to the domain of the feature `race`
race_domain = tfdv.get_domain(schema, 'race')
race_domain.value.append('Asian')

In [None]:
# Restrict the range of the `age` feature
tfdv.set_domain(schema, 'age', schema_pb2.IntDomain(name='age', min=17, max=90))

# Display the modified schema. Notice the `Domain` column of `age`.
tfdv.display_schema(schema)

In [None]:
# Validate eval stats after updating the schema 
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

In [None]:
from tensorflow_data_validation.utils import slicing_util

# Generate slice function for the `sex` feature
slice_fn = slicing_util.get_feature_value_slicer(features={'sex': None})

In [None]:
# Declare stats options
slice_stats_options = tfdv.StatsOptions(schema=schema,
                                        slice_functions=[slice_fn],
                                        infer_type_from_schema=True)

In [None]:
# Convert dataframe to CSV since `slice_functions` works only with `tfdv.generate_statistics_from_csv`
CSV_PATH = 'slice_sample.csv'

# index=False keeps the dataframe's row index from being written as an extra,
# unnamed CSV column, which would otherwise be read back as a spurious feature
train_df.to_csv(CSV_PATH, index=False)

# Calculate statistics for the sliced dataset
sliced_stats = tfdv.generate_statistics_from_csv(CSV_PATH, stats_options=slice_stats_options)

In [None]:
print(f'Datasets generated: {[sliced.name for sliced in sliced_stats.datasets]}')

print(f'Type of sliced_stats elements: {type(sliced_stats.datasets[0])}')

In [None]:
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

# NOTE(review): the slice positions below are hard-coded; they assume `Male` is at
# index 1 and `Female` at index 2 of `sliced_stats.datasets` - confirm against the
# dataset names printed earlier

# Convert `Male` statistics (index=1) to the correct type and get the dataset name
male_stats_list = DatasetFeatureStatisticsList()
male_stats_list.datasets.extend([sliced_stats.datasets[1]])
male_stats_name = sliced_stats.datasets[1].name

# Convert `Female` statistics (index=2) to the correct type and get the dataset name
female_stats_list = DatasetFeatureStatisticsList()
female_stats_list.datasets.extend([sliced_stats.datasets[2]])
female_stats_name = sliced_stats.datasets[2].name

# Visualize the two slices side by side
tfdv.visualize_statistics(
    lhs_statistics=male_stats_list,
    rhs_statistics=female_stats_list,
    lhs_name=male_stats_name,
    rhs_name=female_stats_name
)