In [1]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

from sklearn.model_selection import train_test_split
from util import add_extra_rows

from tensorflow_metadata.proto.v0 import schema_pb2

print('TFDV Version: {}'.format(tfdv.__version__))
print('Tensorflow Version: {}'.format(tf.__version__))

2024-03-13 02:19:11.768062: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-13 02:19:11.768379: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 02:19:11.771039: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 02:19:11.808790: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TFDV Version: 1.14.0
Tensorflow Version: 2.16.1


In [3]:
# Read in the training and evaluation datasets
df = pd.read_csv('Clean_1.csv')

# Split the dataset. Do not shuffle for this demo notebook.
train_df, eval_df = train_test_split(df, test_size=0.2, shuffle=False)

In [4]:
# Preview the train set
train_df.head()

Unnamed: 0,overall,verified,reviewTime,asin,reviewText,summary
0,5.0,True,"09 4, 2015",B000K2PJ4K,Great product and price!,Five Stars
1,3.0,True,"05 6, 2015",B000K2PJ4K,Waaay too small. Will use for futur children!,Oops!
2,5.0,True,"05 6, 2015",B000K2PJ4K,Stays vibrant after many washes,Great
3,5.0,True,"05 6, 2015",B000K2PJ4K,Stays vibrant after many washes,Good
4,5.0,True,"05 6, 2015",B000K2PJ4K,My son really likes the pink. Ones which I was...,Great


In [5]:
# add extra rows
eval_df = add_extra_rows(eval_df)

# preview the added rows
eval_df.tail(4)

Unnamed: 0,overall,verified,reviewTime,asin,reviewText,summary,Product_Type
2557,4.0,True,"01 16, 2017",B01HI8V10E,Nice pictures and some good recipes. Might co...,PRETTY GOOD,
2558,7.0,True,"09 3, 2015",B000KPIHQ4,Added for testing tfdv,Good,AMAZON_FASHION_5
2559,5.0,True,"09 7, 2017",B000KPIHQ4,Added for testing tfdv,Good,AMAZON_FASHION_6
2560,5.0,True,"09 3, 2022",B000KPIHQ4,Added for testing tfdv,Bad,AMAZON_FASHION_6


In [6]:
# Generate training dataset statistics
train_stats = tfdv.generate_statistics_from_dataframe(train_df)

In [7]:
# Visualize training dataset statistics
tfdv.visualize_statistics(train_stats)

In [10]:
# Infer schema from the computed statistics.
schema = tfdv.infer_schema(statistics=train_stats)

# Display the inferred schema
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'overall',FLOAT,required,,-
'verified',INT,required,,-
'reviewTime',BYTES,required,,-
'asin',BYTES,required,,-
'reviewText',BYTES,required,,-
'summary',BYTES,required,,-
'__index_level_0__',INT,required,,-


In [11]:
# Generate evaluation dataset statistics
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df)

# Compare training with evaluation
tfdv.visualize_statistics(
    lhs_statistics=eval_stats, 
    rhs_statistics=train_stats, 
    lhs_name='EVAL_DATASET', 
    rhs_name='TRAIN_DATASET'
)

In [12]:
# Check evaluation data for errors by validating the evaluation dataset statistics using the reference schema
anomalies =  tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Visualize anomalies
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'__index_level_0__',Column dropped,Column is completely missing
'Product_Type',New column,New column (column in data but not in schema)


In [13]:
# Add new value to the domain of the feature `race`
Product_Type_domain = tfdv.get_domain(schema, 'Product_Type')
Product_Type_domain.value.append('AMAZON_FASHION_6')

ValueError: Feature Product_Type not found in the schema.

In [None]:
# Restrict the range of the `age` feature
tfdv.set_domain(schema, 'overall', schema_pb2.FloatDomain(name='overall', min=1.0, max=5.0))

# Display the modified schema. Notice the `Domain` column of `age`.
tfdv.display_schema(schema)

In [None]:
# Validate eval stats after updating the schema 
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

In [None]:
%run tfdv.py