In [1]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

from sklearn.model_selection import train_test_split
from util import add_extra_rows

from tensorflow_metadata.proto.v0 import schema_pb2

# Report the library versions this notebook was executed with.
print(f'TFDV Version: {tfdv.__version__}')
print(f'Tensorflow Version: {tf.__version__}')

2024-03-13 01:43:53.733923: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-13 01:43:53.734162: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 01:43:53.736056: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 01:43:53.762682: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TFDV Version: 1.14.0
Tensorflow Version: 2.16.1


In [2]:
# Load the full dataset from the CSV file that sits next to this notebook.
df = pd.read_csv('clean1.csv')

# Carve off 20% of the rows for evaluation. Shuffling is switched off on
# purpose for this demo notebook.
train_df, eval_df = train_test_split(
    df,
    shuffle=False,
    test_size=0.2,
)

In [3]:
# Preview the train set: the first rows of `train_df` are rendered as the cell output.
train_df.head()

Unnamed: 0,overall,verified,reviewTime,asin,reviewText,summary,Product_Type
0,5.0,True,"09 4, 2015",B000K2PJ4K,Great product and price!,Five Stars,AMAZON_FASHION_5
1,3.0,True,"05 6, 2015",B000K2PJ4K,Waaay too small. Will use for futur children!,Oops!,AMAZON_FASHION_5
2,5.0,True,"05 6, 2015",B000K2PJ4K,Stays vibrant after many washes,Great,AMAZON_FASHION_5
3,5.0,True,"05 6, 2015",B000K2PJ4K,Stays vibrant after many washes,Good,AMAZON_FASHION_5
4,5.0,True,"05 6, 2015",B000K2PJ4K,My son really likes the pink. Ones which I was...,Great,AMAZON_FASHION_5


In [4]:
# Add extra rows to the evaluation split using the project helper `add_extra_rows`.
# NOTE(review): `eval_df` is rebound to the helper's return value, so running this
# cell a second time feeds the already-extended frame back into the helper.
# Re-run the train/eval split cell first if a clean `eval_df` is needed.
eval_df = add_extra_rows(eval_df)

# Preview the added rows: show the last four rows of the extended frame.
eval_df.tail(4)

Unnamed: 0,overall,verified,reviewTime,asin,reviewText,summary,Product_Type
90852,3.0,True,"01 7, 2018",B01HJBWFBO,Core is very hard and does not transfer color ...,Color on paper is not very saturated due to ha...,Arts_Crafts_and_Sewing_5
90853,7.0,True,"09 3, 2015",B000KPIHQ4,Added for testing tfdv,Good,AMAZON_FASHION_5
90854,5.0,True,"09 7, 2017",B000KPIHQ4,Added for testing tfdv,Good,AMAZON_FASHION_6
90855,5.0,True,"09 3, 2022",B000KPIHQ4,Added for testing tfdv,Bad,AMAZON_FASHION_6


In [5]:
# Generate training dataset statistics directly from the `train_df` DataFrame.
# The result is kept in `train_stats` and reused by the later schema-inference
# and comparison cells.
train_stats = tfdv.generate_statistics_from_dataframe(train_df)

In [6]:
# Visualize training dataset statistics (rendered as this cell's output).
tfdv.visualize_statistics(train_stats)

In [None]:
# Infer schema from the computed training statistics. This `schema` object is the
# reference that the evaluation statistics are validated against further down.
schema = tfdv.infer_schema(statistics=train_stats)

# Display the inferred schema
tfdv.display_schema(schema)

In [None]:
# Compute statistics for the evaluation DataFrame.
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df)

# Render evaluation (lhs) and training (rhs) statistics in one comparison view.
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [None]:
# Validate the evaluation statistics against the reference schema to surface
# any errors in the evaluation data.
anomalies = tfdv.validate_statistics(
    schema=schema,
    statistics=eval_stats,
)

# Render the anomalies that were found.
tfdv.display_anomalies(anomalies)

In [None]:
# Add a new accepted value to the domain of the `Product_Type` feature.
Product_Type_domain = tfdv.get_domain(schema, 'Product_Type')

# Only append when the value is missing, so that re-running this cell does not
# pile up duplicate entries in the domain's value list.
if 'AMAZON_FASHION_6' not in Product_Type_domain.value:
    Product_Type_domain.value.append('AMAZON_FASHION_6')

In [None]:
# Restrict the range of the `overall` feature to values between 1.0 and 5.0.
tfdv.set_domain(schema, 'overall', schema_pb2.FloatDomain(name='overall', min=1.0, max=5.0))

# Display the modified schema. Notice the `Domain` column of `overall`.
tfdv.display_schema(schema)

In [None]:
# Validate the evaluation statistics again, now against the updated schema,
# and render the anomalies that remain.
updated_anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(updated_anomalies)