In [None]:
!pip install tensorflow-data-validation
!pip install -q tensorflow_data_validation[visualization]

In [None]:
import pandas as pd
import tensorflow_data_validation as tfdv
import sys
import warnings
warnings.filterwarnings('ignore')

assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'

###  Load the Consumer Spending Dataset

> Graduated = Whether or not the person is a college graduate  
> Profession = The person's occupation  
> Work Experience = The number of years in the workforce  
> Family Size = The size of the family unit  
> Spending Score = The spending score for consumer spending

In [3]:
# Read the training split from disk and preview its first five rows.
score_train = pd.read_csv(filepath_or_buffer="data/score_train.csv")
score_train.head(n=5)

Unnamed: 0,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
0,No,Healthcare,1.0,4.0,Low
1,Yes,Engineer,,3.0,Average
2,Yes,Engineer,1.0,1.0,Low
3,Yes,Lawyer,0.0,2.0,High
4,Yes,Entertainment,,6.0,High


In [4]:
# Read the test split from disk and preview its first five rows.
score_test = pd.read_csv(filepath_or_buffer="data/score_test.csv")
score_test.head(n=5)

Unnamed: 0,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
0,No,Doctor,0.0,5.0,Average
1,Yes,Entertainment,1.0,4.0,Average
2,No,Lawyer,0.0,5.0,Low
3,Yes,Executive,1.0,5.0,High
4,Yes,Artist,1.0,2.0,Average


In [5]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
score_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Graduated        3964 non-null   object 
 1   Profession       3944 non-null   object 
 2   Work_Experience  3589 non-null   float64
 3   Family_Size      3831 non-null   float64
 4   Spending_Score   4000 non-null   object 
dtypes: float64(2), object(3)
memory usage: 156.4+ KB


#### Describe and Visualize Train Dataset Statistics

#### TFDV generates different types of statistics based on the type of features.

**For numerical features, TFDV computes for every feature:**
* Count of records
* Number of missing (i.e., null) values
* Histogram of values
* Mean and standard deviation
* Minimum and maximum values
* Percentage of zero values

**For categorical features, TFDV provides:**
* Count of values
* Percentage of missing values
* Number of unique values
* Average string length
* Count for each label and its rank

In [6]:
# Compute descriptive statistics for the training DataFrame, then render them.
train_stats = tfdv.generate_statistics_from_dataframe(score_train)
tfdv.visualize_statistics(lhs_statistics=train_stats)

#### Describe and Visualize Test Dataset Statistics

In [7]:
# Compute descriptive statistics for the test DataFrame, then render them.
test_stats = tfdv.generate_statistics_from_dataframe(score_test)
tfdv.visualize_statistics(lhs_statistics=test_stats)

#### Describe and Visualize Both Train and Test Dataset Statistics

In [8]:
# Show both splits in a single visualization: train on the left-hand side,
# test on the right-hand side.
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    lhs_name='Train',
    rhs_statistics=test_stats,
    rhs_name='Test',
)

#### Train Schema

In [9]:
# Infer a schema from the training statistics and dump its text form.
schema = tfdv.infer_schema(train_stats)
print(schema)

feature {
  name: "Graduated"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "Graduated"
  presence {
    min_count: 1
  }
}
feature {
  name: "Profession"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "Profession"
  presence {
    min_count: 1
  }
}
feature {
  name: "Work_Experience"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "Family_Size"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "Spending_Score"
  type: BYTES
  domain: "Spending_Score"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
string_domain {
  name: "Graduated"
  value: "No"
  value: "Yes"
}
string_domain {
  name: "Profession"
  value: "Artist"
  value: "Doctor"
  value: "Engineer"
  value: "Entertainment"
  value: "Executive"
  value: "Healthcare"
  value: "Homemaker"
  value: "Lawyer"
  value: "Mar

#### Schema Display

In [10]:
# Render the inferred schema as tables: one for the features, one for the string domains.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Graduated',STRING,optional,single,'Graduated'
'Profession',STRING,optional,single,'Profession'
'Work_Experience',FLOAT,optional,single,-
'Family_Size',FLOAT,optional,single,-
'Spending_Score',STRING,required,,'Spending_Score'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Graduated',"'No', 'Yes'"
'Profession',"'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing'"
'Spending_Score',"'Average', 'High', 'Low'"


In [11]:
# Look up the three features that should always be populated...
graduated, profession, family_size = (
    tfdv.get_feature(schema, feature_name)
    for feature_name in ('Graduated', 'Profession', 'Family_Size')
)

# ...and require each of them to be present in 100% of examples.
for required_feature in (graduated, profession, family_size):
    required_feature.presence.min_fraction = 1.0

In [12]:
# Re-render the schema to confirm the updated presence constraints.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Graduated',STRING,required,single,'Graduated'
'Profession',STRING,required,single,'Profession'
'Work_Experience',FLOAT,optional,single,-
'Family_Size',FLOAT,required,single,-
'Spending_Score',STRING,required,,'Spending_Score'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Graduated',"'No', 'Yes'"
'Profession',"'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing'"
'Spending_Score',"'Average', 'High', 'Low'"


In [13]:
# Fetch the string domain (the list of allowed values) attached to Profession.
# Note: this rebinds `profession`, which earlier held the Profession feature.
profession = tfdv.get_domain(schema, 'Profession')
profession

name: "Profession"
value: "Artist"
value: "Doctor"
value: "Engineer"
value: "Entertainment"
value: "Executive"
value: "Healthcare"
value: "Homemaker"
value: "Lawyer"
value: "Marketing"

In [14]:
# Allow 'Self-Employed' as a Profession value. The membership check keeps this
# cell idempotent: re-running it does not insert a duplicate entry.
if 'Self-Employed' not in profession.value:
    profession.value.insert(0, 'Self-Employed')
profession.value

['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing']

In [15]:
# Drop 'Homemaker' from the allowed Profession values. The membership check
# keeps this cell idempotent: re-running it after the value is gone would
# otherwise raise ValueError.
if 'Homemaker' in profession.value:
    profession.value.remove('Homemaker')
profession.value

['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing']

In [16]:
# Inspect the current Family_Size feature definition.
tfdv.get_feature(schema, 'Family_Size')

name: "Family_Size"
value_count {
  min: 1
  max: 1
}
type: FLOAT
presence {
  min_fraction: 1.0
  min_count: 1
}

In [17]:
# Change Family_Size from FLOAT to an integer feature. The raw enum value 2
# is what display_schema reports as INT in the table below.
size = tfdv.get_feature(schema, 'Family_Size')
size.type = 2

# Show the schema again to verify the type change.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Graduated',STRING,required,single,'Graduated'
'Profession',STRING,required,single,'Profession'
'Work_Experience',FLOAT,optional,single,-
'Family_Size',INT,required,single,-
'Spending_Score',STRING,required,,'Spending_Score'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Graduated',"'No', 'Yes'"
'Profession',"'Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing'"
'Spending_Score',"'Average', 'High', 'Low'"


<a style="color:black; text-decoration:none" href="https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/courses/machine_learning/deepdive2/production_ml/solutions/tfdv_basic_spending.ipynb">Solutions</a>