In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import schema_utils
from tensorflow_transform.tf_metadata import dataset_metadata
import tempfile

# Report the library versions this notebook was executed with.
print('TensorFlow version: {}'.format(tf.__version__))
print('TFX Transform version: {}'.format(tft.__version__))

TensorFlow version: 2.13.1
TFX Transform version: 1.14.0


In [2]:
# Input CSVs (paths are relative to the notebook's working directory).
# Both paths are named so neither split is hard-coded inline.
train_data_file = "data/train_cleaned.csv"
test_data_file = "data/test_cleaned.csv"

# Load both splits with pandas for inspection. `train_data_file` is also the
# path handed to the Beam/TFXIO reader later in the notebook.
train_df = pd.read_csv(train_data_file)
test_df = pd.read_csv(test_data_file)

In [3]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#train_df.to_dict()

In [5]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
# Raw string-valued columns.
CATEGORICAL_FEATURE_KEYS = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

# Raw numeric columns; all of them are declared as float32 in the spec below.
NUMERIC_FEATURE_KEYS = [
    'age',
    'fnlwgt',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'education_num',
]

# Raw label column (string-valued).
LABEL_KEY = 'income'

# Feature spec for the raw data: every column is a fixed-length scalar,
# tf.string for the categorical columns and the label, tf.float32 for the
# numeric columns. Key order matches the original construction
# (categorical, numeric, label).
RAW_DATA_FEATURE_SPEC = {
    **{name: tf.io.FixedLenFeature([], tf.string)
       for name in CATEGORICAL_FEATURE_KEYS},
    **{name: tf.io.FixedLenFeature([], tf.float32)
       for name in NUMERIC_FEATURE_KEYS},
    LABEL_KEY: tf.io.FixedLenFeature([], tf.string),
}

# Build the dataset metadata once and derive SCHEMA from it, so the schema
# handed to the CSV reader and the metadata handed to tf.Transform cannot
# drift apart.
raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)
)
SCHEMA = raw_data_metadata.schema

In [9]:
# Print the schema held by the metadata object and the standalone SCHEMA.
# The public `schema` property is used instead of the private `_schema`
# attribute, which is an implementation detail of DatasetMetadata.
print(raw_data_metadata.schema)
print("\n")
print(SCHEMA)

feature {
  name: "age"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "capital_gain"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "capital_loss"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "education"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "education_num"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "fnlwgt"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "hours_per_week"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "income"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "marital_status"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "native_country"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }

In [12]:
# Show the concrete type of the schema object.
type(SCHEMA)

tensorflow_metadata.proto.v0.schema_pb2.Schema

In [10]:
import apache_beam as beam
from tfx_bsl.public import tfxio
from tfx_bsl.coders.example_coder import RecordBatchToExamples

In [11]:
# Column names in the order they appear in the training DataFrame; passed to
# the CSV TFXIO reader as `column_names`.
ORDERED_CSV_COLUMNS = train_df.columns.tolist()
ORDERED_CSV_COLUMNS

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income']

In [None]:
from pathlib import Path

In [13]:
# Single Beam pipeline that the read / transform / write steps are attached to.
# (A previous CsvTFXIO-based pipeline built here was immediately overwritten by
# the definitions below and has been removed as dead code.)
pipeline = beam.Pipeline()

# TFXIO that decodes raw CSV lines (bytes) according to SCHEMA.
csv_tfxio = tfxio.BeamRecordCsvTFXIO(
    physical_format='text', column_names=ORDERED_CSV_COLUMNS, schema=SCHEMA)

raw_data = (
    pipeline
    # The CSV starts with a header row (pandas takes the column names from
    # it), so skip it: otherwise the header would be decoded as a data record.
    | 'ReadTrainData' >> beam.io.ReadFromText(
        train_data_file, coder=beam.coders.BytesCoder(), skip_header_lines=1)
    # Normalise ", " separators to "," before CSV decoding.
    | 'FixCommasTrainData' >> beam.Map(
        lambda line: line.replace(b', ', b','))
    | 'DecodeTrainData' >> csv_tfxio.BeamSource())

In [10]:
# Display the repr of the (not yet executed) raw-data PCollection.
raw_data

<PCollection[[9]: TFXIORead/RawRecordToRecordBatch/CollectRecordBatchTelemetry/ProfileRecordBatches.None] at 0x2c8c3b59870>

In [14]:
# Features with string data types that will be converted to indices
CATEGORICAL_FEATURE_KEYS = [
    'education',
    'marital_status',
    'occupation',
    'race',
    'relationship',
    'workclass',
    'sex',
    'native_country',
]

# Continuous numeric features.
NUMERIC_FEATURE_KEYS = [
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

# Features that are discretised into buckets.
BUCKET_FEATURE_KEYS = ['age']

# Number of buckets tf.transform uses for each bucketised feature.
FEATURE_BUCKET_COUNT = dict(age=4)

# Name of the label column the model predicts.
LABEL_KEY = 'income'

In [15]:
def transformed_name(key):
    """Return the name used for the transformed version of feature ``key``.

    The transformed name is the raw key with the ``_xf`` suffix appended.
    """
    suffix = '_xf'
    return key + suffix
    
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing for the raw input columns.

    Args:
        inputs: mapping from raw feature key to its input tensor. It is indexed
            with every key in NUMERIC_FEATURE_KEYS, CATEGORICAL_FEATURE_KEYS
            and BUCKET_FEATURE_KEYS, and with LABEL_KEY.

    Returns:
        dict mapping each transformed feature name (the raw key passed through
        ``transformed_name``) to its transformed tensor.
    """
    transformed = {}

    # Numeric features are rescaled to the [0, 1] range.
    transformed.update(
        (transformed_name(key), tft.scale_to_0_1(inputs[key]))
        for key in NUMERIC_FEATURE_KEYS)

    # Categorical features are replaced by their vocabulary index.
    transformed.update(
        (transformed_name(key), tft.compute_and_apply_vocabulary(inputs[key]))
        for key in CATEGORICAL_FEATURE_KEYS)

    # Bucket features are discretised into the configured number of buckets.
    transformed.update(
        (transformed_name(key),
         tft.bucketize(inputs[key], FEATURE_BUCKET_COUNT[key]))
        for key in BUCKET_FEATURE_KEYS)

    # The string label is also mapped to a vocabulary index.
    label_name = transformed_name(LABEL_KEY)
    transformed[label_name] = tft.compute_and_apply_vocabulary(inputs[LABEL_KEY])

    return transformed

In [14]:
import pprint

In [None]:
#train_df.to_dict('records')

In [18]:

# Only surface TensorFlow log messages at ERROR level.
tf.get_logger().setLevel('ERROR')

# tf.Transform needs a scratch directory while analysing the data.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Analyse and transform the in-memory records with `preprocessing_fn`.
    # `output_record_batches` is left at its default (False) so the result is
    # a list of per-row dicts, which is what the preview cell slices and
    # pretty-prints; with record batches the preview would show Arrow batches
    # instead of rows.
    transformed_dataset, transform_fn = (
        (train_df.to_dict('records'), raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    )

# Split the result into the transformed rows and their metadata.
transformed_data, transformed_metadata = transformed_dataset






In [24]:
# Pretty-print the first ten transformed records.
print(f'Transformed data:\n{pprint.pformat(transformed_data[:10])}')

Transformed data:
[{'age_xf': 2,
  'capital_gain_xf': 0.02174021676182747,
  'capital_loss_xf': 0.0,
  'education_num_xf': 0.800000011920929,
  'education_xf': 2,
  'fnlwgt_xf': 0.044301897287368774,
  'hours_per_week_xf': 0.3979591727256775,
  'income_xf': 0,
  'marital_status_xf': 1,
  'native_country_xf': 0,
  'occupation_xf': 3,
  'race_xf': 0,
  'relationship_xf': 1,
  'sex_xf': 0,
  'workclass_xf': 4},
 {'age_xf': 3,
  'capital_gain_xf': 0.0,
  'capital_loss_xf': 0.0,
  'education_num_xf': 0.800000011920929,
  'education_xf': 2,
  'fnlwgt_xf': 0.048237595707178116,
  'hours_per_week_xf': 0.12244898080825806,
  'income_xf': 0,
  'marital_status_xf': 0,
  'native_country_xf': 0,
  'occupation_xf': 2,
  'race_xf': 0,
  'relationship_xf': 0,
  'sex_xf': 0,
  'workclass_xf': 1},
 {'age_xf': 2,
  'capital_gain_xf': 0.0,
  'capital_loss_xf': 0.0,
  'education_num_xf': 0.5333333611488342,
  'education_xf': 0,
  'fnlwgt_xf': 0.13811343908309937,
  'hours_per_week_xf': 0.3979591727256775,


In [11]:
# Path of the constants module; used as the target of the %%writefile cell
# that follows.
census_constant_module = "transformation/income_constant.py"

In [12]:
%%writefile {census_constant_module}
# String-valued features; each is converted to an integer index.
CATEGORICAL_FEATURE_KEYS = [
    'education',
    'marital_status',
    'occupation',
    'race',
    'relationship',
    'workclass',
    'sex',
    'native_country',
]

# Continuous numeric features.
NUMERIC_FEATURE_KEYS = [
    'fnlwgt',
    # 'education_num',  # disabled -- NOTE(review): confirm this exclusion is intended
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

# Features that are discretised into buckets.
BUCKET_FEATURE_KEYS = ['age']

# Number of buckets tf.transform uses for each bucketised feature.
FEATURE_BUCKET_COUNT = dict(age=4)

# Name of the label column the model predicts.
LABEL_KEY = 'income'


def transformed_name(key):
    """Return the transformed-feature name for ``key`` (``key`` + ``'_xf'``)."""
    suffix = '_xf'
    return key + suffix

Overwriting transformation/income_constant.py


In [13]:
# Path of the transform module; used as the target of the %%writefile cell
# that follows.
census_transform_module_file = 'transformation/income_transform.py'

In [14]:
%%writefile {census_transform_module_file}
import income_constant
import tensorflow as tf
import tensorflow_transform as tft

# Module-private aliases for the shared constants and the naming helper.
_NUMERIC_FEATURE_KEYS = income_constant.NUMERIC_FEATURE_KEYS
_CATEGORICAL_FEATURE_KEYS = income_constant.CATEGORICAL_FEATURE_KEYS
_BUCKET_FEATURE_KEYS = income_constant.BUCKET_FEATURE_KEYS
_FEATURE_BUCKET_COUNT = income_constant.FEATURE_BUCKET_COUNT
_LABEL_KEY = income_constant.LABEL_KEY
_transformed_name = income_constant.transformed_name


def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing for the raw input columns.

    Args:
        inputs: mapping from raw feature key to its input tensor. It is indexed
            with every key in the numeric, categorical and bucket key lists of
            ``income_constant`` and with its label key.

    Returns:
        dict mapping each transformed feature name (the raw key passed through
        ``income_constant.transformed_name``) to its transformed tensor.
    """
    transformed = {}

    # Numeric features are rescaled to the [0, 1] range.
    transformed.update(
        (_transformed_name(key), tft.scale_to_0_1(inputs[key]))
        for key in _NUMERIC_FEATURE_KEYS)

    # Categorical features are replaced by their vocabulary index.
    transformed.update(
        (_transformed_name(key), tft.compute_and_apply_vocabulary(inputs[key]))
        for key in _CATEGORICAL_FEATURE_KEYS)

    # Bucket features are discretised into the configured number of buckets.
    transformed.update(
        (_transformed_name(key),
         tft.bucketize(inputs[key], _FEATURE_BUCKET_COUNT[key]))
        for key in _BUCKET_FEATURE_KEYS)

    # The string label is also mapped to a vocabulary index.
    label_name = _transformed_name(_LABEL_KEY)
    transformed[label_name] = tft.compute_and_apply_vocabulary(inputs[_LABEL_KEY])

    return transformed

Overwriting transformation/income_transform.py


In [16]:
# Pair the raw PCollection with the reader's TensorAdapterConfig; this
# (data, metadata) tuple is the input piped into AnalyzeAndTransformDataset.
raw_dataset = (raw_data, csv_tfxio.TensorAdapterConfig())

In [17]:
#from transformation.income_transform import preprocessing_fn

# Ignore warnings: only surface TensorFlow log messages at ERROR level.
tf.get_logger().setLevel('ERROR')

# tf.Transform needs a scratch directory while analysing the data.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Attach the analyse + transform steps to the raw dataset. The first
    # result is a (data, metadata) pair that a later cell unpacks.
    transformed_data, transform_function = raw_dataset | tft_beam.AnalyzeAndTransformDataset(
        preprocessing_fn=preprocessing_fn,
        output_record_batches=True,
    )

In [18]:
# Fresh temporary directory that receives the pipeline outputs written below.
output_dir = tempfile.mkdtemp()

In [20]:
# Unpack the (data, metadata) pair produced by AnalyzeAndTransformDataset.
# NOTE(review): this rebinds `transformed_data` to one of its own elements, so
# the cell is not idempotent -- re-running it without first re-running the
# transform cell would try to unpack the already-unpacked data.
transformed_data, transformed_metadata = transformed_data

In [22]:
# Display the repr of the transformed-data PCollection.
transformed_data

<PCollection[[17]: AnalyzeAndTransformDataset/TransformDataset/ConvertToRecordBatch.None] at 0x21a4b23b430>

In [25]:
import os

# Encode each (record batch, passthrough) pair as serialized tf.Examples and
# write them as TFRecords under `output_dir`.
_ = (
    transformed_data
    | 'EncodeTrainData' >> beam.FlatMapTuple(
        lambda record_batch, _: RecordBatchToExamples(record_batch))
    | 'WriteTrainData' >> beam.io.WriteToTFRecord(
        os.path.join(output_dir, 'transformed.tfrecord'))
)

In [26]:
# Persist the transform function (the tf.Transform graph) to `output_dir`.
_ = (
    transform_function
    | 'WriteTransformFn' >> tft_beam.WriteTransformFn(output_dir))

# Everything attached to `pipeline` so far only builds the Beam graph: the
# pipeline was created with a bare `beam.Pipeline()` (no `with` block) and no
# visible cell calls `run()`, so without this nothing is read, transformed or
# written to `output_dir`.
pipeline.run().wait_until_finish()