<a href="https://colab.research.google.com/github/Saber-Hosseinzade/TensorFlowExtended_TFX/blob/main/Data_pipeline_components.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a name='1'></a>
## 1 - Imports

In [None]:
# Restart the RunTime after completion of this section
!pip install tfx==1.2

In [None]:
import tensorflow as tf
import tfx

# TFX components
from tfx.components import CsvExampleGen
from tfx.components import ExampleValidator
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Transform
from tfx.components import ImporterNode

# TFX libraries
import tensorflow_data_validation as tfdv
import tensorflow_transform as tft
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# For performing feature selection
from sklearn.feature_selection import SelectKBest, f_classif

# For feature visualization
import matplotlib.pyplot as plt 
import seaborn as sns

# Utilities
from tensorflow.python.lib.io import file_io
from tensorflow_metadata.proto.v0 import schema_pb2
from google.protobuf.json_format import MessageToDict
from  tfx.proto import example_gen_pb2
from tfx.types import standard_artifacts
import os
import pprint
import tempfile
import pandas as pd

# To ignore warnings from TF
#tf.get_logger().setLevel('ERROR')

# For formatting print statements
pp = pprint.PrettyPrinter()


<a name='2'></a>
## 2 - Load the forest cover type dataset


| Column Name | Variable Type | Units / Range | Description |
| --------- | ------------ | ----- | ------------------- |
| Elevation | quantitative |meters | Elevation in meters |
| Aspect | quantitative | azimuth | Aspect in degrees azimuth |
| Slope | quantitative | degrees | Slope in degrees |
| Horizontal_Distance_To_Hydrology | quantitative | meters | Horz Dist to nearest surface water features |
| Vertical_Distance_To_Hydrology | quantitative | meters | Vert Dist to nearest surface water features |
| Horizontal_Distance_To_Roadways | quantitative | meters | Horz Dist to nearest roadway |
| Hillshade_9am | quantitative | 0 to 255 index | Hillshade index at 9am, summer solstice |
| Hillshade_Noon | quantitative | 0 to 255 index | Hillshade index at noon, summer solstice |
| Hillshade_3pm | quantitative | 0 to 255 index | Hillshade index at 3pm, summer solstice |
| Horizontal_Distance_To_Fire_Points | quantitative | meters | Horz Dist to nearest wildfire ignition points |
| Wilderness_Area (4 binary columns) | qualitative | 0 (absence) or 1 (presence) | Wilderness area designation |
| Soil_Type (40 binary columns) | qualitative | 0 (absence) or 1 (presence) | Soil Type designation |
| Cover_Type (7 types) | integer | 1 to 7 | Forest Cover Type designation |


In [None]:
# Declare paths to the data
data_dir = './data'
training_dir = './data/training'
training_file = './data/training/dataset.csv'

!rm -rf pipeline
!rm -rf data

# Create the directory
!mkdir -p {training_dir}

In [None]:
# download the dataset
!wget -nc https://storage.googleapis.com/workshop-datasets/covertype/full/dataset.csv -P {training_dir}

<a name='3'></a>
## 3 - Feature Selection



In [None]:
df = pd.read_csv(training_file)
df.head()

In [None]:
df.info()

In [None]:
# Column groups left out of the numeric feature-selection step
cat_columns = ['Wilderness_Area', 'Soil_Type']
label_column = ['Cover_Type']

# Numeric features only; the explicit copy keeps `df_num` independent of `df`
df_num = df.drop(columns=cat_columns + label_column).copy()

df_num.head()

In [None]:
# Labels as a flat 1-D vector. `label_column` is a list, so `df[label_column]`
# is a one-column DataFrame and `.values` alone would be shaped (n_samples, 1);
# scikit-learn expects a 1-D target and warns about column vectors.
y = df[label_column].values.ravel()

# Numeric feature matrix, one column per entry of `df_num.columns`
X = df_num.values

<a name='ex-1'></a>
### Exercise 1: Feature Selection

In [None]:
# Score each numeric feature against the label with `f_classif` and keep
# the 8 best-scoring ones
skb = SelectKBest(f_classif, k=8)
skb.fit(X, y)
X_new = skb.transform(X)

# Boolean mask aligned with `df_num.columns`: True where the feature is kept
features_mask = skb.get_support()

reqd_cols = pd.DataFrame({'Columns': df_num.columns, 'Retain': features_mask})
print(reqd_cols)

In [None]:
# Set the paths to the reduced dataset
training_dir_skb = f'{training_dir}/fselect'
training_file_skb = f'{training_dir_skb}/dataset.csv'

# Create the directory
!mkdir -p {training_dir_skb}

In [None]:
# Get the feature names from SelectKBest
skb_features = list(df_num.columns[features_mask])

# Append the categorical and label columns
skb_features = skb_features + cat_columns + label_column

df_skb = df[skb_features]

df_skb.to_csv(training_file_skb, index=False)

df_skb.head()

###  Interactive Context


In [None]:
# Directory passed to the context as the pipeline root
pipe_dir = './pipeline'

# Interactive context used to run and display the components in this notebook
context = InteractiveContext(pipeline_root=pipe_dir)


### ExampleGen



In [None]:
# # NOTE: Uncomment and run this if you get an error saying there are different
# # headers in the dataset. This is usually because of the notebook checkpoints saved in
# # that folder. `serving_dir` is only defined further down in this notebook, so the
# # last command can only be run after that cell has been executed.
# !rm -rf {training_dir}/.ipynb_checkpoints
# !rm -rf {training_dir_skb}/.ipynb_checkpoints
# !rm -rf {serving_dir}/.ipynb_checkpoints

In [None]:
# Ingest the CSV data found under the feature-selected dataset directory
example_gen = CsvExampleGen(input_base=training_dir_skb)

# Execute the component through the interactive context
context.run(example_gen)


### StatisticsGen


In [None]:
# Compute statistics over the examples produced by `example_gen`
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples'])

context.run(statistics_gen)


In [None]:
# Display the results
context.show(statistics_gen.outputs['statistics'])


### SchemaGen


In [None]:

# Generate a schema from the statistics produced by `statistics_gen`
schema_gen = SchemaGen(
    statistics=statistics_gen.outputs['statistics'])

context.run(schema_gen)


In [None]:
# Visualize the output
context.show(schema_gen.outputs['schema'])


### Updating the schema with TFDV


In [None]:
# Location of the schema artifact produced by SchemaGen. The artifact list is
# read through the channel's public `get()` accessor rather than the private
# `_artifacts` attribute.
schema_uri = schema_gen.outputs['schema'].get()[0].uri

In [None]:
# Get the schema pbtxt file from the SchemaGen output
schema = tfdv.load_schema_text(os.path.join(schema_uri, 'schema.pbtxt'))

In [None]:
# Set the two `Hillshade` features to have a range of 0 to 255
tfdv.set_domain(schema, 'Hillshade_9am', schema_pb2.IntDomain(name='Hillshade_9am', min=0, max=255))
tfdv.set_domain(schema, 'Hillshade_Noon', schema_pb2.IntDomain(name='Hillshade_Noon', min=0, max=255))

# Set the `Slope` feature to have a range of 0 to 90
tfdv.set_domain(schema, 'Slope', schema_pb2.IntDomain(name='Slope', min=0, max=90))

# Set `Cover_Type` to categorical having minimum value of 0 and maximum value of 6
tfdv.set_domain(schema, 'Cover_Type', schema_pb2.IntDomain(name='Cover_Type', min=0, max=6, is_categorical=True))

tfdv.display_schema(schema=schema)


### Schema Environments (training & serving environments)


In [None]:
# Paths for the simulated serving dataset
serving_dir = f'{data_dir}/serving'
serving_file = f'{serving_dir}/serving_dataset.csv'

# Create the serving directory
!mkdir -p {serving_dir}

In [None]:
# Build a small serving dataset from the first 100 rows of the training CSV
serving_data = pd.read_csv(training_file, nrows=100)

# Remove the label column from the serving data
serving_data.drop(columns='Cover_Type', inplace=True)

# Write the serving dataset to disk without the index column
serving_data.to_csv(serving_file, index=False)

# The DataFrame is not needed once the file has been written
del serving_data

In [None]:
# Declare the two environments on the schema. The membership checks keep this
# cell idempotent: re-running it does not append duplicate entries.
for env_name in ('TRAINING', 'SERVING'):
    if env_name not in schema.default_environment:
        schema.default_environment.append(env_name)

# Mark the label `Cover_Type` as not expected in the SERVING environment
cover_type_feature = tfdv.get_feature(schema, 'Cover_Type')
if 'SERVING' not in cover_type_feature.not_in_environment:
    cover_type_feature.not_in_environment.append('SERVING')

In [None]:
# Declare StatsOptions to use the curated schema
stats_options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)

# Compute the statistics of the serving dataset
serving_stats = tfdv.generate_statistics_from_csv(serving_file, stats_options=stats_options)


anomalies = tfdv.validate_statistics(serving_stats, schema=schema, environment='SERVING')
tfdv.display_anomalies(anomalies)

In [None]:
# Declare the path to the updated schema directory
updated_schema_dir = f'{pipe_dir}/updated_schema'

# Create the said directory
!mkdir -p {updated_schema_dir}

# Declare the path to the schema file
schema_file = f'{updated_schema_dir}/schema.pbtxt'

# Save the curated schema to the said file
tfdv.write_schema_text(schema, schema_file)

In [None]:
# Load the schema from the directory we just created
new_schema = tfdv.load_schema_text(schema_file)

tfdv.display_schema(schema=new_schema)

In [None]:
new_schema.default_environment


### Generate new statistics using the updated schema & ImporterNode



In [None]:
# Import the curated schema directory into the pipeline as a Schema artifact
user_schema_importer = ImporterNode(
    source_uri=updated_schema_dir,
    artifact_type=standard_artifacts.Schema)

# Run with caching disabled
context.run(user_schema_importer, enable_cache=False)

In [None]:
context.show(user_schema_importer.outputs['result'])


####  Statistics with the new schema


In [None]:

# Compute statistics again, this time also passing the imported curated schema
statistics_gen_updated = StatisticsGen(
    examples=example_gen.outputs['examples'],
    schema=user_schema_importer.outputs['result'])

context.run(statistics_gen_updated)


In [None]:
context.show(statistics_gen_updated.outputs['statistics'])


#### ExampleValidator for checking anomalies


In [None]:
# Validate the updated statistics against the imported curated schema

example_validator = ExampleValidator(
    statistics=statistics_gen_updated.outputs['statistics'],
    schema=user_schema_importer.outputs['result'])
    
context.run(example_validator)


In [None]:
# Visualize the results
context.show(example_validator.outputs['anomalies'])


####  Feature engineering


In [None]:
# Set the constants module filename
_cover_constants_module_file = 'cover_constants.py'

In [None]:
%%writefile {_cover_constants_module_file}

# Features scaled with `tft.scale_by_min_max` in `cover_transform.py`
SCALE_MINMAX_FEATURE_KEYS = [
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
    ]

# Features scaled with `tft.scale_to_0_1` in `cover_transform.py`
SCALE_01_FEATURE_KEYS = [
        "Hillshade_9am",
        "Hillshade_Noon",
        "Horizontal_Distance_To_Fire_Points",
    ]

# Features scaled with `tft.scale_to_z_score` in `cover_transform.py`
SCALE_Z_FEATURE_KEYS = [
        "Elevation",
        "Slope",
        "Horizontal_Distance_To_Roadways",
    ]

# Features passed to `tft.compute_and_apply_vocabulary` in `cover_transform.py`
VOCAB_FEATURE_KEYS = ["Wilderness_Area"]

# Features passed to `tft.hash_strings` in `cover_transform.py`
HASH_STRING_FEATURE_KEYS = ["Soil_Type"]

# Name of the label column
LABEL_KEY = "Cover_Type"

# Naming helper for features produced by the transform step
def transformed_name(key):
    """Return `key` with the `_xf` suffix appended."""
    suffix = '_xf'
    return key + suffix

In [None]:
# Set the transform module filename
_cover_transform_module_file = 'cover_transform.py'

In [None]:
%%writefile {_cover_transform_module_file}

import tensorflow as tf
import tensorflow_transform as tft

import cover_constants

_SCALE_MINMAX_FEATURE_KEYS = cover_constants.SCALE_MINMAX_FEATURE_KEYS
_SCALE_01_FEATURE_KEYS = cover_constants.SCALE_01_FEATURE_KEYS
_SCALE_Z_FEATURE_KEYS = cover_constants.SCALE_Z_FEATURE_KEYS
_VOCAB_FEATURE_KEYS = cover_constants.VOCAB_FEATURE_KEYS
_HASH_STRING_FEATURE_KEYS = cover_constants.HASH_STRING_FEATURE_KEYS
_LABEL_KEY = cover_constants.LABEL_KEY
_transformed_name = cover_constants.transformed_name

def preprocessing_fn(inputs):
    """Build the dictionary of transformed features.

    Every column named in one of the `cover_constants` key groups is passed
    through the matching `tensorflow_transform` op and stored under its
    `_xf`-suffixed name. The label column is copied through unchanged under
    its original key.

    Args:
        inputs: mapping from raw feature name to that feature's column
            (presumably the tensors supplied by the Transform component).

    Returns:
        dict mapping each output feature name to its transformed column.
    """
    # (feature keys, op) pairs, applied in the order listed
    transform_plan = (
        (_SCALE_MINMAX_FEATURE_KEYS, tft.scale_by_min_max),
        (_SCALE_01_FEATURE_KEYS, tft.scale_to_0_1),
        (_SCALE_Z_FEATURE_KEYS, tft.scale_to_z_score),
        (_VOCAB_FEATURE_KEYS, tft.compute_and_apply_vocabulary),
        # String features are hashed into 10 buckets
        (_HASH_STRING_FEATURE_KEYS,
         lambda column: tft.hash_strings(column, hash_buckets=10)),
    )

    outputs = {}
    for keys, op in transform_plan:
        for key in keys:
            outputs[_transformed_name(key)] = op(inputs[key])

    # The label keeps its original name and is not transformed
    outputs[_LABEL_KEY] = inputs[_LABEL_KEY]

    return outputs


In [None]:

# Configure Transform with the ingested examples, the imported curated schema
# and the absolute path of the module file written above
transform = Transform(
    examples=example_gen.outputs['examples'],
    schema=user_schema_importer.outputs['result'],
    module_file=os.path.abspath(_cover_transform_module_file))
    
# Run with caching disabled
context.run(transform, enable_cache=False)
