## TFX Pipelines

#### Generate Examples

In [7]:
# Temporary one-off commands: download the training-data zip from Google Cloud Storage, extract the CSVs, and move them back to the bucket
# ! gsutil -m cp gs://text-analysis-323506/train_data/train_val.zip ./
# ! unzip train_val.zip
# ! gunzip *.csv.gz
# ! gsutil -m mv *.csv gs://text-analysis-323506/train_data/

In [None]:
# ! pip3 install tfx

In [35]:
import time
import os

import tfx

import absl
import os
import tempfile
import time

import tensorflow as tf
import tensorflow_data_validation as tfdv
import tensorflow_model_analysis as tfma
import tensorflow_transform as tft
import tensorflow as tf
from tfx.components.common_nodes.importer_node import ImporterNode

from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.proto import example_gen_pb2
from tensorflow_metadata.proto.v0 import schema_pb2, statistics_pb2, anomalies_pb2
from tfx.components import StatisticsGen, CsvExampleGen, SchemaGen

In [4]:
# Display the installed TensorFlow version so the environment is recorded with the notebook.
tf.__version__

'2.6.2'

In [5]:
# Display the installed TFX version so the environment is recorded with the notebook.
tfx.__version__

'1.4.0'

In [33]:
# Location of the training CSVs in Google Cloud Storage.
DATA_ROOT = 'gs://text-analysis-323506/train_data/'

# Local output directories, both rooted at <os.sep>home<os.sep>jupyter.
_JUPYTER_HOME = os.path.join(os.sep, 'home', 'jupyter')
ARTIFACT_STORE = os.path.join(_JUPYTER_HOME, 'artifact-store')
SERVING_MODEL_DIR = os.path.join(_JUPYTER_HOME, 'serving_model')

In [7]:
# Pipeline identifier; also used as the directory name under /home/jupyter/.
PIPELINE_NAME = 'sentiment-analysis'

# Each execution of this cell gets its own timestamped root directory.
run_timestamp = time.strftime("%Y%m%d_%H%M%S")
PIPELINE_ROOT = os.path.join('/home/jupyter/', PIPELINE_NAME, run_timestamp)

# Create the directory up front; exist_ok=True makes this safe if it already exists.
os.makedirs(PIPELINE_ROOT, exist_ok=True)

In [8]:
# Interactive TFX context used to run and inspect components cell by cell.
# Artifacts are written under PIPELINE_ROOT; metadata_connection_config=None
# passes no explicit ML Metadata connection, leaving the TFX default in effect.
context = InteractiveContext(
    pipeline_name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
    metadata_connection_config=None)



### CSV Example Generator

In [9]:
# Split the examples into 'train' and 'eval' using a 4:1 hash-bucket ratio.
train_split = example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=4)
eval_split = example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)

output_config = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(splits=[train_split, eval_split]))

In [10]:
# ExampleGen component: takes the CSV data under DATA_ROOT as input and
# applies the train/eval split described by output_config.
example_gen = CsvExampleGen(
    input_base=DATA_ROOT,
    output_config=output_config)

In [None]:
# Execute the ExampleGen component inside the interactive context.
context.run(example_gen)

In [12]:
# Take the first 'examples' output artifact of ExampleGen and remember where it was written.
examples_artifact = example_gen.outputs['examples'].get()[0]
examples_uri = examples_artifact.uri

In [13]:
# Build the full path of every file in the 'Split-train' output directory.
train_split_dir = os.path.join(examples_uri, 'Split-train')
tfrecord_filenames = [
    os.path.join(train_split_dir, filename)
    for filename in os.listdir(train_split_dir)
]

In [14]:
# Open the train-split files as a TFRecord dataset; the files are read as GZIP-compressed.
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

2021-11-23 17:56:20.085763: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [15]:
# Sanity check: decode the first two serialized records of the train split
# and print every feature they contain.
for tfrecord in dataset.take(2):
    example = tf.train.Example()
    example.ParseFromString(tfrecord.numpy())
    for name, feature in example.features.feature.items():
        # A Feature stores its payload in the `kind` oneof (bytes_list,
        # float_list or int64_list). WhichOneof returns None when nothing is
        # set; print an empty list in that case instead of reusing `value`
        # from the previous feature (or hitting a NameError on the first one).
        kind = feature.WhichOneof('kind')
        value = getattr(feature, kind).value if kind is not None else []
        print('{}: {}'.format(name, value))
    print('******')

input: [b'myattorney home business looking account prose litigant state federal lawsuite maintain hisher files']
labels: [1]
******
input: [b'great book liked book sarah plain tall authors name patricia maclachen characters names caleb sarah anna papa though book interestingit first started caleb siting fire asking questions mama singing songs anna explaining born mama died sarah answered papas letter came live sarah taught caleb swim anybody gets book really enjoy']
labels: [0]
******


2021-11-23 17:56:20.179374: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


#### Train and eval datasets have been created properly!

### Statistics Generator

In [16]:
# StatisticsGen component: consumes the 'examples' output of ExampleGen.
statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

In [None]:
# Execute the StatisticsGen component inside the interactive context.
context.run(statistics_gen)

In [None]:
# Visualize the 'statistics' output of StatisticsGen in the notebook.
context.show(statistics_gen.outputs['statistics'])

### Schema Generator

In [19]:
# SchemaGen component: derives a schema from the 'statistics' output of
# StatisticsGen. infer_feature_shape=False turns off feature-shape inference.
schema_gen = SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)

In [None]:
# Execute the SchemaGen component inside the interactive context.
context.run(schema_gen)

In [28]:
# Display the 'schema' output of SchemaGen (feature names, types, presence, valency, domain).
context.show(schema_gen.outputs['schema'])

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'input',BYTES,required,single,-
'labels',INT,required,single,-


In [29]:
# Locate the schema.pbtxt file under SchemaGen's first 'schema' output
# artifact and load it with TFDV so it can be edited below.
# os.path.join is used (as elsewhere in this notebook) rather than manual
# '/' formatting, so a trailing slash on the URI cannot produce '//'.
schema_artifact_uri = schema_gen.outputs['schema'].get()[0].uri
schema_proto_path = os.path.join(schema_artifact_uri, 'schema.pbtxt')
schema = tfdv.load_schema_text(schema_proto_path)

In [31]:
# Attach an integer domain to the 'labels' feature: values between 0 and 1,
# flagged as categorical. `schema` is updated in place (no reassignment needed).
tfdv.set_domain(schema, 'labels', schema_pb2.IntDomain(name='labels', min=0, max=1, is_categorical=True))



In [32]:
# Display the edited schema to confirm the new domain on 'labels'.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'input',BYTES,required,single,-
'labels',INT,required,single,min: 0; max: 1


#### Write schema to new file

In [None]:
# Persist the edited schema under ARTIFACT_STORE/schema so the Importer
# component below can bring it back into the pipeline.
schema_dir = os.path.join(ARTIFACT_STORE, 'schema')
tf.io.gfile.makedirs(schema_dir)
schema_file = os.path.join(schema_dir, 'schema.pbtxt')

# Write the schema as a text-format proto file.
tfdv.write_schema_text(schema, schema_file)

# Print the written file to verify its contents.
!cat {schema_file}

### Schema Importer

In [43]:
# Importer component that registers the directory holding the edited
# schema.pbtxt (schema_dir) as a Schema artifact, under the component id
# 'schema_importer'.
schema_importer = tfx.dsl.components.common.importer.Importer(
      source_uri=schema_dir,
      artifact_type=tfx.types.standard_artifacts.Schema).with_id(
          'schema_importer') 

In [None]:
# Execute the schema Importer inside the interactive context.
context.run(schema_importer)

In [45]:
# Display the imported schema ('result' output of the Importer) to confirm it matches the edited one.
context.show(schema_importer.outputs['result'])

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'input',BYTES,required,single,-
'labels',INT,required,single,min: 0; max: 1
