# Download Pipelines (0.2.5)

We start by downloading a specific release (0.2.5) of the Kubeflow Pipelines repository, which contains the component definitions used in the rest of this notebook.

In [None]:
!wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz

In [None]:
# Unpack the downloaded archive (-x extract, -v list each file, -f archive name).
# Later cells read component specs from the pipelines-0.2.5/ directory.
!tar -xvf 0.2.5.tar.gz

In [None]:
import kfp

In [None]:
# Load one component factory per pipeline step from the component.yaml specs in the
# extracted pipelines-0.2.5/ tree. The names on the left line up, in order, with the
# spec paths on the right.
(
    gcs_download_component,
    tfx_csv_gen,
    tfx_statistic_gen,
    tfx_schema_gen,
    tfx_example_validator,
) = [
    kfp.components.load_component_from_file(spec_path)
    for spec_path in (
        "pipelines-0.2.5/components/google-cloud/storage/download/component.yaml",
        "pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml",
        "pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml",
        "pipelines-0.2.5/components/tfx/SchemaGen/component.yaml",
        "pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml",
    )
]

In [None]:
# Minimal single-step pipeline: runs the GCS download component against a sample
# CSV location. Documented with comments rather than a docstring so the metadata
# passed to the decorator stays the only description of the pipeline.
@kfp.dsl.pipeline(name='DL', description='Sample DL pipeline')
def pipeline_with_dl():
    # The step is registered with the pipeline as a side effect of calling the factory.
    download_step = gcs_download_component(
        gcs_path="gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv"
    )

In [None]:
# Compile the download-only pipeline into the package file dl_pipeline.zip.
kfp.compiler.Compiler().compile(
    pipeline_func=pipeline_with_dl,
    package_path='dl_pipeline.zip',
)

In [None]:
# Kubeflow Pipelines API client, reused by every experiment/run cell below.
# Constructed with no arguments, so connection settings come from kfp.Client's defaults.
client = kfp.Client()

In [None]:
# Create the 'dl' experiment, then submit the compiled package as a run named 'dl'.
my_experiment = client.create_experiment(name='dl')
my_run = client.run_pipeline(
    experiment_id=my_experiment.id,
    job_name='dl',
    pipeline_package_path='dl_pipeline.zip',
)

In [None]:
# TFDV pipeline: download a CSV, convert it to TFX examples, compute statistics,
# infer a schema, and validate the statistics against that schema.
# Documented with comments rather than a docstring so the metadata passed to the
# decorator stays the only description of the pipeline.
@kfp.dsl.pipeline(name='TFDV', description='TF DV Pipeline')
def tfdv_pipeline():
    # Shell script executed by the busybox container: create the output directory
    # and download the CSV into it. Adjacent literals concatenate into one string.
    download_script = (
        'sleep 1;'
        'mkdir -p /tmp/data;'
        'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'
    )
    download_step = kfp.dsl.ContainerOp(
        name='download',
        image='busybox',
        command=['sh', '-c'],
        arguments=[download_script],
        file_outputs={'downloaded': '/tmp/data'},
    )

    # Chain the TFX components; each consumes the previous step's output.
    examples = tfx_csv_gen(input_base=download_step.output)
    statistics = tfx_statistic_gen(input_data=examples.output)
    schema_step = tfx_schema_gen(statistics.output)
    tfx_example_validator(
        stats=statistics.outputs['output'],
        schema=schema_step.outputs['output'],
    )

In [None]:
# Compile the TFDV pipeline into the package file tfdv_pipeline.zip.
kfp.compiler.Compiler().compile(
    pipeline_func=tfdv_pipeline,
    package_path='tfdv_pipeline.zip',
)

In [None]:
# Create the 'tfdv_pipeline' experiment, then submit the compiled package as a run named 'tfdv'.
my_experiment = client.create_experiment(name='tfdv_pipeline')
my_run = client.run_pipeline(
    experiment_id=my_experiment.id,
    job_name='tfdv',
    pipeline_package_path='tfdv_pipeline.zip',
)

In [None]:
!pip install tfx tensorflow-data-validation

In [None]:
import tensorflow_data_validation as tfdv

You can download the generated schema from the inputs/outputs of the SchemaGen step in your pipeline run.

For convenience, a copy of the schema is also included in the `config` folder.

In [None]:
# Load the text-format schema file from the config folder and render it in the notebook.
schema = tfdv.load_schema_text("config/schema_info")
tfdv.display_schema(schema)

In [None]:
# Component factory for the TFX Transform step, loaded from the same extracted
# pipelines-0.2.5/ tree as the other components.
tfx_transform = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/Transform/component.yaml")

In [None]:
import os
from minio import Minio

# MinIO connection settings, overridable through MINIO_URL / MINIO_KEY / MINIO_SECRET.
# The 'XXXXXX' default for the secret is presumably a placeholder; set MINIO_SECRET
# in the environment rather than editing the value here.
minio_endpoint = os.environ.get('MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000')
minio_key = os.environ.get('MINIO_KEY', 'minio')
minio_secret = os.environ.get('MINIO_SECRET', 'XXXXXX')
minioClient = Minio(minio_endpoint,
                    access_key=minio_key,
                    secret_key=minio_secret,
                    secure=False)  # plain HTTP, matching S3_USE_HTTPS='0' below

# Cell output is saved with the notebook, so the secret key is never printed.
print('Minio parameters : URL ', minio_endpoint, ' key ', minio_key, ' secret ', '<redacted>')

# Export the same credentials and endpoint under the AWS_* / S3_* variable names so
# that later cells can read them back from os.environ.
os.environ['AWS_ACCESS_KEY_ID'] = minio_key
os.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret
os.environ['AWS_REGION'] = 'us-west-1'
os.environ['S3_REGION'] = 'us-west-1'
os.environ['S3_ENDPOINT'] = minio_endpoint
os.environ['S3_USE_HTTPS'] = '0'
os.environ['S3_VERIFY_SSL'] = '0'

# Location of the Transform module file, passed to the Transform step of the TFX pipeline.
module_file="s3://data/test.py"

In [None]:
import yaml
import kfp
from kfp import components
from kfp import dsl
from kubernetes import client as k8s_client 

# TFX pipeline: the same download -> examples -> statistics -> schema -> validation
# chain as the TFDV pipeline, plus a Transform step driven by `module_file`.
# Every step also receives the S3/MinIO environment variables set in os.environ.
# Documented with comments rather than a docstring so the metadata passed to the
# decorator stays the only description of the pipeline.
@kfp.dsl.pipeline(
  name='TFX',
  description='TFX pipeline'
)
def tfx_pipeline():
    # Download the CSV into /tmp/data inside a busybox container.
    fetch = kfp.dsl.ContainerOp(
      name='download',
      image='busybox',
      command=['sh', '-c'],
      arguments=[
          'sleep 1;'
          'mkdir -p /tmp/data;'
          'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'],
      file_outputs={'downloaded': '/tmp/data'})
    records_example = tfx_csv_gen(input_base=fetch.output)
    stats = tfx_statistic_gen(input_data=records_example.output)
    schema_op = tfx_schema_gen(stats.output)
    tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])
    # The Transform step is registered by the call itself; its result is not used here.
    tfx_transform(
        input_data=records_example.output,
        schema=schema_op.outputs['output'],
        module_file=module_file) # Path to your TFT code on GCS/S3

    pipeline_conf = dsl.get_pipeline_conf()
    pipeline_conf.set_image_pull_secrets([k8s_client.V1ObjectReference(name="k8scc01covidacr-registry-connection")])

    def _env_injector(var_name):
        # Build an op transformer that copies one variable from os.environ onto the
        # op's container. A factory is used so each transformer captures its own
        # var_name instead of the loop variable's final value.
        def _add_env(cop):
            return cop.container.add_env_variable(
                k8s_client.V1EnvVar(name=var_name, value=os.environ[var_name]))
        return _add_env

    # NOTE(review): values are passed as literal env values, so the access key and
    # secret presumably end up in the compiled pipeline package - confirm this is
    # acceptable, or switch to a Kubernetes secret reference.
    for var_name in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_REGION',
        'S3_REGION',
        'S3_ENDPOINT',
        'S3_USE_HTTPS',
        'S3_VERIFY_SSL',
    ):
        pipeline_conf.add_op_transformer(_env_injector(var_name))

In [None]:
# Compile the TFX pipeline into the package file tfx_pipeline.zip.
kfp.compiler.Compiler().compile(
    pipeline_func=tfx_pipeline,
    package_path='tfx_pipeline.zip',
)

In [None]:
# Create the 'tfx_pipeline' experiment, then submit the compiled package as a run named 'tfx'.
my_experiment = client.create_experiment(name='tfx_pipeline')
my_run = client.run_pipeline(
    experiment_id=my_experiment.id,
    job_name='tfx',
    pipeline_package_path='tfx_pipeline.zip',
)