In [None]:
# It's recommended to use more recent versions of TFX and KFP for better Vertex AI support.
# TFX 1.14.0+ or 1.15.0+ are good choices.
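# Install TFX with the `kfp` extra so that a compatible KFP SDK is pulled in as well
# (a plain `tfx` install does not include the KFP SDK).
# Example (prefer %pip so the packages land in this kernel's environment):
# %pip install --upgrade "tfx[kfp]==1.15.0"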
# ! pip3 install google-cloud-aiplatform --upgrade # Good to have for Vertex AI interactions

# If you were using a specific pyparsing version due to older KFP/TFX,
# it might not be necessary with updated libraries.
# ! pip3 install pyparsing==2.4.2 # This might be removable or updated by TFX/KFP dependencies

In [1]:
import tensorflow as tf
# import tfx # TFX is primarily used via CLI in this notebook, but good to ensure it's importable
import os
# from google.cloud import aiplatform # Useful for programmatic interaction with Vertex AI

In [2]:
# Configuration for Google Cloud Vertex AI
GCP_REGION = 'us-central1'  # Your GCP region for Vertex AI Pipelines
ARTIFACT_STORE_URI = 'gs://text-analysis-323506-artifact-store'  # GCS path for pipeline artifacts
# ENDPOINT is not needed for Vertex AI; it's KFP-specific.
# ENDPOINT = 'https://1b7bb986cc987470-dot-us-central1.pipelines.googleusercontent.com'
CUSTOM_SERVICE_ACCOUNT = 'my-api-sa@text-analysis-323506.iam.gserviceaccount.com'  # SA for Vertex AI pipeline jobs

# Get PROJECT_ID from gcloud config
PROJECT_ID_BYTES = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID_BYTES[0]

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"GCP_REGION: {GCP_REGION}")
print(f"ARTIFACT_STORE_URI: {ARTIFACT_STORE_URI}")
print(f"CUSTOM_SERVICE_ACCOUNT: {CUSTOM_SERVICE_ACCOUNT}")

In [3]:
# Export the GCP settings as environment variables so that child processes
# started from this notebook (the `tfx` CLI calls below) inherit them.
# Per the original notes, config.py reads ARTIFACT_STORE_URI (for pipeline_root)
# and CUSTOM_SERVICE_ACCOUNT; config.py itself is not shown here.
os.environ.update({
    'GCP_REGION': GCP_REGION,
    'ARTIFACT_STORE_URI': ARTIFACT_STORE_URI,
    'CUSTOM_SERVICE_ACCOUNT': CUSTOM_SERVICE_ACCOUNT,
    'PROJECT_ID': PROJECT_ID,
})

# Ensure Google Cloud authentication is set up for your environment
# e.g., by running `gcloud auth application-default login` in your terminal

In [4]:
# Pipeline and model configuration.

# --- Identifiers and data location ---
PIPELINE_NAME = 'tfx_covertype_training_vertex'  # Vertex-specific name, distinct from the earlier pipeline
MODEL_NAME = 'tfx_covertype_classifier_vertex'
DATA_ROOT_URI = 'gs://text-analysis-323506/covertype'

# --- Serving settings (only relevant if the pipeline's Pusher targets AI Platform Serving) ---
# NOTE(review): confirm that '2.15' / '3.10' are values the serving target actually accepts.
RUNTIME_VERSION = '2.15'  # Intended to track the TensorFlow version paired with the TFX release
PYTHON_VERSION = '3.10'  # Python version for AI Platform Serving

# --- Feature flags ---
# USE_KFP_SA originated with Kubeflow Pipelines; on Vertex AI the job runs under
# an explicitly specified service account instead.
# NOTE(review): how runner.py/config.py interpret this flag is not visible here — verify.
USE_KFP_SA = True
ENABLE_TUNING = False

# --- Container image ---
# CRITICAL: Vertex AI runs the pipeline components inside this image. It must be
# built with a TFX version compatible with (1) the TFX version used for
# `tfx pipeline compile` and (2) Vertex AI Pipelines, and it must be pushed to a
# registry (GCR or Artifact Registry) before runs are created, unless the TFX CLI
# builds it for you.
CUSTOM_TFX_IMAGE = f'gcr.io/{PROJECT_ID}/{PIPELINE_NAME}_image:latest'

print(f"CUSTOM_TFX_IMAGE: {CUSTOM_TFX_IMAGE}")

In [5]:
# Publish the pipeline settings as environment variables for the pipeline
# scripts. The image is exported under the name KUBEFLOW_TFX_IMAGE because,
# per the original notes, that is the variable config.py looks up.
# Environment values must be strings, hence str() on the two boolean flags.
os.environ.update({
    'PIPELINE_NAME': PIPELINE_NAME,
    'MODEL_NAME': MODEL_NAME,
    'DATA_ROOT_URI': DATA_ROOT_URI,
    'KUBEFLOW_TFX_IMAGE': CUSTOM_TFX_IMAGE,
    'RUNTIME_VERSION': RUNTIME_VERSION,
    'PYTHON_VERSION': PYTHON_VERSION,
    'USE_KFP_SA': str(USE_KFP_SA),
    'ENABLE_TUNING': str(ENABLE_TUNING),
})

In [6]:
# Navigate to the directory containing your pipeline definition (runner.py, pipeline.py, etc.)
# The path is relative to the kernel's current working directory, so run this
# cell once per kernel session: a second run would look for `pipeline/pipeline`.
# The `tfx` commands below pass `runner.py` as a relative path and rely on this.
%cd pipeline

/home/jupyter/TFX-Pipelines-on-Google-Cloud/pipeline


In [7]:
# Compile the TFX pipeline for Vertex AI
# The --engine vertex flag tells the TFX CLI to prepare the pipeline for Vertex AI.
# NOTE(review): runner.py is not shown here. For Vertex AI the TFX runner is
# KubeflowV2DagRunner (not the KFP v1 KubeflowDagRunner) — confirm runner.py uses it.
! tfx pipeline compile --engine vertex --pipeline_path runner.py
# The output is expected to be a pipeline definition .json file
# (e.g., tfx_covertype_training_vertex.json) — the exact name depends on runner.py.

CLI
Compiling pipeline
running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying pipeline.py -> build/lib
copying runner.py -> build/lib
copying preprocessing.py -> build/lib
copying features.py -> build/lib
copying config.py -> build/lib
copying model.py -> build/lib
installing to /tmp/tmpczv6m_gz
running install
running install_lib
copying build/lib/preprocessing.py -> /tmp/tmpczv6m_gz
copying build/lib/features.py -> /tmp/tmpczv6m_gz
copying build/lib/pipeline.py -> /tmp/tmpczv6m_gz
copying build/lib/config.py -> /tmp/tmpczv6m_gz
copying build/lib/model.py -> /tmp/tmpczv6m_gz
copying build/lib/runner.py -> /tmp/tmpczv6m_gz
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Tran

In [9]:
# Create the pipeline in Vertex AI Pipelines
# TFX CLI will use PROJECT_ID and GCP_REGION from the environment or gcloud config.
# The compiled JSON file (e.g., {PIPELINE_NAME}.json) is used here.
# Ensure your CUSTOM_SERVICE_ACCOUNT has "Vertex AI User" and "Storage Object Admin" roles.
# Also, enable "Cloud AI Platform API" if not already enabled.
! tfx pipeline create --pipeline_path=runner.py --engine=vertex \
--build_image  # Optional: If you want TFX to help build and push the CUSTOM_TFX_IMAGE
# This requires Docker to be installed and configured.
# If you build your image manually, omit this flag.

# Note: If you don't use --build_image, ensure CUSTOM_TFX_IMAGE is already built and pushed.
# The `runner.py` and `config.py` should point to this pre-built image.

CLI
Creating pipeline
Detected Kubeflow.
Use --engine flag if you intend to use a different orchestrator.
running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying pipeline.py -> build/lib
copying runner.py -> build/lib
copying preprocessing.py -> build/lib
copying features.py -> build/lib
copying config.py -> build/lib
copying model.py -> build/lib
installing to /tmp/tmpv3faz_ji
running install
running install_lib
copying build/lib/preprocessing.py -> /tmp/tmpv3faz_ji
copying build/lib/features.py -> /tmp/tmpv3faz_ji
copying build/lib/pipeline.py -> /tmp/tmpv3faz_ji
copying build/lib/config.py -> /tmp/tmpv3faz_ji
copying build/lib/model.py -> /tmp/tmpv3faz_ji
copying build/lib/runner.py -> /tmp/tmpv3faz_ji
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_u

#### Pipeline runs can be created either through the front-end UI or through the `tfx` command

In [12]:
# Create a run of the pipeline on Vertex AI
# The TFX CLI interacts with Vertex AI using your GCP credentials.
! tfx run create --pipeline_name={PIPELINE_NAME} --engine=vertex

CLI
Creating a run for pipeline: tfx_covertype_training
Detected Kubeflow.
Use --engine flag if you intend to use a different orchestrator.
Run created for pipeline: tfx_covertype_training
| pipeline_name          | run_id                               | status | created_at                | link                                                                                                                         |
| tfx_covertype_training | 14058537-8361-4503-a5d7-4985dfabbbc7 | None   | 2021-12-18T18:02:05+00:00 | https://1b7bb986cc987470-dot-us-central1.pipelines.googleusercontent.com/#/runs/details/14058537-8361-4503-a5d7-4985dfabbbc7 |



In [15]:
# List runs for the pipeline on Vertex AI
# NOTE(review): confirm that the installed TFX CLI supports `run list` for the
# vertex engine; if it does not, inspect runs in the Vertex AI Pipelines console.
! tfx run list --pipeline_name {PIPELINE_NAME} --engine=vertex

CLI
Listing all runs of pipeline: tfx_covertype_training
Detected Kubeflow.
Use --engine flag if you intend to use a different orchestrator.
| pipeline_name          | run_id                               | status  | created_at                | link                                                                                                                         |
| tfx_covertype_training | 1e40d136-8acf-4e80-8c46-3997ac1ee13f | Running | 2021-12-18T17:42:11+00:00 | https://1b7bb986cc987470-dot-us-central1.pipelines.googleusercontent.com/#/runs/details/1e40d136-8acf-4e80-8c46-3997ac1ee13f |
+------------------------+--------------------------------------+---------+---------------------------+------------------------------------------------------------------------------------------------------------------------------+
| tfx_covertype_training | 14058537-8361-4503-a5d7-4985dfabbbc7 | Failed  | 2021-12-18T18:02:05+00:00 | https://1b7bb986cc987470-dot-us-central1.pipelines.googleuserc

In [14]:
# ID of the run whose status is queried below.
# NOTE(review): this value is hard-coded and matches a run shown in the recorded
# `tfx run list` output for the older `tfx_covertype_training` pipeline; replace
# it with a run ID belonging to the current PIPELINE_NAME before executing.
RUN_ID='1e40d136-8acf-4e80-8c46-3997ac1ee13f'

# Query the status of that single run.
# NOTE(review): confirm that `run status` is supported for the vertex engine.
! tfx run status --pipeline_name {PIPELINE_NAME} --run_id {RUN_ID} --engine=vertex

CLI
Retrieving run status.
Detected Kubeflow.
Use --engine flag if you intend to use a different orchestrator.
| pipeline_name          | run_id                               | status  | created_at                | link                                                                                                                         |
| tfx_covertype_training | 1e40d136-8acf-4e80-8c46-3997ac1ee13f | Running | 2021-12-18T17:42:11+00:00 | https://1b7bb986cc987470-dot-us-central1.pipelines.googleusercontent.com/#/runs/details/1e40d136-8acf-4e80-8c46-3997ac1ee13f |



#### Creating a pipeline run with tuning

In [16]:
# Switch the tuning flag on in the process environment so that `tfx` commands
# launched after this cell see ENABLE_TUNING=True. Only the environment variable
# changes; the Python-level ENABLE_TUNING constant is left as it was.
os.environ.update(ENABLE_TUNING=str(True))
print(f"ENABLE_TUNING set to: {os.environ['ENABLE_TUNING']}")

In [17]:
# Re-compile the pipeline for Vertex AI after ENABLE_TUNING was set to 'True'
# in the environment, so the regenerated pipeline definition reflects the flag.
# NOTE(review): this assumes the pipeline code (config.py) reads ENABLE_TUNING
# from the environment at compile time — verify, as config.py is not shown here.
! tfx pipeline compile --engine vertex --pipeline_path runner.py

CLI
Compiling pipeline
running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying pipeline.py -> build/lib
copying runner.py -> build/lib
copying preprocessing.py -> build/lib
copying features.py -> build/lib
copying config.py -> build/lib
copying model.py -> build/lib
installing to /tmp/tmpc7u67ar8
running install
running install_lib
copying build/lib/preprocessing.py -> /tmp/tmpc7u67ar8
copying build/lib/features.py -> /tmp/tmpc7u67ar8
copying build/lib/pipeline.py -> /tmp/tmpc7u67ar8
copying build/lib/config.py -> /tmp/tmpc7u67ar8
copying build/lib/model.py -> /tmp/tmpc7u67ar8
copying build/lib/runner.py -> /tmp/tmpc7u67ar8
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Tran

In [18]:
# Update the existing pipeline in Vertex AI with the new definition
! tfx pipeline update --pipeline_path runner.py --engine=vertex \
--build_image  # Again, optional, depending on your image build strategy.

CLI
Updating pipeline
Detected Kubeflow.
Use --engine flag if you intend to use a different orchestrator.
running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying pipeline.py -> build/lib
copying runner.py -> build/lib
copying preprocessing.py -> build/lib
copying features.py -> build/lib
copying config.py -> build/lib
copying model.py -> build/lib
installing to /tmp/tmpj8gpear9
running install
running install_lib
copying build/lib/preprocessing.py -> /tmp/tmpj8gpear9
copying build/lib/features.py -> /tmp/tmpj8gpear9
copying build/lib/pipeline.py -> /tmp/tmpj8gpear9
copying build/lib/config.py -> /tmp/tmpj8gpear9
copying build/lib/model.py -> /tmp/tmpj8gpear9
copying build/lib/runner.py -> /tmp/tmpj8gpear9
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_u

In [19]:
# Create a new run of the updated pipeline (with tuning enabled)
! tfx run create --pipeline_name={PIPELINE_NAME} --engine=vertex

CLI
Creating a run for pipeline: tfx_covertype_training
Detected Kubeflow.
Use --engine flag if you intend to use a different orchestrator.
Run created for pipeline: tfx_covertype_training
| pipeline_name          | run_id                               | status | created_at                | link                                                                                                                         |
| tfx_covertype_training | a6ffda91-4c63-4ea8-a1f4-34845d92c62d | None   | 2021-12-18T18:18:25+00:00 | https://1b7bb986cc987470-dot-us-central1.pipelines.googleusercontent.com/#/runs/details/a6ffda91-4c63-4ea8-a1f4-34845d92c62d |

