# Object Detection - Online Prediction - Kubeflow Pipeline Test

In [2]:
!gcloud config list

[ai]
region = us-central1
[compute]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


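**Before launching the pipeline, the service account shown above must be granted the `storage.objects.get` and `storage.objects.create` permissions on the pipeline bucket. Otherwise, creating the pipeline job fails with:**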

*Failed to create pipeline job. Error: Service account `136021895401-compute@developer.gserviceaccount.com` does not have `[storage.objects.get, storage.objects.create]` IAM permission(s) to the bucket "qwiklabs-gcp-00-373ac55d0e0a". Please either copy the files to the Google Cloud Storage bucket owned by your project, or grant the required IAM permission(s) to the service account..*

In [3]:
import os

#!pip install --upgrade kfp
import kfp
from google.cloud import aiplatform
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google import experimental
from kfp.v2.google.client import AIPlatformClient

# Show the installed KFP SDK version.
print(f'Kubeflow pipelines version: {kfp.__version__}')

Kubeflow pipelines version: 1.7.1


In [4]:
# --- Google Cloud project settings ---
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'
BUCKET = 'qwiklabs-gcp-00-373ac55d0e0a'

# --- BigQuery names ---
# NOTE(review): the pipeline definition further down passes its own
# dataset/table names and does not reference these two constants.
BQ_DATASET = 'demo_dataset'
OBJDET_TABLE = 'objdet'

# --- Pipeline settings ---
# Passed as `pipeline_root` to the pipeline decorator and to the run request.
PIPELINE_ROOT = f"gs://{BUCKET}/labeled_patents/pipeline_root"

MODEL_DISPLAY_NAME = "labpat_model"
MACHINE_TYPE = "n1-standard-16"
REPLICA_COUNT = 1
# Pre-built containers:
# https://cloud.google.com/vertex-ai/docs/training/pre-built-containers
DOCKER_IMAGE_URI_CREATE_BQDATASET = "us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-3:latest"

# Export the project ID as an environment variable so that `!` shell commands
# and subprocesses started from this kernel can read it.
os.environ["PROJECT"] = PROJECT

### Component 1: Create the BigQuery dataset and the object-detection results table

In [5]:
@component(packages_to_install=['google-cloud-bigquery'])
def create_bigquery_demo_dataset(project: str,
                                 dataset_name: str,
                                 table_name: str):
    """Create the BigQuery dataset and the object-detection results table.

    The step is idempotent: if the dataset or the table already exists it is
    left untouched, so the pipeline can be re-run without a Conflict error.

    Args:
        project: ID of the Google Cloud project that owns the dataset.
        dataset_name: Name of the dataset to create (location "US").
        table_name: Name of the results table to create inside the dataset.
    """
    # The import lives inside the function because it is installed via
    # `packages_to_install` for this component.
    from google.cloud import bigquery

    # Bind the client to the target project explicitly instead of relying on
    # whatever default project the runtime environment happens to expose.
    client = bigquery.Client(project=project)

    # Create the dataset (no-op when it already exists).
    dataset = bigquery.Dataset(f'{project}.{dataset_name}')
    dataset.location = "US"
    client.create_dataset(dataset, exists_ok=True, timeout=30)  # Make an API request.

    # One row per image: file name, predicted label, its confidence, and the
    # four bounding-box coordinates.
    objdet_schema = [
        bigquery.SchemaField('file_name',   'STRING', mode='NULLABLE'),
        bigquery.SchemaField('objdet_pred', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('objdet_confidence', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('objdet_xmin', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('objdet_xmax', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('objdet_ymin', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('objdet_ymax', 'FLOAT', mode='NULLABLE')]

    # Create the table (no-op when it already exists).
    table = bigquery.Table(f'{project}.{dataset_name}.{table_name}',
                           schema=objdet_schema)
    client.create_table(table, exists_ok=True)  # Make an API request.

### Component 2: Perform object detection

In [6]:
@component(packages_to_install=['google-cloud-bigquery', 'google-cloud-storage',  'google-cloud-aiplatform'])
def perform_object_detection(project: str,
                             region: str,
                             bucket: str,
                             img_blob: str,
                             objdet_endpoint: str,
                             dataset_name: str,
                             table_name: str):
    """Run online object detection on bucket images and store results in BigQuery.

    For every blob in `bucket` whose name matches `img_blob`, the image is
    downloaded, sent to the deployed endpoint, and the first detection of the
    response is inserted as one row into `project.dataset_name.table_name`.
    Images for which the endpoint returns no detection are skipped.

    Args:
        project: ID of the Google Cloud project (endpoint and BigQuery table).
        region: Region of the endpoint; also selects the regional API host.
        bucket: Name of the Cloud Storage bucket that holds the images.
        img_blob: Unix shell-style wildcard pattern for the blob names.
        objdet_endpoint: ID of the deployed object-detection endpoint.
        dataset_name: BigQuery dataset that contains the results table.
        table_name: BigQuery table that receives the predictions.
    """
    # IMPORTS
    # Kept inside the function: the third-party packages are installed for
    # this component via `packages_to_install`.
    import base64
    import os
    import re
    import tempfile
    from fnmatch import fnmatch

    from google.cloud import aiplatform
    from google.cloud import bigquery
    from google.cloud import storage
    from google.cloud.aiplatform.gapic.schema import predict

    #_____________________________________ AUXILIARY FUNCTIONS ______________________________
    def get_bucket_file_list(storage_client, bucket_name, fname_template='*'):
        '''!@brief Function that returns the list of files in a bucket.
        @param storage_client (storage.Client) Client used to list the blobs.
        @param bucket_name (string) Bucket name.
        @param fname_template (string) Template for filtering blob names
        that supports Unix shell-style wildcards. For more info:
        https://docs.python.org/3/library/fnmatch.html

        @return (list of strings) List of blob names in the bucket which
        fulfill the template structure. Names ending in "/" (folder
        placeholders) are excluded because they carry no image data.
        '''
        # Everything before the first wildcard character is a literal prefix,
        # so let the server filter on it instead of listing the whole bucket.
        literal_prefix = re.split(r'[*?\[]', fname_template, maxsplit=1)[0]

        # Note: Client.list_blobs requires at least package version 1.17.0.
        blobs = storage_client.list_blobs(bucket_name,
                                          prefix=literal_prefix or None)

        return [blob.name for blob in blobs
                if fnmatch(blob.name, fname_template)
                and not blob.name.endswith('/')]

    def predict_image_object_detection(prediction_client, endpoint_path, filename):
        '''!@brief Sends one local image file to the endpoint for online prediction.
        @param prediction_client (PredictionServiceClient) Client to use.
        @param endpoint_path (string) Full resource path of the endpoint.
        @param filename (string) Path of the local image file.

        @return (list of dicts) One dict per prediction in the response.
        '''
        with open(filename, "rb") as f:
            file_content = f.read()

        # The format of each instance should conform to the deployed model's prediction input schema.
        encoded_content = base64.b64encode(file_content).decode("utf-8")
        instance = predict.instance.ImageObjectDetectionPredictionInstance(
            content=encoded_content,
        ).to_value()
        parameters = predict.params.ImageObjectDetectionPredictionParams(
            confidence_threshold=0.5, max_predictions=5,
        ).to_value()
        response = prediction_client.predict(
            endpoint=endpoint_path, instances=[instance], parameters=parameters
        )
        return [dict(prediction) for prediction in response.predictions]

    #___________________________________________ MAIN _______________________________________

    # Clients are created once and reused for every image.
    bq_client = bigquery.Client(project=project)
    storage_client = storage.Client(project=project)

    # The AI Platform services require regional API endpoints, so the host is
    # derived from `region` rather than being fixed to a single region.
    prediction_client = aiplatform.gapic.PredictionServiceClient(
        client_options={"api_endpoint": f'{region}-aiplatform.googleapis.com'})
    endpoint_path = prediction_client.endpoint_path(
        project=project, location=region, endpoint=objdet_endpoint)

    table_id = f'{project}.{dataset_name}.{table_name}'

    # Read the images from the bucket that was passed in (a separate name is
    # used so the `bucket` argument is not shadowed).
    image_bucket = storage_client.bucket(bucket)

    files = get_bucket_file_list(storage_client,
                                 bucket_name=bucket,
                                 fname_template=img_blob)

    for file in files:
        # Downloading the file as a temporary file. mkstemp returns an open
        # descriptor, which is closed right away: only the path is needed.
        fd, path = tempfile.mkstemp(suffix='.png')
        os.close(fd)
        try:
            image_bucket.blob(file).download_to_filename(path)

            # Obtaining online prediction:
            preds = predict_image_object_detection(
                prediction_client, endpoint_path, path)
        finally:
            # Remove the temporary file even when download/prediction fails.
            os.remove(path)

        # Nothing detected: there is no first element to store.
        if not preds or not preds[0].get('displayNames'):
            print(f"No objects detected in {file}; skipping.")
            continue

        # Parsing prediction (only the first detection is kept). The bbox
        # values are read in the order xmin, xmax, ymin, ymax.
        top = preds[0]
        box = top['bboxes'][0]
        objdet_pred = top['displayNames'][0]
        objdet_confidence = top['confidences'][0]
        objdet_xmin, objdet_xmax = box[0], box[1]
        objdet_ymin, objdet_ymax = box[2], box[3]

        # Storing prediction into the BQ table:
        rows_to_insert = [
            {'file_name': f'{file}'.split('/')[-1],
             'objdet_pred': f'{objdet_pred}',
             'objdet_confidence': f'{objdet_confidence}',
             'objdet_xmin': f'{objdet_xmin}',
             'objdet_xmax': f'{objdet_xmax}',
             'objdet_ymin': f'{objdet_ymin}',
             'objdet_ymax': f'{objdet_ymax}'}
        ]

        errors = bq_client.insert_rows_json(table_id, rows_to_insert)  # Make an API request.
        if errors == []:
            print("New rows have been added.")
        else:
            print("Encountered errors while inserting rows: {}".format(errors))

### Creating the Pipeline: Component 1 + Component 2

In [7]:
@kfp.dsl.pipeline(name="labpat-pipeline-test2",
                  description='Creates the BigQuery results table and runs online object detection on the labeled-patents images.',
                  pipeline_root=PIPELINE_ROOT)
def pipeline():
    """Two-step pipeline: create the BigQuery table, then fill it with detections."""
    # Both steps must address the same dataset/table, so name them once.
    results_dataset = 'demo_dataset_pipeline'
    results_table = 'objdet_results'

    create_bqdataset_task = create_bigquery_demo_dataset(project=PROJECT,
                                                         dataset_name=results_dataset,
                                                         table_name=results_table)

    object_detection_task = perform_object_detection(project=PROJECT,
                                                     region=REGION,
                                                     bucket=BUCKET,
                                                     img_blob='labeled_patents/subsample_images/*',
                                                     objdet_endpoint='2074030773706424320',
                                                     dataset_name=results_dataset,
                                                     table_name=results_table)

    # No output of the first task is consumed by the second one, so the
    # execution order has to be declared explicitly.
    object_detection_task.after(create_bqdataset_task)
    
    
#     experimental.run_as_aiplatform_custom_job(
#         create_bqdataset_task,
#         display_name=f"labpat_pipeline-create_bq_dataset",
#         worker_pool_specs=[
#             {   "containerSpec": {
#                     "imageUri": f"{DOCKER_IMAGE_URI_CREATE_BQDATASET}",
#                 },
#                 "replica_count": f"{REPLICA_COUNT}",
#                 "machineSpec": {
#                     "machineType": f"{MACHINE_TYPE}",
#                 },                
#             }
#         ],
#     )
    
# To define a custom job, a worker_pool_spec is mandatory. Each worker_pool_spec
# in turn needs either a containerSpec or a pythonPackageSpec.
# https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec    


### Compiling the Pipeline

In [8]:
from kfp.v2 import compiler

# Make sure the output folder exists; `exist_ok=True` makes the cell safe to
# re-run and avoids a separate check-then-create step.
os.makedirs("vertex_pipelines", exist_ok=True)

# Compile the pipeline function into a JSON job spec.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="./vertex_pipelines/labeled_patents_pipeline2.json"
)

### Launching the Pipeline

In [9]:
# Client used to submit the compiled pipeline, bound to the project and region
# defined in the configuration cell.
# TODO: use the new Vertex AI.
api_client = AIPlatformClient(project_id=PROJECT, region=REGION)



In [10]:
# Submit the compiled job spec, using PIPELINE_ROOT as the pipeline root.
response = api_client.create_run_from_job_spec(
    './vertex_pipelines/labeled_patents_pipeline2.json',
    pipeline_root=PIPELINE_ROOT,
)