# Prepare PDFs for the later pipeline stages (object detection, image classification, text classification, NER)

- user provides
    - Google Cloud project (input)
    - bucket in GCS of pdfs (input)
    - BQ dataset to write prediction results (output)
        - BQ table: aggregated results (pdf_name, icn_pred, objdet_pred(coords), text_cn, ner1, ner2, ...., ner)
            created with JOIN on pdf_name
        - BQ table: icn_preds (pdf_name, icn_pred)    --> this table is made in icn_predict.ipynb
        - BQ table: objdet_pred (pdf_name, objdet_pred(coords)) --> this table is made in objdet_predict.ipynb
        - BQ table: text_cn (pdf_name, text_cn)    --> this table is made in text_cn_predict.ipynb
        - BQ table: ner (pdf_name, ner1, ner2, ...., ner)
        
- see utils.py for utils functions
        

Steps: 
 1. convert pdf to png and write to bucket (for ICN, ObjDet)
 2. do ocr on pdf and write to bucket 
 3. create dataset 
    

In [515]:
# Notebook-level configuration. The first assignment uses IPython's `!` shell
# capture, so this cell only runs inside IPython/Jupyter, not as plain Python.
PROJECT = !gcloud config get-value project # returns SList
PROJECT = PROJECT[0] # gets first element in list -> str
REGION = "us-central1"  
# NOTE(review): presumably the Vertex AI model ID of the text classification
# model (the same value is assigned to TCN_MODEL_NAME further down) - confirm.
MODEL_RESOURCE_NAME = "2393478483993952256"

import os
# Export the settings so `$PROJECT` / `$REGION` can be used in shell commands.
os.environ["PROJECT"] = PROJECT
os.environ["REGION"] = REGION

In [516]:
from google.cloud import bigquery
bq = bigquery.Client(project=PROJECT)

In [517]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [518]:
from google.cloud import storage
from google.cloud import vision
from google.cloud import aiplatform
import tempfile



from importlib import reload
from pathlib import Path
import pandas as pd
import numpy as np

# for jupyter only
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

In [519]:
from pdf2image import convert_from_path
import io
import base64
import cv2
from datetime import datetime
import time
import json

In [520]:
logging.info("test if logging works")

05:14:48 INFO:test if logging works


# Labeled Patents - Vertex AI Pipeline

## Importing Auxiliary Libraries

In [16]:
import os
from datetime import datetime

from typing import NamedTuple

#!pip install --upgrade kfp
import kfp
from google.cloud import aiplatform
from kfp.v2 import compiler
from kfp.v2.dsl import component, Input, Artifact
from kfp.v2.google import experimental
from kfp.v2.google.client import AIPlatformClient

print('Kubeflow pipelines version: {}'.format(kfp.__version__))

Kubeflow pipelines version: 1.7.1


## Setting Notebook Inputs

In [2]:
# Run identifier: local timestamp formatted as yymmdd_HHMMSS. It is embedded in
# the results dataset name and passed to the components as `uuid`.
UUID = datetime.now().strftime('%y%m%d_%H%M%S') #str
# Hard-coded project/region; these override the values derived from
# `gcloud config` in the setup cell at the top of the notebook.
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'

# The pipeline reads the PDFs from (and writes derived files to)
# gs://{BUCKET}/{PDF_BUCKET_PATH}.
BUCKET = 'patents_pipetest'
PDF_BUCKET_PATH = 'pdf'

# BigQuery dataset receiving the prediction results; one dataset per run.
RES_DATASET_NAME = 'docprocessing_' + UUID
RES_DATASET_ID = f'{PROJECT}.{RES_DATASET_NAME}'

# --- Text classification (TCN) ---
# NOTE(review): presumably a Vertex AI model ID - confirm.
TCN_MODEL_NAME = '2393478483993952256'
TCN_RESTABLE_NAME = f'{RES_DATASET_ID}.tcn'
# JSON list of bigquery.SchemaField keyword arguments (parsed by the
# table-creation component).
# NOTE(review): "score" is STRING here but FLOAT in the object detection
# schema - confirm which type is intended.
TCN_RESTABLE_SCHEMA = """
[
 {"name": "file", "field_type": "STRING", "mode": "REQUIRED", "description": "File path."},
 {"name": "subject", "field_type": "STRING", "mode": "REQUIRED", "description": "Predicted class."},
 {"name": "score", "field_type": "STRING", "mode": "REQUIRED", "description": "Confidence of the prediction."}
]
"""

# --- Image classification (ICN) ---
# NOTE(review): presumably a Vertex AI model ID - confirm.
ICN_MODEL_NAME = '8925034949820547072'
# No endpoint is configured for the image classification model yet.
ICN_ENDPT_NAME = ''
ICN_RESTABLE_NAME = f'{RES_DATASET_ID}.icn'
# Same layout as the TCN schema, with "label" instead of "subject".
ICN_RESTABLE_SCHEMA = """
[
 {"name":  "file", "field_type": "STRING", "mode": "REQUIRED", "description": "File path."},
 {"name": "label", "field_type": "STRING", "mode": "REQUIRED", "description": "Predicted class."},
 {"name": "score", "field_type": "STRING", "mode": "REQUIRED", "description": "Confidence of the prediction."}
]
"""

# --- Object detection (ODM) ---
# NOTE(review): presumably Vertex AI model / endpoint IDs - confirm.
ODM_MODEL_NAME = '3409814256151953408'
ODM_ENDPT_NAME = '2074030773706424320'
ODM_RESTABLE_NAME = f'{RES_DATASET_ID}.odm'
# BigQuery schema of the object detection results table, as a JSON list of
# bigquery.SchemaField keyword arguments. The bounding box is stored as
# (xmin, ymin) = top left corner and (xmax, ymax) = bottom right corner; the
# descriptions of "xmax" and "ymin" previously had their axes swapped.
ODM_RESTABLE_SCHEMA = """
[
 {"name": "file",  "field_type": "STRING", "mode": "REQUIRED", "description": "File path."},
 {"name": "label", "field_type": "STRING", "mode": "REQUIRED", "description": "Predicted class."},
 {"name": "score", "field_type":  "FLOAT", "mode": "REQUIRED", "description": "Confidence of the prediction."},
 {"name": "xmin",  "field_type":  "FLOAT", "mode": "REQUIRED", "description": "X coordinate of the top left corner."},
 {"name": "xmax",  "field_type":  "FLOAT", "mode": "REQUIRED", "description": "X coordinate of the bottom right corner."},
 {"name": "ymin",  "field_type":  "FLOAT", "mode": "REQUIRED", "description": "Y coordinate of the top left corner."},
 {"name": "ymax",  "field_type":  "FLOAT", "mode": "REQUIRED", "description": "Y coordinate of the bottom right corner."}
]
"""

# NOTE(review): the pipeline definition visible in this notebook builds its
# source/destination paths from BUCKET and PDF_BUCKET_PATH instead of these
# two variables - confirm whether they are still needed.
src_path = "gs://2021_08_16_tcn_dev"
dst_path = "gs://2021_08_16_tcn_dev"



# Pipeline identity, the GCS root for pipeline artifacts, and the local path of
# the compiled pipeline JSON.
PIPELINE_NAME = 'process-pdf-patents-nina'
PIPELINE_ROOT = f"gs://{BUCKET}/labeled_patents/pipeline_root"
LOCAL_PIPELINE_PATH = './vertex_pipelines'
LOCAL_PIPELINE_JSON = os.path.join(LOCAL_PIPELINE_PATH, 'labeled_patents_pipeline2.json')

# NOTE(review): these differ from RES_DATASET_NAME / ODM_RESTABLE_NAME defined
# earlier in this cell - confirm which pair is authoritative.
RESULTS_BQ_DATASET='demo_dataset'
RESULTS_OBJDET_TABLE='objdet'



# NOTE(review): presumably model deployment / custom job settings - confirm
# where they are consumed.
MODEL_DISPLAY_NAME=f"labpat_model"
MACHINE_TYPE="n1-standard-16"
REPLICA_COUNT=1
DOCKER_IMAGE_URI_CREATE_BQDATASET="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-3:latest"


# Export the core settings so they can be referenced as $VARS in `!` shell cells.
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET 
os.environ['PDF_BUCKET_PATH'] = PDF_BUCKET_PATH

**Copying some demo files into the Bucket**

In [525]:
# !gsutil -m cp gs://2021_08_16_tcn_dev/*.pdf gs://$BUCKET/$PDF_BUCKET_PATH

# Create new Docker image

In [526]:
# !gcloud config set account student-04-1e37ebc5f596@qwiklabs.net

Updated property [core/account].


In [527]:
# !gcloud auth login

In [528]:
# !cat base_image/Dockerfile

In [529]:
# BASE_IMAGE='gcr.io/{}/{}:{}'.format(PROJECT, IMAGE_NAME, TAG)
# print(BASE_IMAGE)

In [530]:
# IMAGE_NAME='pdf_to_png_image'
# TAG='latest'
# BASE_IMAGE='gcr.io/qwiklabs-gcp-00-373ac55d0e0a/pdf_to_png_image:latest'
# gcloud builds submit --timeout 15m --tag $BASE_IMAGE base_image

## Defining Vertex AI Components

### Component 1: Performing OCR on PDFs

In [3]:
@component(packages_to_install=['google-cloud-storage',  'google-cloud-vision'])
def perform_ocr_on_pdfs(src_path: str, 
                        dst_path: str,
                        uuid: str,
                        project: str):
    """Run Google Vision OCR on the PDFs under src_path and index the extracted text.

    Steps:
      1. Submit one asynchronous DOCUMENT_TEXT_DETECTION request per PDF found
         under ``src_path``; the Vision API writes its JSON output under
         ``dst_path`` (same relative blob name as the PDF).
      2. Turn every first-page OCR result (``*output-1-to-1.json``) under
         ``dst_path`` into a ``.txt`` blob stored next to it.
      3. Upload ``tcn_<uuid>.jsonl`` listing all ``.txt`` blobs under ``dst_path``.

    Args:
        src_path: GCS location of the PDFs, e.g. ``gs://bucket/pdf``.
        dst_path: GCS location for the OCR output, text files and JSONL file.
        uuid: Run identifier used in the JSONL file name.
        project: Google Cloud project used for the storage client.

    Raises:
        Exception: whatever the storage / Vision clients raise while the OCR
            requests are being prepared or submitted (logged, then re-raised).
    """

    # IMPORTS (a KFP component function must import everything it uses itself):
    import logging
    import traceback as tb
    import time
    import json
    from google.cloud import storage
    from google.cloud import vision

    # With batch_size=1 the Vision API writes one JSON file per page; only the
    # first page's output is converted to text.
    OCR_JSON_SUFFIX = "output-1-to-1.json"


    # AUXILIARY FUNCTIONS:
    def to_trace_str(e):
        """Format an exception including its traceback as a single string."""
        return ''.join(tb.format_exception(None, e, e.__traceback__))


    def dismantle_path(gcs_path):
        """Split a GCS path into (bucket_name, directory, filename).

        The last path component is treated as a filename when it contains a
        dot; the bucket name itself is never treated as a filename, and the
        directory never includes the bucket name.
        """
        path = gcs_path[len("gs://"):] if gcs_path.startswith("gs://") else gcs_path
        parts = [part for part in path.split("/") if part]
        bucket_name, rest = parts[0], parts[1:]
        filename = rest[-1] if rest and "." in rest[-1] else ""
        dir_parts = rest[:-1] if filename else rest
        return bucket_name, "/".join(dir_parts), filename


    def list_blobs_with_suffix(storage_client, bucket_name, directory, suffix):
        """Return the blobs below `directory` whose name ends with `suffix`.

        A prefix listing is used instead of a substring match on the blob
        name, so a directory called "pdf" no longer matches every "*.pdf"
        blob in the bucket.
        """
        prefix = f"{directory}/" if directory else None
        return [blob for blob in storage_client.list_blobs(bucket_name, prefix=prefix)
                if blob.name.endswith(suffix)]


    def ocr(src_path, dst_path, project):
        """Start asynchronous OCR for all PDFs under src_path.

        Args
            src_path: GCS location of the PDFs
            dst_path: GCS location where the Vision API writes its JSON output
            project: project used for the storage client

        Returns
            google.api_core.operation.Operation, or None when no PDF was found.
            To check if done use method .done()

        Link to documentation:
            https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig
            https://cloud.google.com/vision/docs/pdf
        """
        try:
            logging.info("started optical character recognition")

            src_bucket_name, src_directory, _ = dismantle_path(src_path)
            dst_bucket_name, dst_directory, _ = dismantle_path(dst_path)
            storage_client = storage.Client(project=project)

            logging.info(f"src_bucket_name {src_bucket_name}, src_directory {src_directory}")

            blob_list = list_blobs_with_suffix(storage_client, src_bucket_name, src_directory, ".pdf")
            logging.info(f"found {len(blob_list)} pdf files in bucket {src_bucket_name}")
            if not blob_list:
                return None

            client = vision.ImageAnnotatorClient()
            feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
            src_prefix = f"{src_directory}/" if src_directory else ""

            async_requests = []
            for b_idx, blob in enumerate(blob_list, start=1):
                # Keep the blob's path relative to the source directory so the
                # output lands below dst_directory. For identical src/dst paths
                # this is the PDF's own name, e.g. gs://bucket/pdf/us_076.pdf.
                relative_name = blob.name[len(src_prefix):]
                dst_name = f"{dst_directory}/{relative_name}" if dst_directory else relative_name
                gcs_source_uri = f"gs://{src_bucket_name}/{blob.name}"
                gcs_destination_uri = f"gs://{dst_bucket_name}/{dst_name}"

                logging.info(f"start ocr with gcs_source_uri {gcs_source_uri}, and gcs_destination_uri {gcs_destination_uri}")

                # source
                gcs_source = vision.GcsSource(uri=gcs_source_uri)
                input_config = vision.InputConfig(gcs_source=gcs_source, mime_type='application/pdf')

                # destination (the uri acts as a name prefix for the output files)
                gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
                output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size=1)

                logging.info(f"started ocr for {b_idx} of {len(blob_list)} files")
                async_requests.append(
                    vision.AsyncAnnotateFileRequest(
                        features=[feature],
                        input_config=input_config,
                        output_config=output_config
                    )
                )

            return client.async_batch_annotate_files(requests=async_requests)

        except Exception as e:
            # Re-raise: without an operation the component cannot continue, and
            # returning None here used to surface as an unrelated AttributeError.
            logging.error(f"Error in method ocr: {to_trace_str(e)}")
            raise


    def create_text_files(gcs_path, project):
        """Write a .txt blob next to every first-page OCR JSON under gcs_path."""
        try:
            # init bucket
            bucket_name, directory, _ = dismantle_path(gcs_path)
            storage_client = storage.Client(project=project)
            bucket = storage_client.bucket(bucket_name)
            blob_list = list_blobs_with_suffix(storage_client, bucket_name, directory, OCR_JSON_SUFFIX)

            for b_idx, blob in enumerate(blob_list, start=1):
                logging.info(f"creating {b_idx} of {len(blob_list)} text files")
                response = json.loads(blob.download_as_string())
                # A page without recognised text has no 'fullTextAnnotation';
                # store an empty text instead of aborting the remaining files.
                responses = response.get('responses') or [{}]
                text = responses[0].get('fullTextAnnotation', {}).get('text', '')
                txt_path = blob.name[:-len(OCR_JSON_SUFFIX)] + ".txt"
                bucket.blob(txt_path).upload_from_string(text)
                logging.info(f"uploaded {b_idx} of {len(blob_list)} text files. Path: gs://{bucket_name}/{txt_path}")

            logging.info("finished creating text files")

        except Exception as e:
            logging.error(f"Error in method create_text_files: {to_trace_str(e)}")


    def get_extension(mime_type):
        """Map a mime type to a file extension (defaults to .txt)."""
        if mime_type == "image/png":
            return ".png"
        return ".txt"


    def create_jsonl(gcs_path, mime_type, filename, project):
        """create jsonl out of files in bucket

        Args
            gcs_path (str): bucket or dir where files are located
            mime_type (str): the files mimetype
            filename (str): the jsonl filename
            project (str): project used for the storage client

        Returns
            full gs:// path of the uploaded jsonl, or None on failure
        """
        try:
            bucket_name, directory, _ = dismantle_path(gcs_path)
            storage_client = storage.Client(project=project)
            bucket = storage_client.bucket(bucket_name)
            extension = get_extension(mime_type)

            blob_list = list_blobs_with_suffix(storage_client, bucket_name, directory, extension)

            # blob.name already contains the directory, so the URI is built
            # from the bucket name only (joining it onto gcs_path duplicated
            # the directory).
            jsonl_content = "".join(
                json.dumps({"content": f"gs://{bucket_name}/{blob.name}", "mimeType": mime_type}) + "\n"
                for blob in blob_list
            )

            file_path = f"{directory}/{filename}" if directory else filename
            bucket.blob(file_path).upload_from_string(jsonl_content)
            full_path = f"gs://{bucket_name}/{file_path}"
            logging.info(f"uploaded jsonl {file_path} to bucket {bucket_name}. Full path: {full_path}")
            return full_path

        except Exception as e:
            logging.error(f"Error in jsonl creation: {to_trace_str(e)}")
            return None


    # PIPELINE COMPONENT MAIN CODE:
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the processing of pdfs with the OCR functionality of Google Vision API.")

    # save everything in the same bucket
    jsonl_filename_tcn = f"tcn_{uuid}.jsonl"

    # create ocr
    ocr_operation = ocr(src_path, dst_path, project)

    if ocr_operation is None:
        logging.warning("no pdf files found, nothing to wait for")
    else:
        while not ocr_operation.done():
            logging.info("wait for ocr to finish")
            time.sleep(5)
        # done() is also True for a failed operation; make the failure visible.
        ocr_error = ocr_operation.exception()
        if ocr_error is not None:
            logging.error(f"ocr operation failed: {ocr_error}")

    # The OCR output is written below dst_path, so that is where it is read from.
    create_text_files(dst_path, project)
    create_jsonl(gcs_path=dst_path, mime_type="text/plain", filename=jsonl_filename_tcn, project=project)

    # TODO: return path where jsonl with .txt files is saved
    
    # return path where jsonl with .txt files is saved


### Component 2: PDF to PNG conversion

In [24]:
@component(
    base_image="gcr.io/qwiklabs-gcp-00-373ac55d0e0a/pdf_to_png_image:latest",
    packages_to_install=['google-cloud-storage', 'pdf2image', 'opencv-contrib-python', 'numpy']
)
def transform_pdfs_into_png(src_path:str, 
                            dst_path:str,
                            uuid: str,
                            project:str):
    """Render the first page of every PDF under src_path as PNG and index the images.

    Each PNG is uploaded to ``dst_path`` as ``<pdf file name>.png``; afterwards
    ``icn_<uuid>.jsonl`` listing all ``.png`` blobs under ``dst_path`` is
    uploaded to the same location.

    Args:
        src_path: GCS location of the PDFs, e.g. ``gs://bucket/pdf``.
        dst_path: GCS location for the PNGs and the JSONL file.
        uuid: Run identifier used in the JSONL file name.
        project: Google Cloud project used for the storage client.
    """

    # IMPORTS (a KFP component function must import everything it uses itself).
    # `json` was missing before, which made create_jsonl fail on every run.
    import json
    import logging
    import os
    import tempfile
    import traceback as tb

    import cv2
    import numpy as np
    from pdf2image import convert_from_path
    from google.cloud import storage


    def to_trace_str(e):
        """Format an exception including its traceback as a single string."""
        return ''.join(tb.format_exception(None, e, e.__traceback__))

    def get_extension(mime_type):
        """Map a mime type to a file extension (defaults to .txt)."""
        if mime_type == "image/png":
            return ".png"
        return ".txt"

    def dismantle_path(gcs_path):
        """Split a GCS path into (bucket_name, directory, filename).

        The last path component is treated as a filename when it contains a
        dot; the bucket name itself is never treated as a filename, and the
        directory never includes the bucket name.
        """
        path = gcs_path[len("gs://"):] if gcs_path.startswith("gs://") else gcs_path
        parts = [part for part in path.split("/") if part]
        bucket_name, rest = parts[0], parts[1:]
        filename = rest[-1] if rest and "." in rest[-1] else ""
        dir_parts = rest[:-1] if filename else rest
        return bucket_name, "/".join(dir_parts), filename

    def list_blobs_with_suffix(storage_client, bucket_name, directory, suffix):
        """Return the blobs below `directory` whose name ends with `suffix`.

        A prefix listing is used instead of a substring match on the blob
        name, so a directory called "pdf" no longer matches every "*.pdf"
        blob in the bucket.
        """
        prefix = f"{directory}/" if directory else None
        return [blob for blob in storage_client.list_blobs(bucket_name, prefix=prefix)
                if blob.name.endswith(suffix)]

    def convert_pdf_to_png(src_path, dst_path, project):
        """Convert the first page of each PDF under src_path to PNG in dst_path.

        Returns
            True when the whole batch was processed, False when an exception
            aborted it (the exception is logged, not raised).
        """
        try:
            logging.info("started conversion pdf -> png")

            storage_client = storage.Client(project=project)

            src_bucket_name, src_directory, _ = dismantle_path(src_path)
            dst_bucket_name, dst_directory, _ = dismantle_path(dst_path)
            dst_bucket = storage_client.bucket(dst_bucket_name)

            blob_list = list_blobs_with_suffix(storage_client, src_bucket_name, src_directory, ".pdf")
            logging.info(f"found {len(blob_list)} pdfs in bucket  {src_bucket_name}")

            # One scratch directory for the whole batch; it is removed on exit,
            # so no temp files or file descriptors are left behind.
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_pdf = os.path.join(tmp_dir, "input.pdf")

                for b_idx, blob in enumerate(blob_list, start=1):
                    blob.download_to_filename(tmp_pdf)
                    logging.info(f"downloaded {b_idx} of {len(blob_list)} files")

                    # Only the first page is going to be analyzed, so only that
                    # page is rendered.
                    pages = convert_from_path(tmp_pdf, first_page=1, last_page=1)
                    if not pages:
                        logging.warning(f"no page could be rendered for {blob.name}, skipping")
                        continue
                    logging.info(f"converted {b_idx} of {len(blob_list)} images")

                    # pdf2image yields RGB PIL images while OpenCV encodes
                    # arrays as BGR; convert so the colours are not swapped.
                    image = cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)
                    is_success, im_buf_arr = cv2.imencode(".png", image)
                    if not is_success:
                        logging.error(f"png encoding failed for {blob.name}, skipping")
                        continue

                    filename = os.path.join(dst_directory, os.path.basename(blob.name)+".png")
                    dst_bucket.blob(filename).upload_from_string(im_buf_arr.tobytes(),
                                                                 content_type="image/png")
                    logging.info(f"uploading png {b_idx} of {len(blob_list)} to gs://{dst_bucket_name}/{filename}")

            return True

        except Exception as e:
            logging.error(f"Error in method convert_pdf_to_png: {to_trace_str(e)}")
            return False

    def create_jsonl(gcs_path, mime_type, filename, project):
        """create jsonl out of files in bucket

        Args
            gcs_path (str): bucket or dir where files are located
            mime_type (str): the files mimetype
            filename (str): the jsonl filename
            project (str): project used for the storage client

        Returns
            full gs:// path of the uploaded jsonl, or None on failure
        """
        try:
            bucket_name, directory, _ = dismantle_path(gcs_path)
            storage_client = storage.Client(project=project)
            bucket = storage_client.bucket(bucket_name)
            extension = get_extension(mime_type)

            blob_list = list_blobs_with_suffix(storage_client, bucket_name, directory, extension)

            # blob.name already contains the directory, so the URI is built
            # from the bucket name only (joining it onto gcs_path duplicated
            # the directory).
            jsonl_content = "".join(
                json.dumps({"content": f"gs://{bucket_name}/{blob.name}", "mimeType": mime_type}) + "\n"
                for blob in blob_list
            )

            file_path = f"{directory}/{filename}" if directory else filename
            bucket.blob(file_path).upload_from_string(jsonl_content)
            full_path = f"gs://{bucket_name}/{file_path}"
            logging.info(f"uploaded jsonl {file_path} to bucket {bucket_name}. Full path: {full_path}")
            return full_path

        except Exception as e:
            logging.error(f"Error in jsonl creation: {to_trace_str(e)}")
            return None


    # Main
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the processing of pdfs to png.")

    jsonl_filename_icn = f"icn_{uuid}.jsonl"

    convert_pdf_to_png(src_path, dst_path, project)
    create_jsonl(gcs_path=dst_path, mime_type="image/png", filename=jsonl_filename_icn, project=project)

### Component 3: Creating a BigQuery dataset to save results

In [25]:
@component(packages_to_install=['google-cloud-bigquery'])
def create_bq_results_dataset(project: str, 
                              dataset_id: str):
    """Make sure the BigQuery dataset that stores the analysis results exists.

    The dataset is looked up first and only created (location "US") when the
    lookup reports that it does not exist. Any other error, e.g. missing
    permissions, is raised instead of being mistaken for "not found".

    Args:
        project: Google Cloud project used for the BigQuery client.
        dataset_id: Fully qualified dataset ID, ``<project>.<dataset>``.
    """
    import logging
    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the creation of a BigQuery dataset to store analyses results.")

    bq = bigquery.Client(project=project)
    try:
        bq.get_dataset(dataset_id)  # Make an API request.
        logging.info("Dataset {} already exists".format(dataset_id))
    except NotFound:
        logging.info("Dataset {} is not found".format(dataset_id))
        dataset = bigquery.Dataset(dataset_id)
        # The location has to be set before the create request is sent.
        dataset.location = "US"
        dataset = bq.create_dataset(dataset, timeout=30)  # Make an API request.
        logging.info("Created dataset {}.{}".format(bq.project, dataset.dataset_id))

    # Only reached when the dataset exists or was created successfully.
    logging.info(f"Finished creating or loading dataset {dataset_id}")

### Component 4.1: Creating text classification results table

In [26]:
@component(packages_to_install=['google-cloud-bigquery'])
def create_text_class_results_table(project:str, 
                                    dataset_id:str, 
                                    table_id:str, 
                                    schema:str):
    """Create the BigQuery table that stores the text classification results.

    The dataset is created first (location "US") when it does not exist yet.
    Creating the table is idempotent: an already existing table is kept.

    Args:
        project: Google Cloud project used for the BigQuery client.
        dataset_id: Fully qualified dataset ID, ``<project>.<dataset>``.
        table_id: Fully qualified table ID, ``<project>.<dataset>.<table>``.
        schema: JSON list of ``bigquery.SchemaField`` keyword arguments
            (a Python literal list of dicts is accepted as well).
    """
    import ast
    import json
    import logging
    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the creation of a BQ table to store text classification results.")

    bq = bigquery.Client(project=project)

    # Only a missing dataset triggers creation; other errors (e.g. missing
    # permissions) propagate, and the table is not attempted after a failure.
    try:
        bq.get_dataset(dataset_id)  # Make an API request.
        logging.info("Dataset {} already exists".format(dataset_id))
    except NotFound:
        logging.info("Dataset {} is not found".format(dataset_id))
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        dataset = bq.create_dataset(dataset, timeout=30)  # Make an API request.
        logging.info("Created dataset {}.{}".format(bq.project, dataset.dataset_id))

    # The schema is documented as JSON; fall back to a Python literal so that
    # schemas written with single quotes keep working.
    try:
        field_specs = json.loads(schema)
    except json.JSONDecodeError:
        field_specs = ast.literal_eval(schema)

    # create table (exists_ok keeps a re-run of the step from failing)
    fields = [bigquery.SchemaField(**spec) for spec in field_specs]
    table = bigquery.Table(table_id, schema=fields)
    table = bq.create_table(table, exists_ok=True)
    logging.info(f"Created table {table_id}")
    

### Component 4.2: Performing text classification

In [27]:
@component()
def text_class_predict():
    # Placeholder for the text classification prediction step: it takes no
    # inputs and does nothing yet.
    pass

### Component 4.3: Storing text classification results

In [28]:
@component()
def store_text_class_results():
    # Placeholder for the step that stores the text classification results:
    # it takes no inputs and does nothing yet.
    pass

### Component 5.1: Creating image classification results table

In [29]:
@component(packages_to_install=['google-cloud-bigquery'])
def create_img_class_results_table(project:str, 
                                    dataset_id:str, 
                                    table_id:str, 
                                    schema:str):
    """Create the BigQuery table that stores the image classification results.

    The dataset is created first (location "US") when it does not exist yet.
    Creating the table is idempotent: an already existing table is kept.

    Args:
        project: Google Cloud project used for the BigQuery client.
        dataset_id: Fully qualified dataset ID, ``<project>.<dataset>``.
        table_id: Fully qualified table ID, ``<project>.<dataset>.<table>``.
        schema: JSON list of ``bigquery.SchemaField`` keyword arguments
            (a Python literal list of dicts is accepted as well).
    """
    import ast
    import json
    import logging
    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the creation of a BQ table to store image classification results.")

    bq = bigquery.Client(project=project)

    # Only a missing dataset triggers creation; other errors (e.g. missing
    # permissions) propagate, and the table is not attempted after a failure.
    try:
        bq.get_dataset(dataset_id)  # Make an API request.
        logging.info("Dataset {} already exists".format(dataset_id))
    except NotFound:
        logging.info("Dataset {} is not found".format(dataset_id))
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        dataset = bq.create_dataset(dataset, timeout=30)  # Make an API request.
        logging.info("Created dataset {}.{}".format(bq.project, dataset.dataset_id))

    # The schema is documented as JSON; fall back to a Python literal so that
    # schemas written with single quotes keep working.
    try:
        field_specs = json.loads(schema)
    except json.JSONDecodeError:
        field_specs = ast.literal_eval(schema)

    # create table (exists_ok keeps a re-run of the step from failing)
    fields = [bigquery.SchemaField(**spec) for spec in field_specs]
    table = bigquery.Table(table_id, schema=fields)
    table = bq.create_table(table, exists_ok=True)
    logging.info(f"Created table {table_id}")


### Component 5.2: Performing image classification

In [30]:
@component()
def img_class_predict():
    # Placeholder for the image classification prediction step: it takes no
    # inputs and does nothing yet.
    pass

### Component 5.3: Storing image classification results

In [31]:
@component()
def store_img_class_results():
    # Placeholder for the step that stores the image classification results:
    # it takes no inputs and does nothing yet.
    pass

### Component 6.1: Creating object detection results table

In [32]:
@component(packages_to_install=['google-cloud-bigquery'])
def create_obj_detection_results_table(project:str, 
                                       dataset_id:str, 
                                       table_id:str, 
                                       schema:str):
    """Create the BigQuery table that stores the object detection results.

    The dataset is created first (location "US") when it does not exist yet.
    Creating the table is idempotent: an already existing table is kept.

    Args:
        project: Google Cloud project used for the BigQuery client.
        dataset_id: Fully qualified dataset ID, ``<project>.<dataset>``.
        table_id: Fully qualified table ID, ``<project>.<dataset>.<table>``.
        schema: JSON list of ``bigquery.SchemaField`` keyword arguments
            (a Python literal list of dicts is accepted as well).
    """
    import ast
    import json
    import logging
    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the creation of a BQ table t store object detection results.")

    bq = bigquery.Client(project=project)

    # Only a missing dataset triggers creation; other errors (e.g. missing
    # permissions) propagate, and the table is not attempted after a failure.
    try:
        bq.get_dataset(dataset_id)  # Make an API request.
        logging.info("Dataset {} already exists".format(dataset_id))
    except NotFound:
        logging.info("Dataset {} is not found".format(dataset_id))
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        dataset = bq.create_dataset(dataset, timeout=30)  # Make an API request.
        logging.info("Created dataset {}.{}".format(bq.project, dataset.dataset_id))

    # The schema is documented as JSON; fall back to a Python literal so that
    # schemas written with single quotes keep working.
    try:
        field_specs = json.loads(schema)
    except json.JSONDecodeError:
        field_specs = ast.literal_eval(schema)

    # create table (exists_ok keeps a re-run of the step from failing)
    fields = [bigquery.SchemaField(**spec) for spec in field_specs]
    table = bigquery.Table(table_id, schema=fields)
    table = bq.create_table(table, exists_ok=True)
    logging.info(f"Created table {table_id}")
    

### Component 6.2: Performing object detection

In [33]:
@component(packages_to_install=['google-cloud-bigquery', 'google-cloud-storage',  'google-cloud-aiplatform'])
def obj_detection_predict(project: str,
                          region: str,
                          bucket_name: str,
                          img_blob: str,
                          objdet_endpoint: str) -> NamedTuple("Outputs", [("predictions", Artifact),]):
    """Request online object detection predictions for images stored in a bucket.

    Every blob in ``bucket_name`` whose name matches ``img_blob + '*'`` is
    downloaded and sent to the Vertex AI endpoint. The first detection of each
    image is kept as one result row.

    Args:
        project: Google Cloud project that owns the endpoint.
        region: Region of the endpoint; also selects the regional API host.
        bucket_name: Bucket containing the images.
        img_blob: Blob name prefix (Unix shell-style wildcards allowed).
        objdet_endpoint: ID of the Vertex AI endpoint serving the model.

    Returns:
        Named tuple ``Outputs`` whose ``predictions`` field is a list of dicts
        with the keys file, label, score, xmin, xmax, ymin and ymax (all values
        are strings). Images without any detection are skipped.
    """

    # IMPORTS (a KFP component function must import everything it uses itself):
    import os
    import tempfile
    import logging
    from collections import namedtuple
    from google.cloud import storage
    from google.cloud import aiplatform
    from fnmatch import fnmatch
    import base64
    from google.cloud.aiplatform.gapic.schema import predict


    # AUXILIARY FUNCTIONS
    def get_bucket_file_list(storage_client, bucket_name, fname_template='*'):
        '''!@brief Function that returns the list of files in a bucket.
        @param storage_client (storage.Client) Client used for the listing.
        @param bucket_name (string) Bucket name.
        @param fname_template (string) Template for filtering blob names
        that supports Unix shell-style wildcards. For more info:
        https://docs.python.org/3/library/fnmatch.html

        @return (list of strings) List of blob names in the bucket which
        fulfil the template structure.
        '''
        return [blob.name for blob in storage_client.list_blobs(bucket_name)
                if fnmatch(blob.name, fname_template)]


    def predict_object_detection(client, endpoint, filename):
        """Send one local image to the endpoint and return its predictions as dicts."""
        with open(filename, "rb") as f:
            file_content = f.read()

        # The format of each instance should conform to the deployed model's
        # prediction input schema. The payload itself is deliberately not
        # logged: it is the complete image.
        encoded_content = base64.b64encode(file_content).decode("utf-8")
        instance = predict.instance.ImageObjectDetectionPredictionInstance(content=encoded_content).to_value()
        parameters = predict.params.ImageObjectDetectionPredictionParams(confidence_threshold=0.5, max_predictions=5).to_value()
        response = client.predict(endpoint=endpoint, instances=[instance], parameters=parameters)
        return [dict(prediction) for prediction in response.predictions]


    # MAIN BODY:
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the object detection task.")

    # The clients are created once and reused for all files.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # The AI Platform services require regional API endpoints, so the host is
    # derived from `region` rather than fixed to us-central1.
    prediction_client = aiplatform.gapic.PredictionServiceClient(
        client_options={"api_endpoint": f"{region}-aiplatform.googleapis.com"})
    endpoint = prediction_client.endpoint_path(project=project, location=region, endpoint=objdet_endpoint)

    files = get_bucket_file_list(storage_client, bucket_name=f'{bucket_name}',
                                 fname_template=img_blob+'*')
    logging.info(str(files))

    predictions = []
    # One scratch directory for all downloads; it is removed on exit, so no
    # temp files or file descriptors are left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, 'image.png')

        for file in files:
            # Downloading the file as a temporary file:
            bucket.blob(file).download_to_filename(local_path)

            # Obtaining online prediction:
            preds = predict_object_detection(prediction_client, endpoint, local_path)
            logging.info(str(preds))

            # The model may return no detection above the confidence threshold.
            if not preds or len(preds[0].get('displayNames', [])) == 0:
                logging.warning(f"no object detected in {file}, skipping")
                continue

            # Parsing prediction (only the first detection is kept):
            top = preds[0]
            bbox = top['bboxes'][0]
            objdet_pred = top['displayNames'][0]
            objdet_confidence = top['confidences'][0]
            objdet_xmin, objdet_xmax = bbox[0], bbox[1]
            objdet_ymin, objdet_ymax = bbox[2], bbox[3]

            # Collecting the prediction as one row for the BQ table:
            predictions.append(
                {'file': f'{file}'.split('/')[-1],
                 'label': f'{objdet_pred}',
                 'score': f'{objdet_confidence}',
                 'xmin': f'{objdet_xmin}',
                 'xmax': f'{objdet_xmax}',
                 'ymin': f'{objdet_ymin}',
                 'ymax': f'{objdet_ymax}'}
            )

            logging.info(str(predictions))

    logging.info(f"The object detection task has finished successfully .")

    # Creating the named tuple with the results:
    outputs = namedtuple('Outputs',
                         ['predictions'])

    return outputs(predictions)

### Component 6.3: Storing object detection results

In [34]:
@component(packages_to_install=['google-cloud-bigquery'])
def store_obj_detection_results(table_id: str,
                                preds: Input[Artifact]):
    """Insert the object detection predictions into a BigQuery table.

    Args:
        table_id: Fully qualified ID of the destination table.
        preds: Artifact whose file contains the Python-literal representation
            of a list of row dicts (one dict per prediction).
    """
    import logging
    import ast
    from google.cloud import bigquery

    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting the storage of the object detection results.")

    client = bigquery.Client()

    # Parsing the artifact:
    with open(preds.path, "r") as preds_file:
        contents = preds_file.read()
    logging.info(f"generic contents: {contents}")

    # literal_eval only accepts Python literals, so the file content is parsed
    # without being executed.
    predictions = ast.literal_eval(contents)

    if not predictions:
        logging.info("No predictions to store.")
        return

    # All rows go into a single streaming insert request.
    errors = client.insert_rows_json(table_id, list(predictions))
    if errors:
        logging.error("Encountered errors while inserting rows: {}".format(errors))
    else:
        logging.info(f"{len(predictions)} new rows have been added.")
        logging.info(f"The object detection results have been stored successfully.")

## Creating and Compiling the Vertex AI Pipeline

In [35]:
@kfp.dsl.pipeline(name=PIPELINE_NAME,
                  description='Pipeline that process patents pdf files.',
                  pipeline_root=PIPELINE_ROOT)
def pipeline():
    """Assemble the patent PDF processing pipeline.

    Active steps:
      1. ``transform_pdfs_into_png`` is called with the same GCS prefix as
         both ``src_path`` and ``dst_path``.
      2. ``create_bq_results_dataset`` is scheduled after step 1.

    The OCR step and the text classification, image classification and
    object detection branches are currently disabled; they are kept below
    as comments.
    """
    # The same GCS prefix is used for the source and the destination path.
    pdf_gcs_uri = f"gs://{BUCKET}/{PDF_BUCKET_PATH}"

    # --- Preprocessing -----------------------------------------------------
    # Disabled: OCR step.
    # perform_ocr_on_pdfs_task = perform_ocr_on_pdfs(
    #     src_path=pdf_gcs_uri,
    #     dst_path=pdf_gcs_uri,
    #     uuid=UUID,
    #     project=PROJECT)

    png_task = transform_pdfs_into_png(
        src_path=pdf_gcs_uri,
        dst_path=pdf_gcs_uri,
        uuid=UUID,
        project=PROJECT,
    )

    # Disabled: ordering against the OCR step.
    # png_task.after(perform_ocr_on_pdfs_task)

    dataset_task = create_bq_results_dataset(
        project=PROJECT,
        dataset_id=RES_DATASET_ID,
    )
    dataset_task.after(png_task)

    # --- Text classification (disabled) -------------------------------------
    # create_text_class_results_table_task = create_text_class_results_table(
    #     project=PROJECT,
    #     dataset_id=RES_DATASET_ID,
    #     table_id=TCN_RESTABLE_NAME,
    #     schema=TCN_RESTABLE_SCHEMA)
    # create_text_class_results_table_task.after(dataset_task)
    #
    # text_class_predict_task = text_class_predict()
    # text_class_predict_task.after(create_text_class_results_table_task)
    #
    # store_text_class_results_task = store_text_class_results()
    # store_text_class_results_task.after(text_class_predict_task)

    # --- Image classification (disabled) ------------------------------------
    # create_img_class_results_table_task = create_img_class_results_table(
    #     project=PROJECT,
    #     dataset_id=RES_DATASET_ID,
    #     table_id=ICN_RESTABLE_NAME,
    #     schema=ICN_RESTABLE_SCHEMA)
    # create_img_class_results_table_task.after(dataset_task)
    #
    # img_class_predict_task = img_class_predict()
    # img_class_predict_task.after(create_img_class_results_table_task)
    #
    # store_img_class_results_task = store_img_class_results()
    # store_img_class_results_task.after(img_class_predict_task)

    # --- Object detection (disabled) ----------------------------------------
    # create_obj_detection_results_table_task = create_obj_detection_results_table(
    #     project=PROJECT,
    #     dataset_id=RES_DATASET_ID,
    #     table_id=ODM_RESTABLE_NAME,
    #     schema=ODM_RESTABLE_SCHEMA)
    # create_obj_detection_results_table_task.after(dataset_task)
    #
    # obj_detection_predict_task = obj_detection_predict()
    # obj_detection_predict_task.after(create_obj_detection_results_table_task)
    #
    # store_obj_detection_results_task = store_obj_detection_results()
    # store_obj_detection_results_task.after(obj_detection_predict_task)

In [36]:
# Make sure the local pipeline directory exists before compiling
# (presumably LOCAL_PIPELINE_JSON lives inside it -- confirm against the
# constants cell). makedirs(exist_ok=True) also creates missing parent
# directories and avoids the check-then-create race of isdir() + mkdir().
os.makedirs(LOCAL_PIPELINE_PATH, exist_ok=True)

# Compile the pipeline function into the job spec file at LOCAL_PIPELINE_JSON.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=LOCAL_PIPELINE_JSON,
)

## Launching the Vertex AI Pipeline

In [37]:
# Client bound to the configured project and region; it is used below to
# submit the compiled pipeline.
# TODO: use the new Vertex AI.
api_client = AIPlatformClient(project_id=PROJECT, region=REGION)

In [38]:
# Submit the compiled job spec as a pipeline run and keep the client's
# return value in `response`.
response = api_client.create_run_from_job_spec(
    LOCAL_PIPELINE_JSON, pipeline_root=f"{PIPELINE_ROOT}"
)