# Prepare PDFs for the later pipeline stages (object detection, image classification, text classification, NER)

- user provides
    - Google Cloud project (input)
    - bucket in GCS of pdfs (input)
    - BQ dataset to write prediction results (output)
        - BQ table: aggregated results (pdf_name, icn_pred, objdet_pred(coords), text_cn, ner1, ner2, ...., ner)
            created with JOIN on pdf_name
        - BQ table: icn_preds (pdf_name, icn_pred)    --> this table is made in icn_predict.ipynb
        - BQ table: objdet_pred (pdf_name, objdet_pred(coords)) --> this table is made in objdet_predict.ipynb
        - BQ table: text_cn (pdf_name, text_cn)    --> this table is made in text_cn_predict.ipynb
        - BQ table: ner (pdf_name, ner1, ner2, ...., ner)
        
- see utils.py for utils functions
        

Steps: 
 1. convert pdf to png and write to bucket (for ICN, ObjDet)
 2. do ocr on pdf and write to bucket 
 3. create dataset 
    

In [3]:
# Resolve the active gcloud project through an IPython shell escape
# (notebook-only syntax; this line is not valid in a plain .py module).
PROJECT = !gcloud config get-value project # returns SList
PROJECT = PROJECT[0] # gets first element in list -> str
REGION = "us-central1"  
# Same ID as Pipeline.tcn_model_resource_name further down in this notebook.
MODEL_RESOURCE_NAME = "2393478483993952256"

import os
# Mirror the settings into the process environment
# (presumably so that `!...` shell escapes can read $PROJECT / $REGION).
os.environ["PROJECT"] = PROJECT
os.environ["REGION"] = REGION

In [4]:
from google.cloud import bigquery
# Module-level BigQuery client; Utils.load_to_bigquery reads this global
# directly instead of receiving a client as an argument.
bq = bigquery.Client(project=PROJECT)

In [5]:
import logging
# Initial logging configuration; a later cell reloads `logging` and
# reconfigures it with a timestamped format at DEBUG level.
logging.basicConfig(level=logging.INFO)
# Module-level logger; Pipeline uses it via `logger.info(...)`.
logger = logging.getLogger(__name__)

In [128]:
from google.cloud import storage
from google.cloud import vision
from google.cloud import aiplatform
import tempfile



from importlib import reload
from pathlib import Path
import pandas as pd
import numpy as np

# for jupyter only: re-executing the logging module discards the configuration
# made in the earlier cell, so the basicConfig() call below takes effect
# (basicConfig does nothing when the root logger already has handlers).
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

In [7]:
from pdf2image import convert_from_path
import io
import base64
import cv2
from datetime import datetime
import time
import json

In [8]:
# Smoke test: the cell output should show the timestamped format configured above.
logging.info("test if logging works")

03:03:22 INFO:test if logging works


In [132]:
def to_trace_str(e):
    """Return the full formatted traceback of exception ``e`` as one string.

    Args
        e (BaseException): the exception to format; its ``__traceback__`` is
            used, so it should have been raised (and caught) before the call.

    Returns
        str: the concatenated lines produced by ``traceback.format_exception``.
    """
    # Imported locally: the notebook never imports ``traceback`` at module
    # level, so the original reference to ``tb`` raised NameError inside
    # every error handler that called this helper.
    import traceback

    return ''.join(traceback.format_exception(None, e, e.__traceback__))

class Utils():
    """Helpers shared by the document-processing pipeline.

    Covers Cloud Storage path handling, PDF -> PNG conversion, Vision OCR,
    JSONL/CSV artefact creation, BigQuery loading and Vertex AI batch
    prediction.

    Error handling: the storage/OCR helpers log the traceback and return
    ``None`` (``False`` for ``convert_pdf_to_png``) instead of raising, so
    callers must check the return value. ``load_to_bigquery`` and the batch
    prediction methods let exceptions propagate.
    """

    # Suffix of the OCR result file for page 1 when the request asks for
    # ``batch_size=1`` (see ``ocr``); ``create_text_files`` selects on it.
    OCR_OUTPUT_SUFFIX = "output-1-to-1.json"

    def __init__(self):
        self.storage_client = storage.Client()

    # ------------------------------------------------------------------
    # path helpers
    # ------------------------------------------------------------------
    def dismantle_path(self, gcs_path):
        """Split a GCS path into ``(bucket_name, directory, filename)``.

        Accepts paths with or without the ``gs://`` scheme. The last segment
        is treated as a filename only when it contains a dot and is not the
        bucket itself. ``directory`` never contains the bucket name.

        Args
            gcs_path (str): e.g. ``gs://bucket/dir/file.pdf`` or ``bucket/dir``

        Returns
            tuple: (bucket_name, directory, filename); missing parts are ""

        Raises
            ValueError: if no bucket name can be extracted
        """
        parts = [segment for segment in str(gcs_path).split("/") if segment]
        # "gs://bucket" splits into ["gs:", "", "bucket"]; drop the scheme.
        if parts and parts[0] == "gs:":
            parts = parts[1:]
        if not parts:
            raise ValueError(f"cannot extract a bucket name from {gcs_path!r}")

        bucket_name, *rest = parts
        filename = rest.pop() if rest and "." in rest[-1] else ""
        return bucket_name, "/".join(rest), filename

    @staticmethod
    def _object_name(*segments):
        """Join non-empty path segments into an object name (no leading slash)."""
        cleaned = [str(s).strip("/") for s in segments if s]
        return "/".join(s for s in cleaned if s)

    @staticmethod
    def _gcs_uri(bucket_name, *segments):
        """Build ``gs://bucket/...`` from a bucket name and object-name segments."""
        object_name = Utils._object_name(*segments)
        return f"gs://{bucket_name}/{object_name}" if object_name else f"gs://{bucket_name}"

    def _list_blobs(self, bucket, directory, suffix):
        """Return the blobs under ``directory`` whose name ends with ``suffix``.

        Uses a server-side prefix filter instead of listing the whole bucket
        and substring-matching the directory's basename, which also matched
        unrelated objects that merely contained that text.
        """
        cleaned = (directory or "").strip("/")
        prefix = f"{cleaned}/" if cleaned else None
        return [blob for blob in bucket.list_blobs(prefix=prefix) if blob.name.endswith(suffix)]

    # ------------------------------------------------------------------
    # preprocessing
    # ------------------------------------------------------------------
    def convert_pdf_to_png(self, src_path, dst_path):
        """Render the first page of every PDF under ``src_path`` as a PNG.

        Each image is uploaded to the destination as
        ``<dst_directory>/<pdf object name>.png``.

        Args
            src_path (str): bucket or directory containing the pdfs
            dst_path (str): bucket or directory receiving the pngs

        Returns
            bool: True when all pdfs were processed, False if an error occurred
        """
        try:
            logging.info("started conversion pdf -> png")

            src_bucket_name, src_directory, _ = self.dismantle_path(src_path)
            dst_bucket_name, dst_directory, _ = self.dismantle_path(dst_path)

            src_bucket = self.storage_client.bucket(src_bucket_name)
            dst_bucket = self.storage_client.bucket(dst_bucket_name)

            blob_list = self._list_blobs(src_bucket, src_directory, ".pdf")
            total = len(blob_list)
            logging.info(f"found {total} pdfs in bucket  {src_bucket_name}")

            for b_idx, blob in enumerate(blob_list, start=1):
                # A temporary directory is removed on exit, unlike the file
                # (and open descriptor) left behind by tempfile.mkstemp().
                with tempfile.TemporaryDirectory() as tmp_dir:
                    tmp_pdf = os.path.join(tmp_dir, "input.pdf")
                    blob.download_to_filename(tmp_pdf)
                    logging.info(f"downloaded {b_idx} of {total} files")
                    # Only the first page is going to be analyzed, so only
                    # that page is rendered.
                    pages = convert_from_path(tmp_pdf, first_page=1, last_page=1)
                logging.info(f"converted {b_idx} of {total} images")

                if not pages:
                    logging.warning(f"no pages rendered for {blob.name}; skipping")
                    continue

                # NOTE(review): PIL yields RGB while cv2.imencode assumes BGR,
                # so red/blue are swapped in colour pages. Left unchanged to
                # stay consistent with previously generated images - confirm.
                image = np.array(pages[0])
                is_success, im_buf_arr = cv2.imencode(".png", image)
                if not is_success:
                    logging.error(f"png encoding failed for {blob.name}; skipping")
                    continue

                filename = self._object_name(dst_directory, blob.name + ".png")
                dst_bucket.blob(filename).upload_from_string(
                    im_buf_arr.tobytes(), content_type="image/png"
                )
                logging.info(f"saved {b_idx} of {total} images with filename {filename}")

            return True

        except Exception as e:
            logging.error(f"Error in method convert_pdf_to_png: {to_trace_str(e)}")
            return False

    def ocr(self, src_path, dst_path):
        """Perform optical character recognition in pdf files.

        Args
            src_path (str): bucket or directory containing the pdfs
            dst_path (str): bucket or directory receiving the OCR json output

        Returns
            google.api_core.operation.Operation, or None when no pdf was found
            or an error occurred. To check if done use method .done()

        Link to documentation:
            https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig
            https://cloud.google.com/vision/docs/pdf

        """
        try:
            logging.info("started optical character recognition")

            src_bucket_name, src_directory, _ = self.dismantle_path(src_path)
            dst_bucket_name, dst_directory, _ = self.dismantle_path(dst_path)
            src_bucket = self.storage_client.bucket(src_bucket_name)

            logging.info(f"src_bucket_name {src_bucket_name}, src_directory {src_directory}")

            blob_list = self._list_blobs(src_bucket, src_directory, ".pdf")
            total = len(blob_list)
            logging.info(f"found {total} pdf files in bucket {src_bucket_name}")
            if not blob_list:
                logging.warning("no pdf files to process; ocr was not started")
                return None

            client = vision.ImageAnnotatorClient()
            feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

            async_requests = []
            for b_idx, blob in enumerate(blob_list, start=1):
                # blob.name is already the full object name inside the bucket,
                # so it must be appended to the bucket, not to src_path
                # (which would repeat the directory).
                gcs_source_uri = self._gcs_uri(src_bucket_name, blob.name)
                # The destination is a prefix; the service appends
                # "output-<first>-to-<last>.json" to it.
                gcs_destination_uri = self._gcs_uri(dst_bucket_name, dst_directory, blob.name)

                input_config = vision.InputConfig(
                    gcs_source=vision.GcsSource(uri=gcs_source_uri),
                    mime_type='application/pdf',
                )
                output_config = vision.OutputConfig(
                    gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
                    batch_size=1,
                )

                logging.info(f"started ocr for {b_idx} of {total} files")
                async_requests.append(
                    vision.AsyncAnnotateFileRequest(
                        features=[feature],
                        input_config=input_config,
                        output_config=output_config,
                    )
                )

            return client.async_batch_annotate_files(requests=async_requests)

        except Exception as e:
            logging.error(f"Error in method ocr: {to_trace_str(e)}")
            return None

    def get_extension(self, mime_type):
        """Map a mime type to a file extension; unknown types fall back to ".txt"."""
        return {"text/plain": ".txt", "image/png": ".png"}.get(mime_type, ".txt")

    def create_jsonl(self, gcs_path, mime_type, filename):
        """create jsonl out of files in bucket

        One line ``{"content": <gs uri>, "mimeType": <mime_type>}`` is written
        per matching file, which is the input format of batch prediction.

        Args
            gcs_path (str): bucket or dir where files are located
            mime_type (str): the files mimetype
            filename (str): the jsonl filename

        Returns
            full path of jsonl, or None if an error occurred
        """
        try:
            bucket_name, directory, _ = self.dismantle_path(gcs_path)
            bucket = self.storage_client.bucket(bucket_name)
            extension = self.get_extension(mime_type)

            blob_list = self._list_blobs(bucket, directory, extension)

            jsonl_content = "".join(
                json.dumps(
                    {
                        "content": self._gcs_uri(bucket_name, blob.name),
                        "mimeType": mime_type,
                    }
                ) + "\n"
                for blob in blob_list
            )

            # Upload next to the listed files so the returned path really
            # points at the uploaded object (previously the file always went
            # to the bucket root).
            object_name = self._object_name(directory, filename)
            bucket.blob(object_name).upload_from_string(jsonl_content)
            logging.info(f"uploaded jsonl {filename} to bucket {bucket_name}")

            return self._gcs_uri(bucket_name, object_name)

        except Exception as e:
            logging.error(f"Error in jsonl creation: {to_trace_str(e)}")
            return None

    def create_text_files(self, gcs_path):
        """Extract the text of each first-page OCR result into a ``.txt`` blob.

        For every ``*output-1-to-1.json`` under ``gcs_path`` the full text
        annotation is written to the same name with that suffix replaced by
        ``.txt``. Pages without recognised text yield an empty file.

        Args
            gcs_path (str): bucket or directory holding the OCR json output
        """
        try:
            bucket_name, directory, _ = self.dismantle_path(gcs_path)
            bucket = self.storage_client.bucket(bucket_name)
            blob_list = self._list_blobs(bucket, directory, self.OCR_OUTPUT_SUFFIX)
            total = len(blob_list)

            for b_idx, blob in enumerate(blob_list, start=1):
                logging.info(f"creating {b_idx} of {total} text files")
                response = json.loads(blob.download_as_string())
                page_responses = response.get('responses') or [{}]
                # 'fullTextAnnotation' is absent when nothing was recognised.
                annotation = page_responses[0].get('fullTextAnnotation') or {}
                text = annotation.get('text', "")
                if not text:
                    logging.warning(f"no text found in {blob.name}")

                txt_path = blob.name[:-len(self.OCR_OUTPUT_SUFFIX)] + ".txt"
                bucket.blob(txt_path).upload_from_string(text)

            logging.info("finished creating text files")

        except Exception as e:
            logging.error(f"Error in method create_text_files: {to_trace_str(e)}")

    # ------------------------------------------------------------------
    # results
    # ------------------------------------------------------------------
    def save_to_storage(self, gcs_path, filename, predictions):
        """Convert a list of prediction dicts to CSV and upload it.

        Args
            gcs_path (str): bucket or directory receiving the csv
            filename (str): name of the csv file
            predictions (list): records accepted by ``DataFrame.from_records``

        Returns
            full gs:// path of the uploaded csv, or None if an error occurred
        """
        try:
            bucket_name, directory, _ = self.dismantle_path(gcs_path)
            bucket = self.storage_client.bucket(bucket_name)

            df = pd.DataFrame.from_records(predictions)

            object_name = self._object_name(directory, filename)
            full_path = self._gcs_uri(bucket_name, object_name)
            logging.info(f"writing csv {full_path} to storage")
            # Upload from memory: no temporary file to leak or clean up.
            bucket.blob(object_name).upload_from_string(
                df.to_csv(index=False), content_type="text/csv"
            )

            return full_path

        except Exception as e:
            logging.error(f"Error in method save_to_storage: {to_trace_str(e)}")
            return None

    def load_to_bigquery(self, gcs_path, dataset_id, table_id, schema):
        """loads csv data in storage to BQ

        Creates the dataset (location "US") when it does not exist yet, then
        runs a CSV load job into ``table_id`` and waits for it to finish.
        Uses the module-level BigQuery client ``bq``.

        Args
            gcs_path (str): gs:// uri of the csv (first row is a header)
            dataset_id (str): ``project.dataset``
            table_id (str): ``project.dataset.table``
            schema (list): list of ``bigquery.SchemaField``

        Returns
            int: number of rows in the destination table after the load

        Raises
            Exceptions of the BigQuery client are propagated.
        """
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        # exists_ok makes creation idempotent; errors other than "already
        # exists" are raised instead of being mistaken for a missing dataset.
        dataset = bq.create_dataset(dataset, exists_ok=True, timeout=30)
        logging.info("Dataset {}.{} is ready".format(bq.project, dataset.dataset_id))

        job_config = bigquery.LoadJobConfig(
            schema=schema,
            skip_leading_rows=1,
            # The source format defaults to CSV, so the line below is optional.
            source_format=bigquery.SourceFormat.CSV,
            allow_quoted_newlines=True,
        )

        load_job = bq.load_table_from_uri(gcs_path, table_id, job_config=job_config)
        load_job.result()  # Waits for the job to complete.

        destination_table = bq.get_table(table_id)
        print("Loaded {} rows.".format(destination_table.num_rows))
        return destination_table.num_rows

    # ------------------------------------------------------------------
    # batch prediction
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_prediction_lines(jsonl_text):
        """Turn the text of one batch-prediction ``.jsonl`` file into records.

        Each record is ``{'file', 'subject', 'score'}`` where ``file`` is the
        instance uri without its last extension and ``subject``/``score`` are
        the label with the highest confidence. Blank lines and lines without
        a usable prediction are skipped.
        """
        records = []
        for raw_line in jsonl_text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            response = json.loads(line)
            prediction = response.get("prediction") or {}
            names = prediction.get("displayNames") or []
            scores = prediction.get("confidences") or []
            if not names or len(names) != len(scores):
                logging.warning(f"skipping line without usable prediction: {line[:200]}")
                continue

            # The label list is not assumed to be sorted by confidence.
            best = max(range(len(scores)), key=scores.__getitem__)
            content = response["instance"]["content"]  # e.g. "gs://bucket/text.txt"
            records.append({
                'file': os.path.splitext(content)[0],
                'subject': names[best],
                'score': scores[best],
            })
        return records

    def _run_batch_and_collect(self, kind, project, region, model_resource_name,
                               job_display_name, gcs_source, gcs_destination):
        """Run a blocking batch prediction and read back all result records."""
        job = self.create_batch_prediction_job(
            project,
            region,
            model_resource_name=model_resource_name,
            job_display_name=job_display_name,
            gcs_source=gcs_source,
            gcs_destination=gcs_destination,
            sync=True
        )
        logging.info(f"job started {type(job)} for automl {kind}")

        bucket_name, directory, filename = self.dismantle_path(gcs_destination)
        logging.info(f"bucket name {bucket_name}")
        bucket = self.storage_client.bucket(bucket_name)

        # Everything after the bucket is the output prefix of this job.
        output_prefix = self._object_name(directory, filename)

        results = []
        for blob in self._list_blobs(bucket, output_prefix, ".jsonl"):
            blob_str = blob.download_as_string().decode("utf-8")
            results.extend(self._parse_prediction_lines(blob_str))
        return results

    def run_automl_image_batch(self, project, region, model_resource_name, job_display_name, gcs_source, gcs_destination):
        """Run image classification batch prediction.

        Returns
            list of dicts with keys 'file', 'subject', 'score'
        """
        return self._run_batch_and_collect(
            "image", project, region, model_resource_name,
            job_display_name, gcs_source, gcs_destination,
        )

    def run_automl_text_batch(self, project, region, model_resource_name, job_display_name, gcs_source, gcs_destination):
        """Run text classification batch prediction.

        Returns
            list of dicts with keys 'file', 'subject', 'score'
        """
        return self._run_batch_and_collect(
            "text", project, region, model_resource_name,
            job_display_name, gcs_source, gcs_destination,
        )

    def create_batch_prediction_job(
        self,
        project,
        location,
        model_resource_name,
        job_display_name,
        gcs_source,
        gcs_destination,
        sync = True
    ):
        """Start a Vertex AI batch prediction job for ``model_resource_name``.

        Args
            project (str): Google Cloud project
            location (str): region of the model
            model_resource_name (str): model id or full resource name
            job_display_name (str): display name of the job
            gcs_source (str): gs:// uri of the input jsonl
            gcs_destination (str): gs:// prefix for the prediction output
            sync (bool): when True (default) block until the job has finished

        Returns
            the batch prediction job object
        """
        aiplatform.init(project=project, location=location)

        my_model = aiplatform.Model(model_resource_name)

        batch_prediction_job = my_model.batch_predict(
            job_display_name=job_display_name,
            gcs_source=gcs_source,
            gcs_destination_prefix=gcs_destination,
            sync=sync  # was hard-coded to True, ignoring the parameter
        )

        if not sync:
            return batch_prediction_job

        batch_prediction_job.wait()

        logging.info(f"state type: {type(batch_prediction_job.state)}")

        logging.info(batch_prediction_job.display_name)
        logging.info(batch_prediction_job.resource_name)
        logging.info(batch_prediction_job.state)
        return batch_prediction_job

In [141]:
class Pipeline():
    """End-to-end document pipeline: PDF -> PNG/OCR -> batch prediction -> BigQuery.

    All artefacts (pngs, OCR output, jsonl inputs, prediction csvs) are
    written to the same location as the source pdfs. Prediction results are
    loaded into the tables ``<dataset_id>.tcn`` and ``<dataset_id>.icn``.
    """

    # Defaults used when no project/region is passed to the constructor.
    DEFAULT_PROJECT = "qwiklabs-gcp-00-373ac55d0e0a"
    DEFAULT_REGION = "us-central1"

    def __init__(self, dataset_id=None, project=None, region=None):
        """
        Args
            dataset_id (str): ``project.dataset`` for the result tables;
                defaults to ``<project>.docprocessing_<timestamp>``
            project (str): Google Cloud project; defaults to DEFAULT_PROJECT
            region (str): Vertex AI region; defaults to DEFAULT_REGION
        """
        self.utils = Utils()
        # Timestamp shared by every artefact name of this run.
        self.uuid = datetime.now().strftime('%y%m%d_%H%M%S') #str

        self.project = project if project else self.DEFAULT_PROJECT
        self.region = region if region else self.DEFAULT_REGION

        self.dataset_id = dataset_id if dataset_id else f"{self.project}.docprocessing_"+self.uuid

        # find ids via !gcloud ai models list
        self.tcn_model_resource_name = "2393478483993952256"
        self.icn_model_resource_name = "8925034949820547072"

        self.table_id_tcn = f"{self.dataset_id}.tcn"
        self.table_id_icn = f"{self.dataset_id}.icn"

        self.tcn_schema = [
            bigquery.SchemaField("file", "STRING", mode="REQUIRED", description="File path."),
            bigquery.SchemaField("subject", "STRING", mode="REQUIRED", description="Predicted class."),
            bigquery.SchemaField("score", "FLOAT", mode="REQUIRED", description="Confidence of the prediction."),
        ]

        # NOTE(review): the prediction csv is written with the columns
        # file/subject/score; the load skips the header row, so these names
        # presumably apply by column position - confirm.
        self.icn_schema = [
            bigquery.SchemaField("image_name", "STRING", mode="REQUIRED", description='Name of the image analyzed.'),
            bigquery.SchemaField("label", "STRING", mode="REQUIRED", description='Predicted class. It can be US or EU'),
            bigquery.SchemaField("confidence", "FLOAT", mode="REQUIRED", description='Confidence of the prediction.'),
        ]

    def start_pipeline(self, src_path):
        """Run preprocessing and both classification tasks for ``src_path``.

        Args
            src_path (str): bucket or directory containing the pdfs
        """
        # TODO: multiprocessing??
        logging.info(f"started pipeline")

        # save everything in the same bucket
        dst_path = src_path
        jsonl_filename_tcn = f"tcn_{self.uuid}.jsonl"
        jsonl_filename_icn = f"icn_{self.uuid}.jsonl"

        # create png
        jsonl_path_icn = self.preprocess_pdf_to_png(src_path, dst_path, jsonl_filename_icn)

        # create ocr
        jsonl_path_tcn = self.preprocess_ocr(src_path, dst_path, jsonl_filename_tcn)

        # prediction
        self.text_classification_task(
            src_path=jsonl_path_tcn,
            dst_path=dst_path,
            job_display_name="job_tcn_"+self.uuid)

        self.image_classification_task(
            src_path=jsonl_path_icn,
            dst_path=dst_path,
            job_display_name="job_icn_"+self.uuid)

        logging.info(f"finished pipelines")

    def preprocess_pdf_to_png(self, src_path, dst_path, jsonl_filename):
        """Convert the pdfs to png and build the image batch-prediction input.

        Returns
            path of the created jsonl (None if its creation failed)
        """
        # convert_pdf_to_png reports failure with an explicit False.
        if self.utils.convert_pdf_to_png(src_path, dst_path) is False:
            logging.warning("pdf -> png conversion reported an error; jsonl may be incomplete")

        return self.utils.create_jsonl(gcs_path=dst_path, mime_type="image/png", filename=jsonl_filename)

    def preprocess_ocr(self, src_path, dst_path, jsonl_filename):
        """Run OCR, wait for it, and build the text batch-prediction input.

        Returns
            path of the created jsonl (None if its creation failed)

        Raises
            RuntimeError: if the OCR operation could not be started
        """
        ocr_operation = self.utils.ocr(src_path, dst_path)
        # Utils.ocr logs and returns None on failure; without this guard the
        # loop below died with an unrelated AttributeError.
        if ocr_operation is None:
            raise RuntimeError(f"ocr could not be started for {src_path}; see log for details")

        while not ocr_operation.done():
            logging.info("wait for ocr to finish")
            time.sleep(5)

        # Already finished, so this returns immediately; it raises if the
        # operation itself ended with an error.
        ocr_operation.result()

        self.utils.create_text_files(dst_path)
        return self.utils.create_jsonl(gcs_path=dst_path, mime_type="text/plain", filename=jsonl_filename)

    @staticmethod
    def _ensure_gs_prefix(path):
        """Return ``path`` with a leading ``gs://`` scheme."""
        return path if path.startswith("gs://") else "gs://" + path

    def _run_classification(self, tag, run_batch, model_resource_name, table_id, schema,
                            src_path, dst_path, job_display_name):
        """Shared body of the text/image classification tasks.

        Runs the batch prediction, stores the predictions as csv next to the
        inputs and loads the csv into ``table_id``.
        """
        if not src_path:
            raise ValueError(f"no input jsonl for {tag}; the preprocessing step failed")

        gcs_source = src_path
        gcs_destination = os.path.join(dst_path, job_display_name)

        logging.info(f"starting {tag} with gsc_source {gcs_source} and gcs_destination {gcs_destination}")

        gcs_destination = self._ensure_gs_prefix(gcs_destination)

        predictions = run_batch(self.project, self.region, model_resource_name, job_display_name, gcs_source, gcs_destination)

        logging.info(f"save {tag} predictions to storage")
        predictions_filename = f"predictions_{tag}_"+self.uuid+".csv"
        path_to_csv = self.utils.save_to_storage(dst_path, predictions_filename, predictions)
        if not path_to_csv:
            raise RuntimeError(f"{tag} predictions could not be saved; see log for details")

        # Step 5: Load storage result in BQ
        logging.info("load results into BigQuery")
        status = self.utils.load_to_bigquery(path_to_csv, self.dataset_id, table_id, schema)
        logging.info(f"finished task with status {status}")

    def text_classification_task(self, src_path, dst_path, job_display_name):
        """Classify the OCR text files listed in ``src_path`` and load results to BQ.

        Args
            src_path (str): gs:// path of the text jsonl
            dst_path (str): bucket or directory receiving the job output and csv
            job_display_name (str): batch job name, also the output sub-directory
        """
        self._run_classification(
            "tcn", self.utils.run_automl_text_batch, self.tcn_model_resource_name,
            self.table_id_tcn, self.tcn_schema,
            src_path, dst_path, job_display_name,
        )

    def image_classification_task(self, src_path, dst_path, job_display_name):
        """Classify the pngs listed in ``src_path`` and load results to BQ.

        Args
            src_path (str): gs:// path of the image jsonl
            dst_path (str): bucket or directory receiving the job output and csv
            job_display_name (str): batch job name, also the output sub-directory
        """
        self._run_classification(
            "icn", self.utils.run_automl_image_batch, self.icn_model_resource_name,
            self.table_id_icn, self.icn_schema,
            src_path, dst_path, job_display_name,
        )

    def odet(self):
        """Placeholder for the object detection task (not implemented)."""
        pass
        


# Test

## Parameters

In [112]:
# Source and destination are the same bucket, so artefacts are written next to the pdfs.
src_path = "gs://2021_08_16_tcn_dev"
dst_path = "gs://2021_08_16_tcn_dev"

In [142]:
# Reuse an existing results dataset instead of letting Pipeline derive a new timestamped one.
pipeline = Pipeline(dataset_id="qwiklabs-gcp-00-373ac55d0e0a.docprocessing_210818_181824")
# The actual run is left disabled.
# pipeline.start_pipeline(src_path)

# Labeled Patents - Vertex AI Pipeline

## Importing Auxiliary Libraries

In [5]:
import os
from datetime import datetime

#!pip install --upgrade kfp
import kfp
from google.cloud import aiplatform
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google import experimental
from kfp.v2.google.client import AIPlatformClient

# Report the installed KFP SDK version for reproducibility.
print(f'Kubeflow pipelines version: {kfp.__version__}')

Kubeflow pipelines version: 1.7.1


## Setting Notebook Inputs

In [7]:
# Timestamp used to make the names of this run unique.
UUID = datetime.now().strftime('%y%m%d_%H%M%S') #str
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'

# Bucket (and sub-path) that receives the demo pdfs copied in the next cell.
BUCKET = 'patents_pipetest'
PDF_BUCKET_PATH = 'pdf'

# BigQuery dataset for the results of this run.
RES_DATASET_NAME = 'docprocessing_' + UUID
RES_DATASET_ID = f'{PROJECT}.{RES_DATASET_NAME}'

# Text classification model and result table; the schema tuples are
# (name, type, mode, description).
TCN_MODEL_NAME = '2393478483993952256'
TCN_RESTABLE_NAME = f'{RES_DATASET_ID}.tcn'
TCN_RESTABLE_SCHEMA = [('file', 'STRING', 'REQUIRED', 'File path.'),
                       ('subject', 'STRING', 'REQUIRED', 'Predicted class.'),
                       ('score', 'FLOAT',  'REQUIRED', 'Confidence of the prediction.')]


# NOTE(review): these point at the dev bucket, not at
# gs://{BUCKET}/{PDF_BUCKET_PATH} where the demo files are copied - confirm.
src_path = "gs://2021_08_16_tcn_dev"
dst_path = "gs://2021_08_16_tcn_dev"



# Pipeline identity and the locations of its root and compiled job spec.
PIPELINE_NAME = 'process-pdf-patents'
PIPELINE_ROOT = f"gs://{BUCKET}/labeled_patents/pipeline_root"
LOCAL_PIPELINE_PATH = './vertex_pipelines'
LOCAL_PIPELINE_JSON = os.path.join(LOCAL_PIPELINE_PATH, 'labeled_patents_pipeline2.json')

# The settings below are not referenced in the visible part of the notebook.
RESULTS_BQ_DATASET='demo_dataset'
RESULTS_OBJDET_TABLE='objdet'



MODEL_DISPLAY_NAME=f"labpat_model"
MACHINE_TYPE="n1-standard-16"
REPLICA_COUNT=1
DOCKER_IMAGE_URI_CREATE_BQDATASET="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-3:latest"


# Exported so the shell escape in the next cell can use $BUCKET / $PDF_BUCKET_PATH.
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET 
os.environ['PDF_BUCKET_PATH'] = PDF_BUCKET_PATH

**Copying some demo files into the Bucket**

In [8]:
# IPython shell escape: copy the demo pdfs from the dev bucket into gs://$BUCKET/$PDF_BUCKET_PATH.
!gsutil -m cp gs://2021_08_16_tcn_dev/*.pdf gs://$BUCKET/$PDF_BUCKET_PATH

Copying gs://2021_08_16_tcn_dev/med_tech_8.pdf [Content-Type=application/pdf]...
Copying gs://2021_08_16_tcn_dev/computer_vision_1.pdf [Content-Type=application/pdf]...
Copying gs://2021_08_16_tcn_dev/us_076.pdf [Content-Type=application/pdf]...    
/ [3/3 files][168.9 KiB/168.9 KiB] 100% Done                                    
Operation completed over 3 objects/168.9 KiB.                                    


## Defining Vertex AI Components

### Component 1: Performing OCR on PDFs

In [42]:
@component(packages_to_install=['google-cloud-storage',  'google-cloud-vision'])
def perform_ocr_on_pdfs(src_path: str, 
                        dst_path: str,
                        uuid: str):
    """Run Vision OCR on the pdfs under ``src_path`` and prepare text inputs.

    Steps: start an asynchronous OCR batch for every pdf, wait for it, write
    the recognised first-page text of each pdf to a ``.txt`` object, and
    create ``tcn_<uuid>.jsonl`` listing those text files.

    The text classification itself is not performed here: it needs packages
    and settings (Vertex AI, BigQuery, project, model) that this component
    neither installs nor receives.

    Args
        src_path: bucket or directory containing the pdfs
        dst_path: bucket or directory for the OCR output; when empty the
            source location is used
        uuid: run identifier used in the jsonl file name
    """
    # IMPORTS (kept inside the function body so the component is self-contained):
    import json
    import logging
    import time
    import traceback as tb
    from google.cloud import storage
    from google.cloud import vision

    # Suffix of the OCR result for page 1 when batch_size=1 is requested.
    OCR_OUTPUT_SUFFIX = "output-1-to-1.json"

    # AUXILIARY FUNCTIONS:
    # (plain functions: the previous copies still carried a ``self`` parameter
    # and ``self.utils`` references from the class they were taken from, so
    # every call failed with TypeError)
    def to_trace_str(e):
        """Return the formatted traceback of ``e`` as a single string."""
        return ''.join(tb.format_exception(None, e, e.__traceback__))

    def dismantle_path(gcs_path):
        """Split a GCS path into (bucket_name, directory, filename)."""
        parts = [segment for segment in str(gcs_path).split("/") if segment]
        # "gs://bucket" splits into ["gs:", "", "bucket"]; drop the scheme.
        if parts and parts[0] == "gs:":
            parts = parts[1:]
        if not parts:
            raise ValueError(f"cannot extract a bucket name from {gcs_path!r}")
        bucket_name, *rest = parts
        filename = rest.pop() if rest and "." in rest[-1] else ""
        return bucket_name, "/".join(rest), filename

    def object_name(*segments):
        """Join non-empty path segments into an object name."""
        cleaned = [str(s).strip("/") for s in segments if s]
        return "/".join(s for s in cleaned if s)

    def list_blobs(bucket, directory, suffix):
        """Return the blobs under ``directory`` whose name ends with ``suffix``."""
        cleaned = (directory or "").strip("/")
        prefix = f"{cleaned}/" if cleaned else None
        return [blob for blob in bucket.list_blobs(prefix=prefix) if blob.name.endswith(suffix)]

    def ocr(storage_client, src_path, dst_path):
        """Perform optical character recognition in pdf files.

        Returns
            google.api_core.operation.Operation
            To check if done use method .done()

        Link to documentation:
            https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig
            https://cloud.google.com/vision/docs/pdf
        """
        logging.info("started optical character recognition")

        src_bucket_name, src_directory, _ = dismantle_path(src_path)
        dst_bucket_name, dst_directory, _ = dismantle_path(dst_path)
        src_bucket = storage_client.bucket(src_bucket_name)

        logging.info(f"src_bucket_name {src_bucket_name}, src_directory {src_directory}")

        blob_list = list_blobs(src_bucket, src_directory, ".pdf")
        total = len(blob_list)
        logging.info(f"found {total} pdf files in bucket {src_bucket_name}")
        if not blob_list:
            raise RuntimeError(f"no pdf files found under {src_path}")

        client = vision.ImageAnnotatorClient()
        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

        async_requests = []
        for b_idx, blob in enumerate(blob_list, start=1):
            # blob.name is the full object name inside the bucket.
            gcs_source_uri = f"gs://{src_bucket_name}/{blob.name}"
            # Destination is a prefix; the service appends the output suffix.
            gcs_destination_uri = f"gs://{dst_bucket_name}/{object_name(dst_directory, blob.name)}"

            input_config = vision.InputConfig(
                gcs_source=vision.GcsSource(uri=gcs_source_uri),
                mime_type='application/pdf',
            )
            output_config = vision.OutputConfig(
                gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
                batch_size=1,
            )

            logging.info(f"started ocr for {b_idx} of {total} files")
            async_requests.append(
                vision.AsyncAnnotateFileRequest(
                    features=[feature],
                    input_config=input_config,
                    output_config=output_config,
                )
            )

        return client.async_batch_annotate_files(requests=async_requests)

    def create_text_files(storage_client, gcs_path):
        """Write the first-page OCR text of every result json to a .txt object."""
        bucket_name, directory, _ = dismantle_path(gcs_path)
        bucket = storage_client.bucket(bucket_name)
        blob_list = list_blobs(bucket, directory, OCR_OUTPUT_SUFFIX)
        total = len(blob_list)

        for b_idx, blob in enumerate(blob_list, start=1):
            logging.info(f"creating {b_idx} of {total} text files")
            response = json.loads(blob.download_as_string())
            page_responses = response.get('responses') or [{}]
            # 'fullTextAnnotation' is absent when nothing was recognised.
            annotation = page_responses[0].get('fullTextAnnotation') or {}
            txt_path = blob.name[:-len(OCR_OUTPUT_SUFFIX)] + ".txt"
            bucket.blob(txt_path).upload_from_string(annotation.get('text', ""))

        logging.info("finished creating text files")

    def create_jsonl(storage_client, gcs_path, mime_type, filename):
        """Create the batch-prediction input listing all .txt files; return its gs:// path."""
        bucket_name, directory, _ = dismantle_path(gcs_path)
        bucket = storage_client.bucket(bucket_name)

        jsonl_content = "".join(
            json.dumps(
                {
                    "content": f"gs://{bucket_name}/{blob.name}",
                    "mimeType": mime_type,
                }
            ) + "\n"
            for blob in list_blobs(bucket, directory, ".txt")
        )

        target = object_name(directory, filename)
        bucket.blob(target).upload_from_string(jsonl_content)
        logging.info(f"uploaded jsonl {filename} to bucket {bucket_name}")
        return f"gs://{bucket_name}/{target}"

    def preprocess_ocr(src_path, dst_path, jsonl_filename):
        """Run OCR, wait for completion and build the text jsonl."""
        storage_client = storage.Client()
        ocr_operation = ocr(storage_client, src_path, dst_path)

        while not ocr_operation.done():
            logging.info("wait for ocr to finish")
            time.sleep(5)
        # Already finished; raises if the operation ended with an error.
        ocr_operation.result()

        create_text_files(storage_client, dst_path)
        return create_jsonl(storage_client, gcs_path=dst_path, mime_type="text/plain", filename=jsonl_filename)

    # PIPELINE COMPONENT MAIN CODE:
    logging.basicConfig(level=logging.INFO)

    logging.info(f"Starting the processing of pdfs with the OCR functionality of Google Vision API.")

    # save everything in the same bucket unless a destination was given
    dst_path = dst_path or src_path
    jsonl_filename_tcn = f"tcn_{uuid}.jsonl"

    # create ocr
    try:
        jsonl_path_tcn = preprocess_ocr(src_path, dst_path, jsonl_filename_tcn)
    except Exception as e:
        # Log the traceback, then fail the step so the pipeline does not
        # continue with missing inputs.
        logging.error(f"Error in component perform_ocr_on_pdfs: {to_trace_str(e)}")
        raise

    logging.info(f"created text classification input {jsonl_path_tcn}")
    logging.info(f"PDF files have succesfully processed with Google Vison OCR")
        


### Component 2: PDF to PNG conversion

In [43]:
@component()
def transform_pdfs_into_png():
    """Placeholder for the PDF to PNG conversion component (not implemented; no-op)."""
    pass

### Component 3: Creating a BigQuery dataset to save results

In [44]:
@component()
def create_bq_results_dataset():
    """Placeholder for creating the BigQuery results dataset (not implemented; no-op)."""
    pass

### Component 4.1: Creating text classification results table

In [45]:
@component()
def create_text_class_results_table():
    """Placeholder for creating the text classification results table (not implemented; no-op)."""
    pass

### Component 4.2: Performing text classification

In [46]:
@component()
def text_class_predict():
    """Placeholder for the text classification prediction step (not implemented; no-op)."""
    pass

### Component 4.3: Storing text classification results

In [47]:
@component()
def store_text_class_results():
    """Placeholder for storing the text classification results (not implemented; no-op)."""
    pass

### Component 5.1: Creating image classification results table

In [48]:
@component()
def create_img_class_results_table():
    """Placeholder for creating the image classification results table (not implemented; no-op)."""
    pass

### Component 5.2: Performing image classification

In [49]:
@component()
def img_class_predict():
    """Placeholder for the image classification prediction step (not implemented; no-op)."""
    pass

### Component 5.3: Storing image classification results

In [50]:
@component()
def store_img_class_results():
    """Placeholder for storing the image classification results (not implemented; no-op)."""
    pass

### Component 6.1: Creating object detection results table

In [51]:
@component()
def create_obj_detection_results_table():
    """Placeholder for creating the object detection results table (not implemented; no-op)."""
    pass

### Component 6.2: Performing object detection

In [52]:
@component()
def obj_detection_predict():
    """Placeholder for the object detection prediction step (not implemented; no-op)."""
    pass

### Component 6.3: Storing object detection results

In [53]:
@component()
def store_obj_detection_results():
    """Placeholder for storing the object detection results (not implemented; no-op)."""
    pass

## Creating and Compiling the Vertex AI Pipeline

In [54]:
@kfp.dsl.pipeline(name=PIPELINE_NAME, 
                  description='Pipeline that process patents pdf files.',
                  pipeline_root=PIPELINE_ROOT)
def pipeline(src_path: str = src_path,
             dst_path: str = dst_path,
             uuid: str = UUID):
    """Wire the document-processing components into one pipeline.

    Preprocessing (OCR -> PNG conversion -> results dataset) runs in sequence;
    the text classification, image classification and object detection
    branches each start after the results dataset step and are independent of
    each other.

    Args
        src_path: bucket or directory containing the pdfs (defaults to the
            notebook-level ``src_path``)
        dst_path: bucket or directory for generated artefacts (defaults to
            the notebook-level ``dst_path``)
        uuid: run identifier used in artefact names (defaults to ``UUID``)
    """
    # Preprocessing pipeline:
    # The OCR component declares three required inputs; calling it without
    # arguments (as before) cannot build the task.
    perform_ocr_on_pdfs_task = perform_ocr_on_pdfs(
        src_path=src_path,
        dst_path=dst_path,
        uuid=uuid,
    )

    transform_pdfs_into_png_task = transform_pdfs_into_png()
    transform_pdfs_into_png_task.after(perform_ocr_on_pdfs_task)

    create_bq_results_dataset_task = create_bq_results_dataset()
    create_bq_results_dataset_task.after(transform_pdfs_into_png_task)

    # Text classification pipeline:
    create_text_class_results_table_task = create_text_class_results_table()
    create_text_class_results_table_task.after(create_bq_results_dataset_task)

    text_class_predict_task = text_class_predict()
    text_class_predict_task.after(create_text_class_results_table_task)

    store_text_class_results_task = store_text_class_results()
    store_text_class_results_task.after(text_class_predict_task)

    # Image classification pipeline:
    create_img_class_results_table_task = create_img_class_results_table()
    create_img_class_results_table_task.after(create_bq_results_dataset_task)

    img_class_predict_task = img_class_predict()
    img_class_predict_task.after(create_img_class_results_table_task)

    store_img_class_results_task = store_img_class_results()
    store_img_class_results_task.after(img_class_predict_task)

    # Object detection pipeline:
    create_obj_detection_results_table_task = create_obj_detection_results_table()
    create_obj_detection_results_table_task.after(create_bq_results_dataset_task)

    obj_detection_predict_task = obj_detection_predict()
    obj_detection_predict_task.after(create_obj_detection_results_table_task)

    store_obj_detection_results_task = store_obj_detection_results()
    store_obj_detection_results_task.after(obj_detection_predict_task)
    
    

In [55]:
# Make sure the output directory exists. makedirs with exist_ok avoids the
# check-then-create race of isdir()+mkdir() and also creates missing parents.
os.makedirs(LOCAL_PIPELINE_PATH, exist_ok=True)

# Compile the pipeline function into the JSON job spec submitted below.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=LOCAL_PIPELINE_JSON
)

## Launching the Vertex AI Pipeline

In [56]:
# Instantiating an API client object:
# TODO: use the new Vertex AI.
# Client used in the next cell to submit the compiled job spec,
# bound to the project and region set in the notebook inputs.
api_client = AIPlatformClient(
    project_id=PROJECT,
    region=REGION,
)

In [57]:
# Submit the compiled job spec as a new pipeline run.
response = api_client.create_run_from_job_spec(
    LOCAL_PIPELINE_JSON,
    pipeline_root=PIPELINE_ROOT,
)