In [1]:
#TODO LIST
#1- Change reading files to filter pdfs.
#2- Access to the image classification end point
#3- Create a JSON input for the endpoint (with the image)
#4- Retrieve prediction
#5- Store the prediction in a new BQ table


# Predictions Pipeline

## 1. Notebook Configuration


### 1.1. Loading Necessary Libraries

In [2]:
# General libraries:
import os
import io
#import glob
#import numpy as np
#import pandas as pd

# Dealing with images:
#import cv2
#import matplotlib.pyplot as plt

# Google APIs:
from google.cloud import storage
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# Libraries for string filtering:
from fnmatch import fnmatch

# Libraries for image encoding
import io
import base64

# Specific PDF libraries:
#!conda install -c conda-forge poppler
# !sudo apt-get install -y poppler-data
# !sudo apt-get install -y poppler-utils
# !pip install pdf2image
from pdf2image import convert_from_path

### 1.2. Setting Notebook Inputs
#### 1.2.1 Google Cloud Settings

In [3]:
!gcloud config list

[ai]
region = us-central1
[compute]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


In [4]:
# Google Cloud project, region and bucket settings.
# NOTE(review): the bucket name is the same string as the project id.
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'
BUCKET = 'qwiklabs-gcp-00-373ac55d0e0a'

# Local working folder (the pipeline cell further down uses the same './temp' path):
TEMP_FOLDER = './temp'
#PDF_FOLDER = os.path.join(TEMP_FOLDER, 'pdf')
#PNG_FOLDER = os.path.join(TEMP_FOLDER, 'png')
#CSV_FOLDER = os.path.join(TEMP_FOLDER, 'csv')

# Exposing the settings as environment variables of the kernel process:
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET

#### 1.2.2. Image Classification Endpoint

In [5]:
# Identifiers of the image classification endpoint.
# NOTE(review): these constants are not referenced by the code visible in this
# notebook, and IC_INPUT_DATA_FILE looks like a placeholder value - confirm before use.
IC_ENDPOINT_ID="7257673944809865216"
IC_PROJECT_ID="136021895401"
IC_INPUT_DATA_FILE="INPUT-JSON"

# Example of instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

#### 1.2.3. Object Detection Endpoint

In [6]:
# Identifiers of the object detection endpoint.
# NOTE(review): these constants are not referenced by the code visible in this
# notebook, and OD_INPUT_DATA_FILE looks like a placeholder value - confirm before use.
OD_ENDPOINT_ID="2074030773706424320"
OD_PROJECT_ID="136021895401"
OD_INPUT_DATA_FILE="INPUT-JSON"

# Example of instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

## 2. Auxiliary Functions

In [35]:
def download_files_from_bucket(bucket_name, dest_folder,
                               prefix="labeled_patents/pdf/", suffix=".pdf"):
    '''@brief! Function that downloads the matching files of a bucket folder.

    Only the blobs whose name starts with `prefix` and ends with `suffix` are
    downloaded. Every file is stored directly inside `dest_folder` under its
    base name, so the folder structure of the bucket is not reproduced.

    @param: bucket_name: (string) Bucket name.
    @param: dest_folder: (string) Folder where files are downloaded. It is
            created when it does not exist.
    @param: prefix: (string) Beginning of the blob names to be downloaded.
    @param: suffix: (string) Ending of the blob names to be downloaded.
    @return: (bool) Always True.
    '''
    # exist_ok avoids the gap between checking the folder and creating it:
    os.makedirs(dest_folder, exist_ok=True)

    # Instantiating client:
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # The prefix is passed to the listing call so that the rest of the bucket
    # is not listed; the suffix still has to be checked on each returned name:
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if blob.name.endswith(suffix)]

    # Saving each blob into the destination folder:
    new_file_lst = []
    for blob in blob_list:
        _, name = os.path.split(blob.name)
        new_fname = os.path.join(dest_folder, name)
        blob.download_to_filename(new_fname)
        new_file_lst.append(new_fname)

    # TODO: A check of the downloaded files should be performed. Listing the
    # destination folder may be enough if it is created empty on every run
    # of the pipeline (os.listdir(dest_folder) or similar).
    print('Number of files downloaded: {:d}'.format(len(new_file_lst)))

    return True

In [38]:
def encode_images_in_path(path):
    '''@brief! Function to encode the first page of each PDF of a folder as a
    base64 PNG string, to be used as instance for an AutoML model.

    Entries of the folder that are not regular files with a `.pdf` extension
    (any letter case) are skipped instead of being handed to the PDF renderer.

    @param: path: (string) Folder containing the PDF files to be transformed.
    @return: (list of strings) One base64 string per PDF, ordered by file path.
    '''
    # Sorted so that the output order does not depend on the arbitrary order
    # in which os.listdir returns the entries:
    file_lst = sorted(
        os.path.join(path, file)
        for file in os.listdir(path)
        if file.lower().endswith('.pdf') and os.path.isfile(os.path.join(path, file))
    )

    encoded_img_lst = []
    for file in file_lst:
        # Only the first page is going to be analyzed, so only that page is rendered:
        pages = convert_from_path(file, first_page=1, last_page=1)
        img_byte_arr = io.BytesIO()
        pages[0].save(img_byte_arr, format='PNG')
        encoded_img_lst.append(base64.b64encode(img_byte_arr.getvalue()).decode("utf-8"))

    return encoded_img_lst

## 3. Functional Pipeline

In [44]:
# Local folder holding the PDF files to be encoded:
dest_folder = "./temp"
# The download step is currently disabled, so the files must already be in
# dest_folder. To enable it again, run the encoding only when this succeeds:
#   if download_files_from_bucket(BUCKET, dest_folder):
# Encode the first page of the files in dest_folder into base64 PNG strings:
imgs = encode_images_in_path(dest_folder)

In [None]:
# TODO: save images to temp
# TODO: upload to storage

In [51]:

import json
# Cloud Storage URIs have the form gs://<bucket>/<object path>: two slashes
# after the scheme and no project id in the path.
gcs_img_path = f"gs://{BUCKET}/labeled_patents/images"
# Local JSON Lines file that create_jsonl fills with one record per image:
fp = "images_icn.jsonl"

def save_jsonl(fp, json_file):
    '''@brief! Function that appends one JSON record as a line of a JSON Lines file.

    The file is opened in append mode, so it is created when missing and any
    previous content is kept. A failure while writing is reported on the
    standard output instead of being raised.

    @param: fp: (string) Path of the JSON Lines file. The original note on
            this function says it "needs .jl suffix", while the notebook
            passes a `.jsonl` path - TODO confirm which one is required.
    @param: json_file: JSON-serializable object stored as one line.
    '''
    # Imported here because the error report below needs both modules and the
    # notebook does not import them anywhere else; without them the handler
    # itself failed with a NameError that hid the real error.
    import sys
    import traceback

    record = (json.dumps(json_file) + "\n").encode('utf8')
    try:
        with open(fp, "ab") as f:
            f.write(record)
    except Exception as e:
        print(f"[ERROR]: {e}\n{sys.exc_info()}\n{traceback.format_exc()}")

def create_jsonl(gcs_img_path,fp):
    '''@brief! Function that writes a JSON Lines file with one record per PNG
    image stored under "labeled_patents/images" in the bucket BUCKET.

    Each record has the form
    {"content": "gs://<BUCKET>/<blob name>", "mimeType": "image/png"}.
    An existing file at `fp` is replaced, so running the function again does
    not duplicate the records.

    @param: gcs_img_path: (string) Currently unused; kept so existing calls
            keep working. The blobs are always looked up in BUCKET.
    @param: fp: (string) Path of the local JSON Lines file to be written.
    '''
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET)

    # The prefix is passed to the listing call so that the rest of the bucket
    # is not listed; the extension still has to be checked on each name:
    blob_list = [blob.name for blob in bucket.list_blobs(prefix="labeled_patents/images")
                 if blob.name.endswith(".png")]

    # save_jsonl appends to the file, so a file left by a previous run is
    # removed first. This is done after the listing so that a failed listing
    # leaves the previous file untouched.
    if os.path.exists(fp):
        os.remove(fp)

    for filename in blob_list:
        temp_json = {"content": f"gs://{BUCKET}/{filename}", "mimeType": "image/png"}
        save_jsonl(fp, temp_json)
    
# Write the JSON Lines file listing the PNG images stored in the bucket:
create_jsonl(gcs_img_path, fp)


In [57]:
!gsutil -m cp ./images_icn.jsonl gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents

Copying file://./images_icn.jsonl [Content-Type=application/octet-stream]...
/ [1/1 files][ 44.8 KiB/ 44.8 KiB] 100% Done                                    
Operation completed over 1 objects/44.8 KiB.                                     


In [58]:
import base64

from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict


def create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images_icn.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds',
):
    '''@brief! Function that launches a batch prediction job for a model and
    prints its display name, resource name and state.

    @param: project: (string) Project passed to aiplatform.init.
    @param: location: (string) Location passed to aiplatform.init.
    @param: model_resource_name: (string) Identifier used to build the
            aiplatform.Model.
    @param: job_display_name: (string) Display name given to the job.
    @param: gcs_source: (string) Input passed as `gcs_source` to batch_predict.
    @param: gcs_destination: (string) Value passed as `gcs_destination_prefix`
            to batch_predict.
    @return: The job object returned by batch_predict.
    '''
    aiplatform.init(project=project, location=location)

    # The job is requested with sync=True and wait() is called on it afterwards:
    job = aiplatform.Model(model_resource_name).batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=True,
    )
    job.wait()

    # Each attribute is read and printed in turn, in this order:
    for field in ("display_name", "resource_name", "state"):
        print(getattr(job, field))

    return job

# Launch the image classification batch job. Project, region and bucket come
# from the settings cell instead of being repeated as literals, so the call
# follows any change made there.
create_batch_prediction_job_sample(
    project=PROJECT,
    location=REGION,
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source=f'gs://{BUCKET}/labeled_patents/images_icn.jsonl',
    gcs_destination=f'gs://{BUCKET}/labeled_patents/img_class_preds',
)

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/136021895401/locations/us-central1/batchPredictionJobs/1748166313559195648
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/136021895401/locations/us-central1/batchPredictionJobs/1748166313559195648')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/1748166313559195648?project=136021895401
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/136021895401/locations/us-central1/batchPredictionJobs/1748166313559195648 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/136021895401/locations/us-central1/batchPredictionJobs/1748166313559195648 current state:
JobState.JOB_STAT

<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f370195f850> 
resource name: projects/136021895401/locations/us-central1/batchPredictionJobs/1748166313559195648