In [1]:
#TODO LIST
#1- Change reading files to filter pdfs.
#2- Access to the image classification end point
#3- Create a JSON input for the endpoint (with the image)
#4- Retrieve prediction
#5- Store the prediction in a new BQ table


# Object Predictions Pipeline

## 1. Notebook Configuration


### 1.1. Loading Necessary Libraries

In [29]:
# General libraries:
import os
import io
#import glob
import numpy as np
import pandas as pd

# Dealing with files:
!pip install jsonlines
import jsonlines
import json

# Dealing with images:
#import cv2
#import matplotlib.pyplot as plt

# Google APIs:
from google.cloud import storage
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# Libraries for string filtering:
from fnmatch import fnmatch

# Libraries for image encoding
import io
import base64

# Specific PDF libraries:
#!conda install -c conda-forge poppler
!sudo apt-get install -y poppler-data
!sudo apt-get install -y poppler-utils
!pip install pdf2image
from pdf2image import convert_from_path

Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-data is already the newest version (0.4.9-2).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-utils is already the newest version (0.71.0-5).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


### 1.2. Setting Notebook Inputs
#### 1.2.1 Google Cloud Settings

In [30]:
!gcloud config list

[ai]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


In [31]:
# Google Cloud project, region and bucket used by the whole notebook.
# PROJECT and REGION match the values printed by `gcloud config list` above.
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'
BUCKET = 'qwiklabs-gcp-00-373ac55d0e0a'

# Local working folder and name of the CSV file that stores the parsed results:
TEMP_FOLDER = './temp'
RESULTS_CSV = 'img_class_results.csv'
# Only 'BATCH' is implemented. 'ONLINE' would be another possibility; it is
# only checked in section 3.2 to decide whether the PDFs are encoded as images.
PREDICTION_MODE = 'BATCH'


#PDF_FOLDER = os.path.join(TEMP_FOLDER, 'pdf')
#PNG_FOLDER = os.path.join(TEMP_FOLDER, 'png')
#CSV_FOLDER = os.path.join(TEMP_FOLDER, 'csv')

# Exporting the settings as environment variables
# (presumably so that `!` shell commands can read them -- TODO confirm):
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET

In [38]:
!gcloud ai endpoints list

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ENDPOINT_ID          DISPLAY_NAME
3651416543192940544  text_classification_endpoint_V1
2074030773706424320  ObjectDetectionV1
7257673944809865216  image_classification_endpoint
6387142210587983872  mnist_endpoint_20210802_154025
4739387696923803648  babyweight_endpoint_20210730_125424
5884427902182752256  babyweight_endpoint_20210730_124945
61273583992700928    pipelines-EndpointCreate-20210727125838


In [39]:
!gcloud ai models list

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
2393478483993952256  text_classification
3409814256151953408  object_detection_patent_figures
8925034949820547072  docprocessing_2021811144149
2880236679656898560  hacker_news_titles_automl
886021654033989632   mnist_20210802_154025
2243012516756062208  babyweight_model_20210730_125424
8763802564723474432  babyweight_model_20210730_124945
5534440156922118144  babyweight_automl_2021728151029
5491655960462098432  pipelines-ModelUpload-20210727125838
657604710433292288   taxifare-20210721144351


#### 1.2.2. Image Classification Endpoint

In [4]:
# Settings of the image classification endpoint.
# The ID is the one listed as 'image_classification_endpoint' in the
# `gcloud ai endpoints list` output above.
IC_ENDPOINT_ID="7257673944809865216"
# Project number as it appears in the service account shown by `gcloud config list`:
IC_PROJECT_ID="136021895401"
# NOTE(review): looks like a placeholder for the path of the JSON request file -- confirm.
IC_INPUT_DATA_FILE="INPUT-JSON"

# Example of a request body with a single instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

#### 1.2.3. Object Detection Endpoint

In [40]:
# Settings of the object detection endpoint.
# The ID is the one listed as 'ObjectDetectionV1' in the
# `gcloud ai endpoints list` output above.
OD_ENDPOINT_ID="2074030773706424320"
# OD_PROJECT_ID="136021895401"
# NOTE(review): looks like a placeholder for the path of the JSON request file -- confirm.
OD_INPUT_DATA_FILE="INPUT-JSON"

# Example of a request body with a single instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

## 2. Auxiliary Functions

In [41]:
def get_bucket_file_list(bucket_name, fname_template='*'):
    '''!@brief List the blob names of a bucket that match a name template.

    @param bucket_name (string) Name of the bucket to inspect.
    @param fname_template (string) Unix shell-style wildcard pattern each
    blob name is matched against. The default '*' keeps every blob.
    For more info: https://docs.python.org/3/library/fnmatch.html

    @return (list of strings) Names of the blobs of the bucket whose name
    matches the template, in the order the listing yields them.
    '''
    # Note: Client.list_blobs requires at least package version 1.17.0.
    client = storage.Client()

    # Listing and filtering are done in a single pass over the bucket:
    return [
        blob_name
        for blob_name in (blob.name for blob in client.list_blobs(bucket_name))
        if fnmatch(blob_name, fname_template)
    ]

In [42]:
def clean_bucket(bucket_name, filter =['xxsdsdsds']):
    '''!@brief Function that deletes a list of blobs from a bucket.

    @param bucket_name (string) Name of the bucket that holds the blobs.
    @param filter (list of strings) Names of the blobs to delete. One
    delete request is issued per name, in order, and a confirmation line
    is printed after each deletion.
    NOTE(review): the default value looks like a placeholder name rather
    than a real blob -- callers are expected to pass their own list.
    '''
    target_bucket = storage.Client().bucket(bucket_name)

    for blob_name in filter:
        target_bucket.blob(blob_name).delete()
        print("Blob {} deleted.".format(blob_name))
        

In [43]:
def download_files_from_bucket(bucket_name, dest_folder, source_folder="labeled_patents/pdf/", ext = ".pdf" ):
    '''@brief! Function that downloads a list of files from a bucket.

    Only the blobs whose name starts with `source_folder` and ends with
    `ext` are downloaded. Each file is stored directly in `dest_folder`
    under its base name, i.e. the folder structure of the bucket is not
    reproduced locally and same-named files overwrite each other.

    @param bucket_name: (string) Bucket name.
    @param dest_folder: (string) Folder where files are downloaded. It is
    created when it does not exist.
    @param source_folder: (string) Prefix the blob names must start with.
    @param ext: (string) Suffix the blob names must end with.

    @return (list of strings) Local paths of the downloaded files.
    '''
    # exist_ok avoids the check-then-create race of a separate exists() test:
    os.makedirs(dest_folder, exist_ok=True)

    # Instantiating client:
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    new_file_lst = []
    # The prefix is filtered server-side, so the whole bucket is not listed:
    for blob in bucket.list_blobs(prefix=source_folder):
        if not blob.name.endswith(ext):
            continue

        name = os.path.basename(blob.name)
        if not name:
            # "Folder" placeholder blobs end with '/' and have no file name;
            # downloading them would target the destination folder itself.
            continue

        # Saving blob into the destination folder:
        new_fname = os.path.join(dest_folder, name)
        blob.download_to_filename(new_fname)
        new_file_lst.append(new_fname)

    # TODO: A check of the downloaded files should be performed!! Maybe it is
    # just reading the files of the folder, since if it is a temporary folder
    # it is created empty every time the pipeline is executed:
    #os.listdir(dest_folder) or similar
    print('Number of files downloaded: {:d}'.format(len(new_file_lst)))

    return new_file_lst

In [9]:
def encode_images_in_path(path):
    '''@brief! Function that encodes the first page of each PDF of a folder
    as a base64 PNG, to be used as instance for an AutoML model.

    Every regular file found directly inside `path` is handed to the PDF
    converter (sub-folders are ignored).
    NOTE(review): non-PDF files are not filtered out, so the folder is
    expected to contain PDFs only.

    @param path (string) Folder that contains the PDF files.

    @return (list of strings) One base64 string (UTF-8 decoded) per file,
    holding the PNG rendering of its first page, in os.listdir order.
    '''
    file_lst = [os.path.join(path, file) for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]

    encoded_img_lst = []
    for file in file_lst:
        # Only the first page is going to be analyzed, so only that page is
        # rendered instead of converting the whole document:
        first_page = convert_from_path(file, first_page=1, last_page=1)[0]

        # Serializing the page as PNG in memory and encoding it as base64:
        img_byte_arr = io.BytesIO()
        first_page.save(img_byte_arr, format='PNG')
        encoded_img_lst.append(base64.b64encode(img_byte_arr.getvalue()).decode("utf-8"))

    return encoded_img_lst

In [44]:
# Functions to create JSONL files for instance creation:
# WATCH OUT!! Hardcoded values!!
def save_jsonl(fp, json_file):
    '''!@brief Function that appends one JSON document as a line of a file.

    The document is serialized with json.dumps, terminated with a newline,
    encoded as UTF-8 and appended to the file (which is created when it
    does not exist). I/O errors are reported on stdout, not raised.
    NOTE(review): the original comment said the file "needs .jl suffix";
    nothing in this function enforces it.

    @param fp (string) Path of the JSONL file.
    @param json_file (JSON serializable object) Document to append.
    '''
    # Imported locally: the error report below needs both modules and they
    # are not part of the notebook's import cell.
    import sys
    import traceback

    payload = (json.dumps(json_file) + "\n").encode('utf8')
    try:
        with open(fp, "ab") as f:
            f.write(payload)
    except Exception as e:
        print(f"[ERROR]: {e}\n{sys.exc_info()}\n{traceback.format_exc()}")

def create_jsonl(gcs_img_path,fp):
    '''!@brief Function that writes the JSONL instance file for a batch
    prediction: one line per PNG image stored in the bucket.

    WATCH OUT!! Hardcoded values: the images are looked up in the bucket
    BUCKET under the prefix "labeled_patents/images".

    @param gcs_img_path (string) Currently unused; the location of the
    images is hardcoded (see above). Kept for interface compatibility.
    @param fp (string) Path of the JSONL file to create. Any previous
    content is discarded.
    '''
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET)

    # Collecting the image names first, so the output file is only touched
    # once the listing succeeded:
    png_names = [blob.name
                 for blob in bucket.list_blobs(prefix="labeled_patents/images")
                 if blob.name.endswith(".png")]

    # save_jsonl appends, so the file is emptied first; otherwise every
    # re-run would duplicate all the instances.
    with open(fp, "wb"):
        pass

    for filename in png_names:
        temp_json = {"content": f"gs://{BUCKET}/{filename}", "mimeType": "image/png"}
        save_jsonl(fp, temp_json)

In [45]:
# Launching batch predictions:
def create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images_icn.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds'):
    '''!@brief Function that runs a batch prediction job with a model and
    waits for it.

    @param project (string) Project the AI Platform SDK is initialized with.
    @param location (string) Region the AI Platform SDK is initialized with.
    @param model_resource_name (string) Identifier passed to aiplatform.Model.
    @param job_display_name (string) Display name given to the job.
    @param gcs_source (string) GS URI of the JSONL file with the instances.
    @param gcs_destination (string) GS URI prefix where the predictions
    are written.

    @return The batch prediction job object returned by Model.batch_predict.
    '''
    aiplatform.init(project=project, location=location)

    # The job is requested synchronously and then explicitly waited for:
    job = aiplatform.Model(model_resource_name).batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=True,
    )
    job.wait()

    # Reporting name, resource name and final state, one per line:
    for field in ('display_name', 'resource_name', 'state'):
        print(getattr(job, field))

    return job

In [57]:
def read_imgclass_results_from_jsonl(filename):
    '''!@brief Function that parses a JSONL file of batch predictions into
    a dataframe with the first prediction of each image.

    Each line is expected to hold an 'instance' (with the GS URI of the
    image in 'content') and a 'prediction' with the parallel lists
    'displayNames', 'confidences' and 'bboxes'. Only the first entry of
    those lists is kept (in the sample output of this notebook the lists
    are sorted by decreasing confidence). Lines without any prediction are
    skipped.
    NOTE(review): each bbox is unpacked as (x1, x2, y1, y2) -- presumably
    [xMin, xMax, yMin, yMax]; confirm against the model documentation.

    @param filename (string) Path of the JSONL file with the predictions.

    @return (pandas.DataFrame) One row per image with the columns
    image_name, label, confidence, x1, y1, x2 and y2.
    '''
    columns = ['image_name', 'label', 'confidence', 'x1', 'y1', 'x2', 'y2']

    # Rows are collected in a list and the dataframe is built once, instead
    # of growing it cell by cell:
    records = []
    with jsonlines.open(filename, 'r') as file:
        for line in file:
            prediction = line['prediction']
            if not prediction['confidences']:
                # Nothing was predicted for this image.
                continue

            _, image_name = os.path.split(line['instance']['content'])
            x1, x2, y1, y2 = prediction['bboxes'][0]
            records.append({'image_name': image_name,
                            'label': prediction['displayNames'][0],
                            'confidence': prediction['confidences'][0],
                            'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2})

    return pd.DataFrame(records, columns=columns)

In [None]:
{"instance":
 {"content":"gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images/espacenet_en77.png","mimeType":"image/png"},
 "prediction":{"ids":["4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792","4025440712148385792"],"displayNames":["figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure","figure"],"confidences":[0.76443034,0.52859145,0.061091945,0.047826927,0.04333216,0.041156087,0.024904016,0.016526928,0.0154661825,0.009587449,0.0059969747,0.0049217422,0.003716267,0.0018125111,0.0014895669,9.770558E-4,6.6138944E-4,5.448992E-4,5.160438E-4,4.719642E-4,3.8369338E-4,1.9644677E-4,1.17846575E-4,9.965644E-5,9.641395E-5,4.864133E-5,4.8306472E-5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"bboxes":[[0.52033967,0.9044393,0.5511127,0.86246306],[0.41000938,0.9396043,0.53035796,0.93202925],[0.0,0.83526015,0.75827837,0.9620595],[0.21463868,1.0,0.59826696,0.8279809],[0.0037953723,0.84061676,0.8309676,0.9734295],[0.0,0.56297934,0.7997256,0.95936674],[0.3143646,1.0,0.7920224,0.9654812],[0.2724947,1.0,0.8544905,
0.9779967],[0.19053939,1.0,0.6729741,0.9295645],[0.050835077,0.5335755,0.0,0.15947065],[0.05325192,0.77883846,0.6753786,0.8990497],[0.006186412,0.41803375,0.0,0.24027358],[0.0,0.9708189,0.001218386,0.13738793],[0.0,0.91130924,0.0,0.31182438],[0.37795317,0.95148355,0.5103226,0.7134304],[0.11684726,0.81253517,0.58722883,0.8011121],[0.13189778,0.9944205,0.0,0.19389042],[0.3499291,0.9611198,0.46038198,0.65194726],[0.010287788,0.2654363,0.0,0.4094497],[0.2323548,0.95879376,0.42165536,0.6178679],[0.10978146,0.76506,0.51294935,0.7306119],[0.24385671,0.953726,0.06038098,0.36443573],[0.07948293,0.76043177,0.070823476,0.41676348],[0.05707376,0.71713334,0.41747388,0.64191526],[0.33629754,0.9581925,0.31212947,0.5304564],[0.3047768,0.9329275,0.20252322,0.45437324],[0.041961167,0.67546207,0.3167929,0.5543516],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]]}}

In [47]:
def upload_file_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """!@brief Copy a local file into a bucket under the given object name.

    @param bucket_name (string) ID/name of the destination bucket.
    @param source_file_name (string) Path of the local file to upload.
    @param destination_blob_name (string) Name the storage object gets
    inside the bucket.
    """
    # Resolving the destination object and sending the file content to it:
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print("File {} uploaded to {}.".format(source_file_name, destination_blob_name))

In [48]:
def create_table_from_csv(dataset_name, table_name, schema_lst, csv_blob_name):
    '''!@brief Load the content of a CSV file stored in Google Storage into
    a BigQuery table of an existing dataset of the client's project.

    The first row of the CSV is skipped (it is treated as the header) and
    the function blocks until the load job finishes. The number of rows of
    the destination table is printed at the end.

    @param dataset_name (string) Name of the dataset which stores the table.
    @param table_name (string) Name of the destination table.
    @param schema_lst (list of tuples) Schema of the table. Each tuple is
    unpacked into a bigquery.SchemaField and must follow the format
    ('column name', 'field format', 'mode', 'Description').
    Example:
    schema_lst = [('col_A_name',  'STRING', 'REQUIRED', 'Description 1'),
                  ('col_B_name', 'INTEGER', 'REQUIRED', 'Description 2'),
                  ('col_C_name',   'FLOAT', 'REQUIRED', 'Description 3')]
    For more info:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
    @param csv_blob_name (string) GS URI of the CSV file.
    '''
    bq_client = bigquery.Client()

    # Fully qualified ID of the destination table: project.dataset.table
    table_id = "{}.{}.{}".format(bq_client.project, dataset_name, table_name)

    # Load job settings: explicit schema, CSV source, header row skipped.
    job_config = bigquery.LoadJobConfig(
        schema=[bigquery.SchemaField(*field) for field in schema_lst],
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV)

    # Launching the load job and blocking until it is completed:
    bq_client.load_table_from_uri(csv_blob_name, table_id, job_config=job_config).result()

    # Asking the API for the table to report how many rows it holds now:
    print("Loaded {} rows.".format(bq_client.get_table(table_id).num_rows))

## 3. Pipeline Functional

### 3.1. Download PDFs to a temporary folder

In [16]:
# Making sure the temporary folder is in place before anything is downloaded:
temp_folder_missing = not os.path.exists(TEMP_FOLDER)
if temp_folder_missing:
    os.mkdir(TEMP_FOLDER)

# Fetching every PDF stored under labeled_patents/pdf/ into the temporary folder:
file_lst = download_files_from_bucket(
    BUCKET,
    TEMP_FOLDER,
    source_folder="labeled_patents/pdf/",
    ext=".pdf",
)

Number of files downloaded: 403


### 3.2. Transforming PDFs into PNGs (Only for Online prediction)

In [17]:
if 'ONLINE'==PREDICTION_MODE:
    # Encoding the first page of each PDF downloaded to the temporary folder
    # as a base64 image (the folder is TEMP_FOLDER; no `dest_folder`
    # variable exists at notebook level):
    imgs = encode_images_in_path(TEMP_FOLDER)

### 3.3. Cleaning old predictions

In [18]:
# Listing the JSONL files whose blob name contains 'obj_preds' (the output
# prefix used for the batch job in section 3.4) and deleting them, so that
# the results of previous runs are not downloaded again later on:
filelist = get_bucket_file_list(BUCKET, fname_template='*obj_preds*.jsonl')
clean_bucket(BUCKET, filelist)

### 3.4. Performing predictions in the cloud

In [19]:
# Creating the batch of instances to perform a prediction:
import json
gcs_img_path = f"gs:/{PROJECT}/{BUCKET}/labeled_patents/images"
fp = "obj_inputs.jsonl"
        
# Creating the JSONL file with all the instances:
create_jsonl(gcs_img_path, fp)

# Uploading the JSONL file to a bucket:
!gsutil -m cp ./obj_inputs.jsonl gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents

Copying file://./obj_inputs.jsonl [Content-Type=application/octet-stream]...
/ [1/1 files][ 44.8 KiB/ 44.8 KiB] 100% Done                                    
Operation completed over 1 objects/44.8 KiB.                                     


In [51]:
!echo $OD_ENDPOINT_ID

2074030773706424320


In [52]:
!gcloud ai models list

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
2393478483993952256  text_classification
3409814256151953408  object_detection_patent_figures
8925034949820547072  docprocessing_2021811144149
2880236679656898560  hacker_news_titles_automl
886021654033989632   mnist_20210802_154025
2243012516756062208  babyweight_model_20210730_125424
8763802564723474432  babyweight_model_20210730_124945
5534440156922118144  babyweight_automl_2021728151029
5491655960462098432  pipelines-ModelUpload-20210727125838
657604710433292288   taxifare-20210721144351


In [None]:
# Launching predictions with the object detection model (listed as
# 'object_detection_patent_figures' by `gcloud ai models list`). Project,
# region and bucket come from the notebook settings instead of being repeated:
create_batch_prediction_job_sample(
    project=PROJECT,
    location=REGION,
    model_resource_name='3409814256151953408',
    job_display_name='obj_predictions',
    gcs_source=f'gs://{BUCKET}/labeled_patents/obj_inputs.jsonl',
    gcs_destination=f'gs://{BUCKET}/labeled_patents/obj_preds')


INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/136021895401/locations/us-central1/batchPredictionJobs/4312761792183926784
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/136021895401/locations/us-central1/batchPredictionJobs/4312761792183926784')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/4312761792183926784?project=136021895401
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/136021895401/locations/us-central1/batchPredictionJobs/4312761792183926784 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/136021895401/locations/us-central1/batchPredictionJobs/4312761792183926784 current state:
JobState.JOB_STAT

### 3.5. Downloading the JSONL files with the predictions

In [54]:
# Downloading the results files from Google Storage. The prefix below is the
# destination given to the batch prediction job:
#gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/obj_preds'
source_folder = 'labeled_patents/obj_preds' 
ext = ".jsonl"
# NOTE(review): files are saved in TEMP_FOLDER by base name only, so same-named
# files from different sub-folders of the prefix would overwrite each other --
# confirm that old predictions are always cleaned first (section 3.3).
resfile_lst = download_files_from_bucket(BUCKET, TEMP_FOLDER, source_folder, ext)

# Showing the local path of each downloaded file, one per line:
print('\nDownloaded files:')
print(*resfile_lst, sep='\n')

Number of files downloaded: 5

Downloaded files:
./temp/predictions_00001.jsonl
./temp/predictions_00002.jsonl
./temp/predictions_00003.jsonl
./temp/predictions_00004.jsonl
./temp/predictions_00005.jsonl


### 3.6. Parsing the predictions from the JSONL

In [58]:
# Parsing the JSONL files, one dataframe per file:
partial_dfs = [read_imgclass_results_from_jsonl(file) for file in resfile_lst]
if not partial_dfs:
    # Failing with a clear message instead of an undefined `res_df` below.
    raise FileNotFoundError('No prediction files were downloaded, there is nothing to parse.')

# Stacking all the results (DataFrame.append no longer exists in pandas >= 2.0):
res_df = pd.concat(partial_dfs, ignore_index=True)

print('Number of results read: {:d}'.format(res_df.shape[0]))
res_df.head(5)

Number of results read: 403


Unnamed: 0,image_name,label,confidence,x1,y1,x2,y2
0,espacenet_en77.png,figure,0.76443,0.52034,0.551113,0.904439,0.862463
1,us_027.png,figure,0.820037,0.249958,0.676503,0.773016,0.915524
2,computer_vision_12.png,figure,0.863901,0.175164,0.720555,0.748335,0.931796
3,espacenet_en35.png,figure,0.162117,0.319214,0.625916,0.895415,0.902939
4,us_087.png,figure,0.85913,0.324708,0.697885,0.642282,0.945174


In [59]:
# Saving the results dataframe as a CSV file in the temporary folder. The
# dataframe index is left out (index=False); the header row that is written
# is the one skipped later by create_table_from_csv (skip_leading_rows=1).
res_df.to_csv(os.path.join(TEMP_FOLDER, RESULTS_CSV), index=False)

### 3.7. Upload results to a BQ table

In [61]:
# Uploading the CSV file to a GS bucket. Object names always use '/' as
# separator, so the name is joined explicitly and not with os.path.join
# (which uses the separator of the local operating system):
upload_file_to_bucket(bucket_name=BUCKET, 
                      source_file_name=os.path.join(TEMP_FOLDER, RESULTS_CSV), 
                      destination_blob_name='/'.join(('labeled_patents', 'obj_preds', RESULTS_CSV)))

File ./temp/img_class_results.csv uploaded to labeled_patents/obj_preds/img_class_results.csv.


In [70]:
# Storing the CSV content into a BQ table.
# The CSV was uploaded to labeled_patents/obj_preds in the previous cell, so
# that is the file loaded here (the former 'img_class_preds' path pointed to
# a different CSV). The schema lists the seven columns of the CSV in file order.
# NOTE(review): the table gets its own name because its schema differs from a
# three-column 'image_classification_results' table -- confirm the name.
dataset_name = 'labeled_patents'
table_name = 'object_detection_results'
schema_lst = [('image_name', 'STRING', 'REQUIRED', 'Name of the image analyzed.'), 
              ('label',      'STRING', 'REQUIRED', 'Label of the first detected object.'),
              ('confidence',  'FLOAT', 'REQUIRED', 'Confidence of the prediction.'),
              ('x1',          'FLOAT', 'REQUIRED', 'Bounding box coordinate x1.'),
              ('y1',          'FLOAT', 'REQUIRED', 'Bounding box coordinate y1.'),
              ('x2',          'FLOAT', 'REQUIRED', 'Bounding box coordinate x2.'),
              ('y2',          'FLOAT', 'REQUIRED', 'Bounding box coordinate y2.')]
csv_blob_name = 'gs://{}/{}'.format(BUCKET, '/'.join(('labeled_patents', 'obj_preds', RESULTS_CSV)))

create_table_from_csv(dataset_name, table_name, schema_lst, csv_blob_name)

Loaded 403 rows.


### 3.8. Cleaning temporary folder

In [None]:
# Deleting the temporary folder together with the downloaded files it holds
# (os.rmdir only removes empty folders):
import shutil
shutil.rmtree(TEMP_FOLDER)