In [1]:
#TODO LIST
#1- Change reading files to filter pdfs.
#2- Access to the image classification end point
#3- Create a JSON input for the endpoint (with the image)
#4- Retrieve prediction
#5- Store the prediction in a new BQ table


# Predictions Pipeline

## 1. Notebook Configuration


### 1.1. Loading Necessary Libraries

In [60]:
# General libraries:
import os
import io
#import glob
import numpy as np
import pandas as pd

# Dealing with files:
!pip install jsonlines
import jsonlines
import json

# Dealing with images:
#import cv2
#import matplotlib.pyplot as plt

# Google APIs:
from google.cloud import storage
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# Libraries for string filtering:
from fnmatch import fnmatch

# Libraries for image encoding
import io
import base64

# Specific PDF libraries:
#!conda install -c conda-forge poppler
!sudo apt-get install -y poppler-data
!sudo apt-get install -y poppler-utils
!pip install pdf2image
from pdf2image import convert_from_path

Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-data is already the newest version (0.4.9-2).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-utils is already the newest version (0.71.0-5).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


### 1.2. Setting Notebook Inputs
#### 1.2.1 Google Cloud Settings

In [3]:
# Print the active gcloud configuration so the account, project and regions
# used by this notebook are recorded alongside the results.
!gcloud config list

[ai]
region = us-west1
[compute]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


In [52]:
# Google Cloud project, region and storage bucket used by the pipeline cells below.
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'
BUCKET = 'qwiklabs-gcp-00-373ac55d0e0a'

# Local scratch folder for downloaded files, and the file name of the results CSV.
TEMP_FOLDER = './temp'
RESULTS_CSV = 'img_class_results.csv'
PREDICTION_MODE = 'BATCH' # 'ONLINE' would be another possibility, but it is not implemented.


# Per-type sub-folders, currently disabled:
#PDF_FOLDER = os.path.join(TEMP_FOLDER, 'pdf')
#PNG_FOLDER = os.path.join(TEMP_FOLDER, 'png')
#CSV_FOLDER = os.path.join(TEMP_FOLDER, 'csv')

# Export the settings as environment variables.
# NOTE(review): presumably meant for `!` shell commands; no visible cell reads them — confirm.
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET

#### 1.2.2. Image Classification Endpoint

In [5]:
# Identifiers for the image classification endpoint.
# NOTE(review): none of these three constants is referenced by the other cells
# visible in this notebook, and "INPUT-JSON" looks like an unfilled placeholder — confirm.
IC_ENDPOINT_ID="7257673944809865216"
IC_PROJECT_ID="136021895401"
IC_INPUT_DATA_FILE="INPUT-JSON"

# Example of a JSON request body (one instance plus prediction parameters):
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

#### 1.2.3. Object Detection Endpoint

In [6]:
# Identifiers for the object detection endpoint.
# NOTE(review): none of these three constants is referenced by the other cells
# visible in this notebook, and "INPUT-JSON" looks like an unfilled placeholder — confirm.
OD_ENDPOINT_ID="2074030773706424320"
OD_PROJECT_ID="136021895401"
OD_INPUT_DATA_FILE="INPUT-JSON"

# Example of a JSON request body (one instance plus prediction parameters):
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

## 2 Auxiliary Functions

In [7]:
def get_bucket_file_list(bucket_name, fname_template='*'):
    '''!@brief List the blob names of a bucket that match a wildcard template.

    @param bucket_name (string) Name of the bucket to inspect.
    @param fname_template (string) Pattern the full blob name is matched
    against with fnmatch, i.e. Unix shell-style wildcards. For more info:
    https://docs.python.org/3/library/fnmatch.html

    @return (list of strings) Names of the blobs in the bucket that match
    the template.
    '''
    # Note: Client.list_blobs requires at least package version 1.17.0.
    client = storage.Client()

    # Walk over every blob of the bucket and keep the names that match:
    matching_names = []
    for blob in client.list_blobs(bucket_name):
        if fnmatch(blob.name, fname_template):
            matching_names.append(blob.name)

    return matching_names

In [8]:
def clean_bucket(bucket_name, filter =['xxsdsdsds']):
    '''!@brief Delete the given blobs from a bucket, reporting each deletion.

    @param bucket_name (string) Name of the bucket holding the blobs.
    @param filter (list of strings) Exact names of the blobs to delete. The
    default is a placeholder name: calling the function without this argument
    tries to delete a blob literally called 'xxsdsdsds'.

    Errors raised by the delete call are not caught here.
    '''
    bucket = storage.Client().bucket(bucket_name)

    for blob_name in filter:
        bucket.blob(blob_name).delete()
        print(f"Blob {blob_name} deleted.")
        

In [20]:
def download_files_from_bucket(bucket_name, dest_folder, source_folder="labeled_patents/pdf/", ext = ".pdf" ):
    '''!@brief Function that downloads the files of a bucket folder.

    Every blob whose name starts with source_folder and ends with ext is
    saved into dest_folder under its base name. The bucket's folder structure
    is NOT reproduced locally, so two blobs with the same base name overwrite
    each other.

    @param bucket_name (string) Bucket name.
    @param dest_folder (string) Local folder where files are downloaded. It is
    created if it does not exist.
    @param source_folder (string) Prefix the blob names must start with.
    @param ext (string) Suffix the blob names must end with.

    @return (list of strings) Local paths of the downloaded files.
    '''
    # exist_ok avoids the check-then-create race of exists() + makedirs():
    os.makedirs(dest_folder, exist_ok=True)

    # Instantiating client:
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # The prefix is applied by the listing call itself, so blobs outside
    # source_folder are never transferred; only the suffix is checked here:
    blob_list = [blob for blob in bucket.list_blobs(prefix=source_folder)
                 if blob.name.endswith(ext)]

    # Saving blobs into the destination folder:
    new_file_lst = []
    for blob in blob_list:
        name = os.path.basename(blob.name)
        if not name:
            # A name ending in '/' has no file name to write to; skip it.
            continue
        new_fname = os.path.join(dest_folder, name)
        blob.download_to_filename(new_fname)
        new_file_lst.append(new_fname)

    # TODO: A check of the downloaded files should be performed!! Maybe it is just
    # reading the files of the folder since, if it is a temporary folder, every time
    # the pipeline is executed the folder is created empty:
    #os.listdir(dest_folder) or similar
    print('Number of files downloaded: {:d}'.format(len(new_file_lst)))

    return new_file_lst

In [10]:
def encode_images_in_path(path):
    '''!@brief Function to encode the first page of each PDF of a folder as a
    base64 PNG, to be used as instance for an AutoML model.

    Only regular files whose name ends with '.pdf' (case-insensitive) are
    processed; any other file in the folder is ignored instead of being passed
    to the PDF converter. Files are processed in sorted path order so the
    result does not depend on the order os.listdir happens to return.

    @param path (string) Folder containing the PDF files to be transformed.

    @return (list of strings) One base64-encoded (UTF-8 decoded) PNG per PDF.
    '''
    pdf_paths = sorted(
        os.path.join(path, fname)
        for fname in os.listdir(path)
        if fname.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(path, fname))
    )

    encoded_img_lst = []
    for pdf_path in pdf_paths:
        # Only the first page is going to be analyzed, so only that page is rendered:
        first_page = convert_from_path(pdf_path, first_page=1, last_page=1)[0]

        # Serializing the page as PNG in memory and encoding the bytes as base64:
        png_buffer = io.BytesIO()
        first_page.save(png_buffer, format='PNG')
        encoded_img_lst.append(base64.b64encode(png_buffer.getvalue()).decode("utf-8"))

    return encoded_img_lst

In [16]:
# Functions to create JSONL files for instance creation:
# WATCH OUT!! Hardcoded values!!
def save_jsonl(fp, json_file):
    '''!@brief Append one JSON-serializable object as a line of a JSONL file.

    The object is serialized with json.dumps, terminated with a newline,
    encoded as UTF-8 and appended to the file (created if missing). I/O errors
    are reported on stdout and NOT re-raised (best effort).

    The original note on this function said the file "needs .jl suffix";
    nothing here enforces a particular suffix.

    @param fp (string) Path of the JSONL file to append to.
    @param json_file (object) JSON-serializable object to be written.
    '''
    # Imported here because the error report below needs them and the
    # notebook's import cell does not provide them (using them without an
    # import turned every I/O error into a NameError).
    import sys
    import traceback

    d = json.dumps(json_file)+"\n"
    d = d.encode('utf8')
    try:
        with open(fp, "ab") as f:
            f.write(d)
    except Exception as e:
        print(f"[ERROR]: {e}\n{sys.exc_info()}\n{traceback.format_exc()}")

def create_jsonl(gcs_img_path,fp):
    '''!@brief Create the JSONL instances file for an image batch prediction.

    One line {"content": "gs://<BUCKET>/<blob name>", "mimeType": "image/png"}
    is written per blob of the BUCKET bucket whose name starts with
    "labeled_patents/images" and ends with ".png".

    @param gcs_img_path (string) Currently ignored: the bucket and the blob
    prefix are hardcoded in this function.
    @param fp (string) Local path of the JSONL file to create. An existing
    file at that path is replaced.
    '''
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET)

    # The prefix is applied by the listing call itself; only the suffix is
    # checked here. The names are collected before touching the local file, so
    # a failed listing leaves a previous file untouched.
    blob_list = [blob.name for blob in bucket.list_blobs(prefix="labeled_patents/images")
                 if blob.name.endswith(".png")]

    # save_jsonl opens the file in append mode, so a file left over from a
    # previous run would otherwise get every instance a second time:
    if os.path.exists(fp):
        os.remove(fp)

    for filename in blob_list:
        temp_json = {"content": f"gs://{BUCKET}/{filename}", "mimeType": "image/png"}
        save_jsonl(fp, temp_json)

In [15]:
# Launching batch predictions:
def create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images_icn.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds'):
    '''!@brief Launch a batch prediction job on a model and wait for it.

    @param project (string) Project passed to aiplatform.init.
    @param location (string) Location passed to aiplatform.init.
    @param model_resource_name (string) Identifier used to build the
    aiplatform.Model the job is run on.
    @param job_display_name (string) Display name given to the job.
    @param gcs_source (string) GS URI of the instances file.
    @param gcs_destination (string) GS URI prefix handed to the job as
    gcs_destination_prefix.

    @return The batch prediction job object returned by Model.batch_predict.
    The job's display name, resource name and state are printed first.
    '''
    aiplatform.init(project=project, location=location)

    # The job is requested with sync=True and wait() is called on it as well:
    job = aiplatform.Model(model_resource_name).batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=True,
    )
    job.wait()

    for attribute in ('display_name', 'resource_name', 'state'):
        print(getattr(job, attribute))

    return job

In [49]:
def read_imgclass_results_from_jsonl(filename):
    '''!@brief Function that reads the results of image classification prediction
    from one of the jsonl files created during batch prediction.

    For every line, the entry with the highest confidence is kept. Each line
    is expected to provide line['instance']['content'] (the image URI) and
    line['prediction'] with the parallel lists 'confidences' and
    'displayNames'.

    @param filename (string) JSONL file path and name.

    @return (Dataframe) Table with one row per line of the file and the
    columns 'image_name', 'label' and 'confidence'. A file without lines
    yields an empty table with those columns.
    '''
    columns = ['image_name', 'label', 'confidence']

    # Rows are collected in a plain list and the table is built once at the
    # end: growing a DataFrame cell by cell is slow and leaves object dtypes.
    records = []
    with jsonlines.open(filename, 'r') as file:
        for line in file:
            prediction = line['prediction']

            # Position of the most confident class:
            pos = int(np.argmax(prediction['confidences']))

            records.append({
                # Only the file name of the image URI is kept:
                'image_name': os.path.basename(line['instance']['content']),
                'label': prediction['displayNames'][pos],
                'confidence': prediction['confidences'][pos],
            })

    return pd.DataFrame(records, columns=columns)

In [59]:
def upload_file_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """!@brief Upload a local file to a bucket and report it on stdout.

    @param bucket_name (string) ID/name of the destination bucket.
    @param source_file_name (string) Path to the local file to be uploaded.
    @param destination_blob_name (string) Name the object gets in the bucket.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

In [65]:
def create_table_from_csv(dataset_name, table_name, schema_lst, csv_blob_name):
    '''!@brief Function that loads the data contained in a CSV into a table
    of an existing dataset.

    @param dataset_name (string) Name of the dataset which will store
    the table.
    @param table_name (string) Name of the table to be loaded.
    @param schema_lst (list of tuples) Contains the schema of the table.
    Each tuple has 2 to 4 entries in this order:
    ('column name', 'field format', 'mode', 'description')
    Example:
    schema_lst = [('col_A_name',  'STRING', 'REQUIRED', 'Description 1'),
                  ('col_B_name', 'INTEGER', 'REQUIRED', 'Description 2'),
                  ('col_C_name',   'FLOAT', 'REQUIRED', 'Description 3')]
    For more info:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
    @param csv_blob_name (string) GS URI of the CSV file. Its first row is
    skipped as a header.

    @raise ValueError If a schema tuple does not have 2 to 4 entries.
    '''

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Setting table_id to the ID of the table to load.
    table_id = "{}.{}.{}".format(client.project, dataset_name, table_name)

    # Creating table schema. The tuple entries are passed by keyword: the
    # positional order of SchemaField is not stable across library releases
    # (newer ones place default_value_expression before description), so
    # SchemaField(*tup) could store the description in the wrong field.
    field_keys = ('name', 'field_type', 'mode', 'description')
    schema = []
    for tup in schema_lst:
        if not 2 <= len(tup) <= len(field_keys):
            raise ValueError(
                "Schema entries must have 2 to 4 items "
                "(name, field type, mode, description); got: {!r}".format(tup))
        schema.append(bigquery.SchemaField(**dict(zip(field_keys, tup))))

    # Configuring the job which loads the table.
    # NOTE(review): no write disposition is set, so whether a re-run appends to
    # or replaces existing rows is left to the library default — confirm.
    job_config = bigquery.LoadJobConfig(schema=schema,
                                        skip_leading_rows=1,
                                        source_format=bigquery.SourceFormat.CSV)

    # Making an API request to create the job:
    load_job = client.load_table_from_uri(csv_blob_name, table_id, job_config=job_config)

    # Waiting for the job to be completed.
    load_job.result()

    destination_table = client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows.".format(destination_table.num_rows))

## 3. Pipeline Functional

### 3.1. Download PDFs to a temporary folder

In [12]:
# Creating the temporary folder; exist_ok makes the cell safe to re-run and
# avoids the check-then-create race of exists() + mkdir():
os.makedirs(TEMP_FOLDER, exist_ok=True)

# Downloading PDFs from the bucket to the temporary folder:
file_lst = download_files_from_bucket(BUCKET, TEMP_FOLDER, source_folder="labeled_patents/pdf/", ext = ".pdf" )

Number of files downloaded: 403


### 3.2. Transforming PDFs into PNGs (Only for Online prediction)

In [11]:
if 'ONLINE'==PREDICTION_MODE:
    # Encoding the downloaded PDFs as base64 images. They live in TEMP_FOLDER;
    # the previously used name `dest_folder` is not defined at notebook level
    # (it is only a parameter of download_files_from_bucket).
    imgs = encode_images_in_path(TEMP_FOLDER)

### 3.3. Cleaning old predictions

In [13]:
# Collect the JSONL blobs whose name contains 'img_class_preds', then delete them:
filelist = get_bucket_file_list(bucket_name=BUCKET,
                                fname_template='*img_class_preds*.jsonl')
clean_bucket(bucket_name=BUCKET, filter=filelist)

Blob labeled_patents/img_class_preds/prediction-docprocessing_2021811144149-2021-08-16T14:36:01.583226Z/predictions_00001.jsonl deleted.
Blob labeled_patents/img_class_preds/prediction-docprocessing_2021811144149-2021-08-16T14:36:01.583226Z/predictions_00002.jsonl deleted.
Blob labeled_patents/img_class_preds/prediction-docprocessing_2021811144149-2021-08-16T14:36:01.583226Z/predictions_00003.jsonl deleted.
Blob labeled_patents/img_class_preds/prediction-docprocessing_2021811144149-2021-08-16T14:36:01.583226Z/predictions_00004.jsonl deleted.
Blob labeled_patents/img_class_preds/prediction-docprocessing_2021811144149-2021-08-16T14:36:01.583226Z/predictions_00005.jsonl deleted.


### 3.4. Performing predictions in the cloud

In [14]:
# Creating the batch of instances to perform a prediction:
import json
gcs_img_path = f"gs:/{PROJECT}/{BUCKET}/labeled_patents/images"
fp = "images_icn.jsonl"
        
# Creating the JSONL file with all the instances:
create_jsonl(gcs_img_path, fp)

# Uploading the JSONL file to a bucket:
!gsutil -m cp ./images_icn.jsonl gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents

Copying file://./images_icn.jsonl [Content-Type=application/octet-stream]...
/ [1/1 files][ 44.8 KiB/ 44.8 KiB] 100% Done                                    
Operation completed over 1 objects/44.8 KiB.                                     


In [17]:
# Launching predictions. Project, region and bucket come from the settings
# cell instead of being repeated as literals, so they cannot drift apart:
create_batch_prediction_job_sample(
    project=PROJECT,
    location=REGION,
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source=f'gs://{BUCKET}/labeled_patents/images_icn.jsonl',
    gcs_destination=f'gs://{BUCKET}/labeled_patents/img_class_preds')

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/136021895401/locations/us-central1/batchPredictionJobs/1856611145406742528
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/136021895401/locations/us-central1/batchPredictionJobs/1856611145406742528')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/1856611145406742528?project=136021895401
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/136021895401/locations/us-central1/batchPredictionJobs/1856611145406742528 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/136021895401/locations/us-central1/batchPredictionJobs/1856611145406742528 current state:
JobState.JOB_STAT

<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f5577bfad50> 
resource name: projects/136021895401/locations/us-central1/batchPredictionJobs/1856611145406742528

### 3.5. Downloading the JSONL files with the predictions

In [24]:
# Fetch every prediction JSONL file written under the batch job's output prefix:
gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds'
source_folder = 'labeled_patents/img_class_preds' 
ext = ".jsonl"
resfile_lst = download_files_from_bucket(bucket_name=BUCKET,
                                         dest_folder=TEMP_FOLDER,
                                         source_folder=source_folder,
                                         ext=ext)

# Report the local paths, one per line:
print('\nDownloaded files:')
print('\n'.join(resfile_lst))

Number of files downloaded: 5

Downloaded files:
./temp/predictions_00001.jsonl
./temp/predictions_00002.jsonl
./temp/predictions_00003.jsonl
./temp/predictions_00004.jsonl
./temp/predictions_00005.jsonl


### 3.6. Parsing the predictions from the JSONL

In [51]:
# Parsing the JSONL files, one table per file:
res_frames = [read_imgclass_results_from_jsonl(file) for file in resfile_lst]

# Stacking the tables with a single concat. DataFrame.append no longer exists
# in pandas 2.x, and ignore_index gives every row a unique index instead of
# restarting at 0 for each file. With no files at all the result is an empty
# table rather than an undefined variable.
if res_frames:
    res_df = pd.concat(res_frames, ignore_index=True)
else:
    res_df = pd.DataFrame(columns=['image_name', 'label', 'confidence'])

print('Number of results read: {:d}'.format(res_df.shape[0]))
res_df.head(5)

Number of results read: 403


Unnamed: 0,image_name,label,confidence
0,computer_vision_17.png,US,0.712984
1,espacenet_en64.png,EU,0.999885
2,espacenet_de68.png,EU,0.999912
3,espacenet_de49.png,EU,0.999843
4,crypto_13.png,US,0.999836


In [68]:
# Write the aggregated results to the temporary folder as CSV, without the index column:
res_df.to_csv(path_or_buf=os.path.join(TEMP_FOLDER, RESULTS_CSV),
              index=False)

### 3.7. Upload results to a BQ table

In [69]:
# Uploading the CSV file to a GS bucket. The object name is joined with '/'
# explicitly: it is not a local path, so os.path.join (which uses the local
# OS separator) is only appropriate for the source file.
upload_file_to_bucket(bucket_name=BUCKET, 
                      source_file_name=os.path.join(TEMP_FOLDER, RESULTS_CSV), 
                      destination_blob_name='/'.join(['labeled_patents', 'img_class_preds', RESULTS_CSV]))

File ./temp/img_class_results.csv uploaded to labeled_patents/img_class_preds/img_class_results.csv.


In [70]:
# Storing the CSV content into a BQ table:
dataset_name = 'labeled_patents'
table_name = 'image_classification_results'
schema_lst = [('image_name', 'STRING', 'REQUIRED', 'Name of the image analyzed.'), 
              ('label',      'STRING', 'REQUIRED', 'Predicted class. It can be US or EU'),
              ('confidence',  'FLOAT', 'REQUIRED', 'Confidence of the prediction.')]
# The GS URI is written out directly: it is not a local path, so it must not
# depend on the separator os.path.join would pick on the local OS.
csv_blob_name = f'gs://{BUCKET}/labeled_patents/img_class_preds/{RESULTS_CSV}'

create_table_from_csv(dataset_name, table_name, schema_lst, csv_blob_name)

Loaded 403 rows.


### 3.8. Cleaning the temporary folder

In [None]:
# Deleting the temporary folder together with its content. os.rmdir only
# removes EMPTY directories, and by now the folder holds the downloaded PDFs,
# the prediction JSONL files and the results CSV.
import shutil

if os.path.isdir(TEMP_FOLDER):
    shutil.rmtree(TEMP_FOLDER)