In [None]:
#TODO LIST
#1- Change reading files to filter pdfs.
#2- Access to the image classification end point
#3- Create a JSON input for the endpoint (with the image)
#4- Retrieve prediction
#5- Store the prediction in a new BQ table


# Predictions Pipeline

## 1. Notebook Configuration


### 1.1. Loading Necessary Libraries

In [20]:
# General libraries:
import os
import io
#import glob
#import numpy as np
#import pandas as pd

# Dealing with images:
#import cv2
#import matplotlib.pyplot as plt

# Google APIs:
from google.cloud import storage
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# Libraries for string filtering:
from fnmatch import fnmatch

# Libraries for image encoding
import io
import base64

# Specific PDF libraries:
#!conda install -c conda-forge poppler
# !sudo apt-get install -y poppler-data
# !sudo apt-get install -y poppler-utils
# !pip install pdf2image
from pdf2image import convert_from_path

### 1.2. Setting Notebook Inputs
#### 1.2.1 Google Cloud Settings

In [4]:
!gcloud config list

[ai]
region = us-central1
[compute]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


In [5]:
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'
BUCKET = 'qwiklabs-gcp-00-373ac55d0e0a'

TEMP_FOLDER = './temp'
#PDF_FOLDER = os.path.join(TEMP_FOLDER, 'pdf')
#PNG_FOLDER = os.path.join(TEMP_FOLDER, 'png')
#CSV_FOLDER = os.path.join(TEMP_FOLDER, 'csv')

os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET

#### 1.2.2. Image Classification Endpoint

In [4]:
IC_ENDPOINT_ID="7257673944809865216"
IC_PROJECT_ID="136021895401"
IC_INPUT_DATA_FILE="INPUT-JSON"

# Example of instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

#### 1.2.3. Object Detection Endpoint

In [5]:
OD_ENDPOINT_ID="2074030773706424320"
OD_PROJECT_ID="136021895401"
OD_INPUT_DATA_FILE="INPUT-JSON"

# Example of instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

## 2 Auxiliary Functions

In [41]:
def get_bucket_file_list(bucket_name, fname_template='*'):
    '''@brief! Function that returns the list of files in a bucket.

    @param: bucket: (string) Bucket name.
    @param: fname_template: (string) Template for filtering blob names that
            supports Unix shell-style wildcards. For more info: 
            https://docs.python.org/3/library/fnmatch.html
    '''
    # Instantiating client:
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name)
    
    # Listing all the blobs in a bucket:
    blob_lst = [blob.name for blob in blobs]

    # Filtering blob names with the template format given:  
    file_lst = [fname for fname in blob_lst if fnmatch(fname, fname_template)]

    return file_lst 

# Creates the new bucket
#bucket = storage_client.create_bucket(bucket_name)
#dir(blob)
#blob.name
#blob.bucket.name
#blob.download_to_file()
#blob.download_to_filename()

In [90]:
def download_files_from_bucket(bucket_name, file_lst, dest_folder):
    '''@brief! Function that downloads a list of files from a bucket.

    @param: bucket: (string) Bucket name.
    @param: file_lst: (string) List of files to be downloaded.
    @param: dest_folder: (string) Folder where files are downloaded.
    '''
    # Instantiating client:
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Saving blob into the destination folder:
    for fname in file_lst:
        # Loading blob:
        blob = bucket.blob(fname)
        # Saving blob into a filename:
        _, name = os.path.split(fname)
        new_fname = os.path.join(dest_folder, name)
        blob.download_to_filename(new_fname)
        new_file_lst.append(new_fname)
    
    # TODO: A check of the downloaded files should be performed!! Maybe is just 
    # reading the files of the folder since if it is a temporal folder, every time
    # the pipeline is executed, the folder is created empty:
    #os.listdir(dest_folder) or similar
    print('Number of files downloaded: {:d}'.format(len(new_file_lst)))
    
    return new_file_lst

In [103]:
def encode_image_lst(file_lst):
    '''@brief! Function to encode an image of each pdf to be used as instance 
    for a AutoML mode.
        
    @param: file_lst (list of strings) PDF file names to be transformed.
    '''
    encoded_img_lst = []
    for file in file_lst:
        image = convert_from_path(file)
        image = image[0]                # Only the firs page is going to be analyzed.
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        img_byte_arr = img_byte_arr.getvalue()
        encoded_img_lst.append(base64.b64encode(img_byte_arr).decode("utf-8"))

    return encoded_img_lst

### 2. Pipeline Functional

In [83]:
# Retrieving all pdf file names stored in BUCKET/pdf:
file_lst = get_bucket_file_list(BUCKET, '*/pdf/*.pdf')

In [84]:
# Download the pdfs into a temporal folder:
new_file_lst = download_files_from_bucket(BUCKET, file_lst, TEMP_FOLDER)

Number of files downloaded: 403


In [102]:
# Transforming pdfs into images and encoding them:
new_file_lst = ['./temp/computer_vision_1.pdf']
encoded_img_lst = encode_image_lst(new_file_lst)

In [None]:
# Creating instance:


In [None]:
#Launch prediction


In [None]:
# Postprocess the results

## Getting online predictions from AutoML models

For some data types, you can request online (real-time) predictions from AutoML models after you create and deploy them to an endpoint. An online prediction is a synchronous request as opposed to a batch prediction, which is an asynchronous request.

In [1]:
import os

In [16]:
# Dealing with "Lack of permission" problem
PWD = os.path.abspath(os.path.curdir)
SERVICE_ACCOUNT='sa-objectdetection.json'
SERVICE_KEY_PATH = os.path.join(PWD, SERVICE_ACCOUNT)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_KEY_PATH

In [105]:
def pdf2base64(pdf_path):
    convert_from_path(pdf_path)
    image = image[0]
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format='PNG')
    encoded_content = base64.b64encode(img_byte_arr).decode("utf-8")
    
    return encoded_content
    

filename = './temp/computer_vision_1.png'
project = PROJECT
endpoint_id = OD_ENDPOINT_ID
location = REGION
api_endpoint = f'{REGION}-aiplatform.googleapis.com'

# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": api_endpoint}

# Initialize client that will be used to create and send requests.
client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)

# Creating model instance:
#The format of each instance should conform to the deployed model's prediction input schema.
encoded_content = pdf2base64('./temp/computer_vision_1.pdf')
instance = predict.instance.ImageClassificationPredictionInstance(content=encoded_content,).to_value()
instances = [instance]


parameters = predict.params.ImageClassificationPredictionParams(confidence_threshold=0.5, 
                                                                    max_predictions=5,).to_value()

endpoint = client.endpoint_path(project=project, 
                                location=location, 
                                endpoint=endpoint_id)

response = client.predict(endpoint=endpoint, 
                          instances=instances, 
                          parameters=parameters)

UnboundLocalError: local variable 'image' referenced before assignment

In [None]:
project = PROJECT
endpoint_id = IM_ENDPOINT_ID
filename = './emp/computer_vision_1.png'
location = REGION
api_endpoint = f'{REGION}-aiplatform.googleapis.com'


def predict_image_classification_sample(
    project: str,
    endpoint_id: str,
    filename: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)
    with open(filename, "rb") as f:
        file_content = f.read()

    # The format of each instance should conform to the deployed model's prediction input schema.
    encoded_content = base64.b64encode(file_content).decode("utf-8")
    instance = predict.instance.ImageClassificationPredictionInstance(content=encoded_content,).to_value()
    instances = [instance]
    
    # See gs://google-cloud-aiplatform/schema/predict/params/image_classification_1.0.0.yaml for the format of the parameters.
    parameters = predict.params.ImageClassificationPredictionParams(confidence_threshold=0.5, 
                                                                    max_predictions=5,).to_value()
    endpoint = client.endpoint_path(project=project, 
                                    location=location, 
                                    endpoint=endpoint_id)
    
    response = client.predict(endpoint=endpoint, 
                              instances=instances, 
                              parameters=parameters)
    
    print("response")
    print(" deployed_model_id:", response.deployed_model_id)
    # See gs://google-cloud-aiplatform/schema/predict/prediction/classification.yaml for the format of the predictions.
    predictions = response.predictions
    for prediction in predictions:
        print(" prediction:", dict(prediction))


In [None]:

# EXAMPLE NOT TESTED - DONT RUN
input_path = 'qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/computer_vision_1.pdf'
input_bucket_name = input_path.replace('gs://', '').split('/')[0]
input_bucket_name

  automl_client = automl.AutoMlClient.from_service_account_json(service_acct)
  model_full_id = automl_client.model_path(
      main_project_id,
      compute_region,
      model_id)
  prediction_client = automl.PredictionServiceClient.from_service_account_json(service_acct)

  # Create other clients
  storage_client = storage.Client.from_service_account_json(service_acct) 
  bq_client = bigquery.Client.from_service_account_json(service_acct)

In [None]:
from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


ENDPOINT_RESOURCENAME = OD_ENDPOINT_ID
os.environ["ENDPOINT_RESOURCENAME"] = ENDPOINT_RESOURCENAME


# The AI Platform services require regional API endpoints.
api_endpoint = f'{REGION}-aiplatform.googleapis.com'
client_options = {"api_endpoint": api_endpoint}

# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)


#instances = [json_format.ParseDict(instance, Value()) for instance in instances]
response = client.predict(endpoint=ENDPOINT_RESOURCENAME, instances=instances)

# The predictions are a google.protobuf.Value representation of the model's predictions.
print(" prediction:", response.predictions)



Mike's hacking

In [29]:
import base64

from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict


def predict_image_classification_sample(
    project: str,
    endpoint_id: str,
    filename: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)
    with open(filename, "rb") as f:
        file_content = f.read()

    # The format of each instance should conform to the deployed model's prediction input schema.
    encoded_content = base64.b64encode(file_content).decode("utf-8")
    instance = predict.instance.ImageClassificationPredictionInstance(
        content=encoded_content,
    ).to_value()
    instances = [instance]
    # See gs://google-cloud-aiplatform/schema/predict/params/image_classification_1.0.0.yaml for the format of the parameters.
    parameters = predict.params.ImageClassificationPredictionParams(
        confidence_threshold=0.5, max_predictions=5,
    ).to_value()
    endpoint = client.endpoint_path(
        project=project, location=location, endpoint=endpoint_id
    )
    response = client.predict(
        endpoint=endpoint, instances=instances, parameters=parameters
    )
    print("response")
    print(" deployed_model_id:", response.deployed_model_id)
    # See gs://google-cloud-aiplatform/schema/predict/prediction/classification.yaml for the format of the predictions.
    predictions = response.predictions
    for prediction in predictions:
        print(" prediction:", dict(prediction))
    return dict(prediction)

In [9]:
#ls temp

In [10]:
%%bash
echo $GOOGLE_APPLICATION_CREDENTIALS




In [30]:
response = predict_image_classification_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    endpoint_id='7257673944809865216',
    filename='./temp/computer_vision_1.png',
    location='us-central1',
    api_endpoint='us-central1-aiplatform.googleapis.com',
)

response
 deployed_model_id: 7096274433945370624
 prediction: {'ids': ['1754500600046813184'], 'displayNames': ['US'], 'confidences': [0.992076516]}


In [35]:
icn_class, icn_confidence = response['displayNames'][0], response['confidences'][0]

In [None]:
def 
# write to .csv 
pdf_name, icn_class, icn_confidence

In [5]:
!gcloud config set ai/region us-west1

Updated property [ai/region].


In [6]:
!gcloud config list

[ai]
region = us-west1
[compute]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


In [None]:
# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": api_endpoint}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)
with open(filename, "rb") as f:
    file_content = f.read()

Batch Prediction

In [121]:
!gsutil cp img_class_test.jsonl gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_test.jsonl

Copying file://img_class_test.jsonl [Content-Type=application/octet-stream]...
/ [1 files][  118.0 B/  118.0 B]                                                
Operation completed over 1 objects/118.0 B.                                      


In [18]:
def create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_test.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds',
):
    aiplatform.init(project=project, location=location)

    my_model = aiplatform.Model(model_resource_name)

    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=sync,
    )

    batch_prediction_job.wait()

    print(batch_prediction_job.display_name)
    print(batch_prediction_job.resource_name)
    print(batch_prediction_job.state)
    return batch_prediction_job

In [19]:
create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_test.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds',
)

PermissionDenied: 403 Permission 'aiplatform.models.get' denied on resource '//aiplatform.googleapis.com/projects/qwiklabs-gcp-00-373ac55d0e0a/locations/us-central1/models/8925034949820547072' (or it may not exist).

In [10]:
!gcloud ai models list

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
3409814256151953408  object_detection_patent_figures
8925034949820547072  docprocessing_2021811144149
2880236679656898560  hacker_news_titles_automl
886021654033989632   mnist_20210802_154025
2243012516756062208  babyweight_model_20210730_125424
8763802564723474432  babyweight_model_20210730_124945
5534440156922118144  babyweight_automl_2021728151029
5491655960462098432  pipelines-ModelUpload-20210727125838
657604710433292288   taxifare-20210721144351


In [11]:
!gcloud config list

[ai]
region = us-central1
[compute]
region = us-central1
[core]
account = 136021895401-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-373ac55d0e0a

Your active configuration is: [default]


In [14]:
#!gsutil ls gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images/

In [17]:
def get_bucket_file_list(bucket_name, fname_template='*'):
    '''@brief! Function that returns the list of files in a bucket.

    @param: bucket: (string) Bucket name.
    @param: fname_template: (string) Template for filtering blob names that
            supports Unix shell-style wildcards. For more info: 
            https://docs.python.org/3/library/fnmatch.html
    '''
    # Instantiating client:
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name)
    
    # Listing all the blobs in a bucket:
    blob_lst = [blob.name for blob in blobs]

    # Filtering blob names with the template format given:  
    file_lst = [fname for fname in blob_lst if fnmatch(fname, fname_template)]

    return file_lst 

In [23]:
files = get_bucket_file_list('qwiklabs-gcp-00-373ac55d0e0a' /labeled_patents')

NotFound: 404 GET https://storage.googleapis.com/storage/v1/b/qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/o?projection=noAcl&prettyPrint=false: Not Found

In [24]:
files

['babyweight/babyweight_trainer-0.1.tar.gz',
 'babyweight/batchpred/inputs.jsonl',
 'babyweight/data/eval000000000000.csv',
 'babyweight/data/eval000000000001.csv',
 'babyweight/data/train000000000000.csv',
 'babyweight/data/train000000000001.csv',
 'babyweight/data/train000000000002.csv',
 'babyweight/data/train000000000003.csv',
 'babyweight/data/train000000000004.csv',
 'babyweight/data/train000000000005.csv',
 'babyweight/trained_model_20210730_091437/',
 'babyweight/trained_model_20210730_091437/20210730093154/',
 'babyweight/trained_model_20210730_091437/20210730093154/assets/',
 'babyweight/trained_model_20210730_091437/20210730093154/saved_model.pb',
 'babyweight/trained_model_20210730_091437/20210730093154/variables/',
 'babyweight/trained_model_20210730_091437/20210730093154/variables/variables.data-00000-of-00001',
 'babyweight/trained_model_20210730_091437/20210730093154/variables/variables.index',
 'babyweight/trained_model_20210730_091437/checkpoints/',
 'babyweight/train

In [None]:
ICN_BATCH_INPUTS = './icn_batch_inputs.jsonl'

for idx, text in sample_title_dataset.title.items():
    # write the text sample to GCS
    blob = bucket.blob(f'hacker_news_sample/sample_{idx}.txt')
    blob.upload_from_string(
            data=text,
            content_type='text/plain'
            )  
    
    # add the GCS file to local jsonl
    with open(SAMPLE_BATCH_INPUTS, "a") as f:
        f.write(f'{{"content\": \"gs://{BUCKET}/hacker_news_sample/sample_{idx}.txt\", \"mimeType\": \"text/plain\"}}\n')