In [None]:
#TODO LIST
#1- Change reading files to filter pdfs.
#2- Access to the image classification end point
#3- Create a JSON input for the endpoint (with the image)
#4- Retrieve prediction
#5- Store the prediction in a new BQ table


# Online Prediction

* input: GCS bucket of png files
* output: BQ table with ObjDet preds

1. Create the dataset
1. Create the objdet table
1. List files in bucket
2. Iterate over list and call prediciton
3. parse predicition into dict
4. write row file to BQ

In [1]:
!gsutil ls gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images

gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_10.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_11.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_12.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_13.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_14.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_15.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_16.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_17.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_18.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_19.png
gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/subsample_images/computer_vision_20.png


In [2]:
!gcloud ai models list

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
2393478483993952256  text_classification
3409814256151953408  object_detection_patent_figures
8925034949820547072  docprocessing_2021811144149
2880236679656898560  hacker_news_titles_automl
886021654033989632   mnist_20210802_154025
2243012516756062208  babyweight_model_20210730_125424
8763802564723474432  babyweight_model_20210730_124945
5534440156922118144  babyweight_automl_2021728151029
5491655960462098432  pipelines-ModelUpload-20210727125838
657604710433292288   taxifare-20210721144351


In [3]:
import os
import tempfile

from google.cloud import bigquery
from google.cloud import storage
from fnmatch import fnmatch

import base64

from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

In [4]:
PROJECT='qwiklabs-gcp-00-373ac55d0e0a'
BQ_DATASET='demo_dataset'
OBJDET_TABLE='objdet'

client = bigquery.Client()

# Create dataset
dataset_id = f'{PROJECT}.{BQ_DATASET}'
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"
dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

# Create table
OBJDET_SCHEMA = [
    bigquery.SchemaField('file_name', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('objdet_pred', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('objdet_confidence', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('objdet_xmin', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('objdet_xmax', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('objdet_ymin', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('objdet_ymax', 'FLOAT', mode='NULLABLE')]

table_id = f'{PROJECT}.{BQ_DATASET}.{OBJDET_TABLE}'

schema = OBJDET_SCHEMA

table = bigquery.Table(table_id, schema=schema)
table = client.create_table(table)  # Make an API request.
print(
    "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
)

Conflict: 409 POST https://bigquery.googleapis.com/bigquery/v2/projects/qwiklabs-gcp-00-373ac55d0e0a/datasets?prettyPrint=false: Already Exists: Dataset qwiklabs-gcp-00-373ac55d0e0a:demo_dataset

In [5]:
def get_bucket_file_list(bucket_name, fname_template='*'):
    '''!@brief Function that returns the list of files in a bucket.
    @param bucket (string) Bucket name.
    @param fname_template (string) Template for filtering blob names 
    that supports Unix shell-style wildcards. For more info: 
    https://docs.python.org/3/library/fnmatch.html
            
    @return (list of srtings) List of blob names in a bucket which 
    fullfills template structure.
    '''
    # Instantiating client:
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name)
    
    # Listing all the blobs in a bucket:
    blob_lst = [blob.name for blob in blobs]

    # Filtering blob names with the template format given:  
    file_lst = [fname for fname in blob_lst if fnmatch(fname, fname_template)]
    
    return file_lst

In [6]:
def download_files_from_bucket(bucket_name, file_lst, dest_folder):
    '''!@brief Function that downloads a list of files from a bucket.
    @param bucket: (string) Bucket name.
    @param file_lst: (string) List of files to be downloaded.
    @param dest_folder: (string) Folder where files are downloaded.

    @return (list of strings) Names of the downloaded files.
    '''
    # Instantiating client:
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Saving blob into the destination folder:
    new_file_lst = []
    for fname in file_lst:
        # Loading blob:
        blob = bucket.blob(fname)
        # Saving blob into a filename:
        _, name = os.path.split(fname)
        new_fname = os.path.join(dest_folder, name)
        blob.download_to_filename(new_fname)
        new_file_lst.append(new_fname)

    # TODO: A check of the downloaded files should be performed!! Maybe is just 
    # reading the files of the folder since if it is a temporal folder, every time
    # the pipeline is executed, the folder is created empty:
    #os.listdir(dest_folder) or similar
    print('Number of files downloaded: {:d}'.format(len(new_file_lst)))

    return new_file_lst


In [7]:
def predict_image_classification_sample(
    project: str,
    endpoint_id: str,
    filename: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)
    with open(filename, "rb") as f:
        file_content = f.read()

    # The format of each instance should conform to the deployed model's prediction input schema.
    encoded_content = base64.b64encode(file_content).decode("utf-8")
    instance = predict.instance.ImageObjectDetectionPredictionInstance(
        content=encoded_content,
    ).to_value()
    instances = [instance]
    parameters = predict.params.ImageObjectDetectionPredictionParams(
        confidence_threshold=0.5, max_predictions=5,
    ).to_value()
    endpoint = client.endpoint_path(
        project=project, location=location, endpoint=endpoint_id
    )
    response = client.predict(
        endpoint=endpoint, instances=instances, parameters=parameters
    )
    predictions = response.predictions
    return [dict(prediction) for prediction in predictions]

'labeled_patents/subsample_images/computer_vision_10.png'

In [23]:
OBJDET_ENDPOINT='2074030773706424320'
TMP_DIR='/home/jupyter/ASL_DocProcessing_2021/Data_Exploration/tmp_png/'

if not os.path.exists(TMP_DIR):
    os.makedirs(TMP_DIR)
    
files = get_bucket_file_list(bucket_name=f'{PROJECT}',
                             fname_template='labeled_patents/subsample_images/*')

for file in files:
    print(f'Processing: {file}.')
    
    # Downloading the file as a temporal file:
    storage_client = storage.Client()
    bucket = storage_client.bucket(PROJECT)
    blob = bucket.blob(file)
    _, path = tempfile.mkstemp()
    blob.download_to_filename(path + '.png')    
    
    # Obtaining online prediction:
    preds = predict_image_classification_sample(
        project=f'{PROJECT}',
        endpoint_id=f'{OBJDET_ENDPOINT}',
        filename=f'{path}.png',
        location='us-central1',
        api_endpoint='us-central1-aiplatform.googleapis.com')
    
    # Parsing prediction:
    objdet_pred = preds[0]['displayNames'][0]
    objdet_confidence = preds[0]['confidences'][0]
    objdet_xmin, objdet_xmax = preds[0]['bboxes'][0][0], preds[0]['bboxes'][0][1]
    objdet_ymin, objdet_ymax = preds[0]['bboxes'][0][2], preds[0]['bboxes'][0][3]
    
    # Storing prediction into the BQ table:
    rows_to_insert = [
        {'file_name': f'{file}'.split('/')[-1],
         'objdet_pred': f'{objdet_pred}',
         'objdet_confidence': f'{objdet_confidence}',
         'objdet_xmin': f'{objdet_xmin}',
         'objdet_xmax': f'{objdet_xmax}',
         'objdet_ymin': f'{objdet_ymin}',
         'objdet_ymax': f'{objdet_ymax}'}
    ]
    
    table_id = f'{PROJECT}.{BQ_DATASET}.{OBJDET_TABLE}'
    
    errors = client.insert_rows_json(table_id, rows_to_insert)  # Make an API request.
    if errors == []:
        print("New rows have been added.")
    else:
        print("Encountered errors while inserting rows: {}".format(errors))
    os.remove(f'{path}.png')
    os.remove(path)

Processing: labeled_patents/subsample_images/computer_vision_10.png.
/tmp/tmpif_xr_bo
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_11.png.
/tmp/tmpckvo81nd
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_12.png.
/tmp/tmp6tbcozsj
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_13.png.
/tmp/tmpp5jg32ne
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_14.png.
/tmp/tmpa4o2sjtq
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_15.png.
/tmp/tmp21_c3e22
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_16.png.
/tmp/tmpj04ni2ml
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_17.png.
/tmp/tmpclwj21yr
New rows have been added.
Processing: labeled_patents/subsample_images/computer_vision_18.png.
/tmp/tmpucms44yk
New rows have been

# Predictions Pipeline

## 1. Notebook Configuration


### 1.1. Loading Necessary Libraries

In [None]:
# General libraries:
import os
import io
#import glob
import numpy as np
import pandas as pd

# Dealing with files:
!pip install jsonlines
import jsonlines
import json

# Dealing with images:
#import cv2
#import matplotlib.pyplot as plt

# Google APIs:
from google.cloud import storage
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict

# Libraries for string filtering:
from fnmatch import fnmatch

# Libraries for image encoding
import io
import base64

# Specific PDF libraries:
#!conda install -c conda-forge poppler
!sudo apt-get install -y poppler-data
!sudo apt-get install -y poppler-utils
!pip install pdf2image
from pdf2image import convert_from_path

### 1.2. Setting Notebook Inputs
#### 1.2.1 Google Cloud Settings

In [None]:
!gcloud config list

In [None]:
!gcloud ai-platform models list --region=us-central1
!gcloud ai-platform models list --region=us-west1

In [None]:
PROJECT = 'qwiklabs-gcp-00-373ac55d0e0a'
REGION = 'us-central1'
BUCKET = 'qwiklabs-gcp-00-373ac55d0e0a'

TEMP_FOLDER = './temp'
RESULTS_CSV = 'img_class_results.csv'
PREDICTION_MODE = 'BATCH' # 'ONLINE would be another possibility, but it is not implemented.'


#PDF_FOLDER = os.path.join(TEMP_FOLDER, 'pdf')
#PNG_FOLDER = os.path.join(TEMP_FOLDER, 'png')
#CSV_FOLDER = os.path.join(TEMP_FOLDER, 'csv')

os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET

#### 1.2.2. Image Classification Endpoint

In [None]:
IC_ENDPOINT_ID="7257673944809865216"
IC_PROJECT_ID="136021895401"
IC_INPUT_DATA_FILE="INPUT-JSON"

# Example of instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

#### 1.2.3. Object Detection Endpoint

In [None]:
OD_ENDPOINT_ID="2074030773706424320"
OD_PROJECT_ID="136021895401"
OD_INPUT_DATA_FILE="INPUT-JSON"

# Example of instance:
# {
#  "instances": [{
#    "content": "YOUR_IMAGE_BYTES"
#  }],
#   "parameters": {
#     "confidenceThreshold": 0.5,
#     "maxPredictions": 5
#   }
# }

## 2 Auxiliary Functions

In [None]:
def get_bucket_file_list(bucket_name, fname_template='*'):
    '''!@brief Function that returns the list of files in a bucket.
    @param bucket (string) Bucket name.
    @param fname_template (string) Template for filtering blob names 
    that supports Unix shell-style wildcards. For more info: 
    https://docs.python.org/3/library/fnmatch.html
            
    @return (list of srtings) List of blob names in a bucket which 
    fullfills template structure.
    '''
    # Instantiating client:
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name)
    
    # Listing all the blobs in a bucket:
    blob_lst = [blob.name for blob in blobs]

    # Filtering blob names with the template format given:  
    file_lst = [fname for fname in blob_lst if fnmatch(fname, fname_template)]
    
    return file_lst

In [None]:
def clean_bucket(bucket_name, filter =['xxsdsdsds']):
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"

    storage_client = storage.Client()
    
    bucket = storage_client.bucket(bucket_name)
    for file in filter:
        blob = bucket.blob(file)
        blob.delete()
        print("Blob {} deleted.".format(file))
        

In [None]:
def download_files_from_bucket(bucket_name, dest_folder, source_folder="labeled_patents/pdf/", ext = ".pdf" ):
    '''@brief! Function that downloads a list of files from a bucket.

    @param bucket: (string) Bucket name.
    @param dest_folder: (string) Folder where files are downloaded.
    '''
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
        
    new_file_lst = []
    # Instantiating client:
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    
    blob_list  = [blob for blob in list(bucket.list_blobs()) if blob.name.startswith(source_folder) and blob.name.endswith(ext)]

    # Saving blob into the destination folder:
    for blob in blob_list:
        # Saving blob into a filename:
        _, name = os.path.split(blob.name)
        new_fname = os.path.join(dest_folder, name)
        blob.download_to_filename(new_fname)
        new_file_lst.append(new_fname)
    
    # TODO: A check of the downloaded files should be performed!! Maybe is just 
    # reading the files of the folder since if it is a temporal folder, every time
    # the pipeline is executed, the folder is created empty:
    #os.listdir(dest_folder) or similar
    print('Number of files downloaded: {:d}'.format(len(new_file_lst)))
    
    return new_file_lst

In [None]:
def encode_images_in_path(path):
    '''@brief! Function to encode an image of each pdf to be used as instance 
    for a AutoML mode.
        
    @param file_lst (list of strings) PDF file names to be transformed.
    '''
    file_lst = [os.path.join(path, file) for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
    
    encoded_img_lst = []
    for file in file_lst:
        image = convert_from_path(file)
        image = image[0]                # Only the firs page is going to be analyzed.
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        img_byte_arr = img_byte_arr.getvalue()
        encoded_img_lst.append(base64.b64encode(img_byte_arr).decode("utf-8"))

    return encoded_img_lst

In [None]:
# Functions to create JSONL files for instance creation:
# WATCH OUT!! Hardcoded values!!
def save_jsonl(fp, json_file):
    # needs .jl suffix
    d = json.dumps(json_file)+"\n"
    d = d.encode('utf8')
    try:
        with open(fp, "ab") as f:
            f.write(d)
    except Exception as e:
        print(f"[ERROR]: {e}\n{sys.exc_info()}\n{traceback.format_exc()}")

def create_jsonl(gcs_img_path,fp):
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET)
    # create jsonl
    blob_list  = [blob.name for blob in list(bucket.list_blobs()) if blob.name.startswith("labeled_patents/images") and blob.name.endswith(".png")]
    
    for filename in blob_list:
        temp_json = {"content": f"gs://{BUCKET}/{filename}", "mimeType": "image/png"}
        save_jsonl(fp, temp_json)

In [None]:
# Launching batch predictions:
def create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images_icn.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds'):
    aiplatform.init(project=project, location=location)

    my_model = aiplatform.Model(model_resource_name)

    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=True,
    )

    batch_prediction_job.wait()

    print(batch_prediction_job.display_name)
    print(batch_prediction_job.resource_name)
    print(batch_prediction_job.state)
    return batch_prediction_job

In [None]:
def read_imgclass_results_from_jsonl(filename):
    '''!@brief Function that reads the results of image classification prediction
    from the jsonl files created during batch prediction.
    
    @param filename (string) JSONL file path and name
    
    @return (Dataframe) Table with the image classification results.
    '''
    # Creating an empty dataframe to store the image classification results:
    results_df = pd.DataFrame(columns=['image_name', 'label', 'confidence'])

    # Reading the JSONL file and processing each JSON:
    with jsonlines.open(filename, 'r') as file:
        for i, line in enumerate(file):
            # Extracting results from the jsonl file:
            _, image_name = os.path.split(line['instance']['content'])
            pos = np.argmax(line['prediction']['confidences'])
            confidence = line['prediction']['confidences'][pos]
            label = line['prediction']['displayNames'][pos]

            # Storing results into a dataframe:
            results_df.loc[i, 'image_name'] = image_name 
            results_df.loc[i, 'label'] = label
            results_df.loc[i, 'confidence'] = confidence

    return results_df

In [None]:
def upload_file_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """!@brief Function that uploads a file to a bucket.
    
    @param bucket_name (string) ID/name of the bucket.
    @param source_file_name (string) Path to the file to be uploaded.
    @param destination_blob_name (string) Desired storage object name.   
    """
    
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print("File {} uploaded to {}.".format(source_file_name, destination_blob_name))

In [None]:
def create_table_from_csv(dataset_name, table_name, schema_lst, csv_blob_name):
    '''!@brief Function that create a table in an existing dataset with
    the data contained into a CSV.
    
    @param dataset_name (string) Name of the dataset which will store 
    the table.
    @param table_name (string) Name of the table to be created.
    @param schema_lst (list of tuples) Contains the schema of the table
    to be created. The format must be the next one: 
    [()'column name', 'field format', 'mode', 'Description')]
    Example:
    schema_lst = [('col_A_name',  'STRING', 'REQUIRED', 'Description 1'), 
                  ('col_B_name', 'INTEGER', 'REQUIRED', 'Description 2'),
                  ('col_C_name',   'FLOAT', 'REQUIRED', 'Description 3')]
    For more info:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
    @param csv_blob_name (string) GS URI of the CSV file.
    '''
    
    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Setting table_id to the ID of the table to create.
    table_id = "{}.{}.{}".format(client.project, dataset_name, table_name)
    
    # Creating table schema:
    schema = [bigquery.SchemaField(*tup) for tup in schema_lst]
    
    # Configuring the job which builds the table:
    job_config = bigquery.LoadJobConfig(schema=schema,
                                        skip_leading_rows=1,
                                        source_format=bigquery.SourceFormat.CSV)

    # Making an API request to create the job:
    load_job = client.load_table_from_uri(csv_blob_name, table_id, job_config=job_config)

    # Waiting for the job to be completed.
    load_job.result()

    destination_table = client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows.".format(destination_table.num_rows))

## 3. Pipeline Functional

### 3.1. Donwload PDFs to a temporal folder

In [None]:
# Creating the temporal folder if it does not exists:
if not os.path.exists(TEMP_FOLDER):
    # Create folder:
    os.mkdir(TEMP_FOLDER)
    
# Downloading PDFs from the bucket to the temporal folder:
file_lst = download_files_from_bucket(BUCKET, TEMP_FOLDER, source_folder="labeled_patents/pdf/", ext = ".pdf" )

### 3.2. Transforming PDFs into PNGs (Only for Online prediction)

In [None]:
if 'ONLINE'==PREDICTION_MODE:
    # Encoding images as base64:
    imgs = encode_images_in_path(dest_folder)

### 3.3. Cleaning old predictions

In [None]:
filelist = get_bucket_file_list(BUCKET, fname_template='*img_class_preds*.jsonl')
clean_bucket(BUCKET, filelist)

### 3.4. Performing predictions in the cloud

In [None]:
# Creating the batch of instances to perform a prediction:
import json
gcs_img_path = f"gs:/{PROJECT}/{BUCKET}/labeled_patents/images"
fp = "images_icn.jsonl"
        
# Creating the JSONL file with all the instances:
create_jsonl(gcs_img_path, fp)

# Uploading the JSONL file to a bucket:
!gsutil -m cp ./images_icn.jsonl gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents

In [None]:
# Launching predictions:
create_batch_prediction_job_sample(
    project='qwiklabs-gcp-00-373ac55d0e0a',
    location='us-central1',
    model_resource_name='8925034949820547072',
    job_display_name='batch_img_classification',
    gcs_source='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/images_icn.jsonl',
    gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds')

In [None]:
!gcloud ai endpoints list --region=us-central1

### 3.5. Downloding the JSONL files with the predictions

In [None]:
# Downloading the results files from Google Storage:
gcs_destination='gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/img_class_preds'
source_folder = 'labeled_patents/img_class_preds' 
ext = ".jsonl"
resfile_lst = download_files_from_bucket(BUCKET, TEMP_FOLDER, source_folder, ext)

print('\nDownloaded files:')
print(*resfile_lst, sep='\n')

### 3.6. Parsing the predictions from the JSONL

In [None]:
# Parsing the JSONL files:
for i, file in enumerate(resfile_lst):
    if i==0:
        res_df = read_imgclass_results_from_jsonl(file)
    else:
        res_df = res_df.append(read_imgclass_results_from_jsonl(file))
        
print('Number of results read: {:d}'.format(res_df.shape[0]))
res_df.head(5)

In [None]:
# Saving the results dataframe as a CSV file:
res_df.to_csv(os.path.join(TEMP_FOLDER, RESULTS_CSV), index=False)

### 3.7.Upload results to a BQ table

In [None]:
# Uploading the CSV file to a GS bucket:
upload_file_to_bucket(bucket_name=BUCKET, 
                      source_file_name=os.path.join(TEMP_FOLDER, RESULTS_CSV), 
                      destination_blob_name=os.path.join('labeled_patents', 'img_class_preds', RESULTS_CSV))

In [None]:
# Storing the CSV content into a BQ table:
dataset_name = 'labeled_patents'
table_name = 'image_classification_results'
schema_lst = [('image_name', 'STRING', 'REQUIRED', 'Name of the image analyzed.'), 
              ('label',      'STRING', 'REQUIRED', 'Predicted class. It can be US or EU'),
              ('confidence',  'FLOAT', 'REQUIRED', 'Confidence of the prediction.')]
csv_blob_name = os.path.join('gs://', BUCKET, 'labeled_patents', 'img_class_preds', RESULTS_CSV)

create_table_from_csv(dataset_name, table_name, schema_lst, csv_blob_name)

### 3.8. Cleaning temporal folder

In [None]:
# Deleting the temporal folder:
os.rmdir(TEMP_FOLDER)