# Prepare PDFs for the later pipeline stages (ObjDet, image, text, NER)

- user provides
    - Google Cloud project (input)
    - bucket in GCS of pdfs (input)
    - BQ dataset to write prediction results (output)
        - BQ table: aggregated results (pdf_name, icn_pred, objdet_pred(coords), text_cn, ner1, ner2, ...., ner)
            created with JOIN on pdf_name
        - BQ table: icn_preds (pdf_name, icn_pred)    --> this table is made in icn_predict.ipynb
        - BQ table: objdet_pred (pdf_name, objdet_pred(coords)) --> this table is made in objdet_predict.ipynb
        - BQ table: text_cn (pdf_name, text_cn)    --> this table is made in text_cn_predict.ipynb
        - BQ table: ner (pdf_name, ner1, ner2, ...., ner)
        
- See utils.py for the utility functions.
        

Steps:
 1. Convert each PDF to PNG and write it to the bucket (for ICN, ObjDet).
 2. Run OCR on each PDF and write the result to the bucket.
 3. Create the dataset.
    

In [20]:
# Notebook configuration: resolve the active gcloud project through an IPython
# shell escape and keep project/region available to the rest of the notebook.
PROJECT = !gcloud config get-value project # returns SList
PROJECT = PROJECT[0] # gets first element in list -> str
REGION = "us-central1"  
# NOTE(review): not referenced anywhere in the visible cells; presumably the ID
# of a model used by a later prediction step. Confirm before relying on it.
MODEL_RESOURCE_NAME = "2393478483993952256"

import os
# Export both values as environment variables (presumably so shell cells can
# read them; confirm against the rest of the notebook).
os.environ["PROJECT"] = PROJECT
os.environ["REGION"] = REGION

In [21]:
from google.cloud import bigquery
# BigQuery client bound to the project stored in PROJECT.
bq = bigquery.Client(project=PROJECT)

In [22]:
import logging
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Logger named after this module.
# NOTE(review): the visible cells log through `logging.*` directly, not this object.
logger = logging.getLogger(__name__)

In [96]:
from google.cloud import storage
from google.cloud import vision
import tempfile
import traceback as tb
from importlib import reload

# for jupyter only: reload the logging module and configure it again, this time
# with a timestamped format and DEBUG level (presumably needed because an
# earlier basicConfig() call in the same kernel would otherwise win; confirm).
import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

In [77]:
from pdf2image import convert_from_path
import io
import base64
import cv2


In [31]:
# Smoke test: emit one INFO record to check that the logging setup produces output.
logging.info("test if logging works")

12:29:22 INFO:test if logging works


In [105]:
def to_trace_str(e):
    """Render exception ``e`` together with its traceback as a single string."""
    formatted_lines = tb.format_exception(None, e, e.__traceback__)
    trace_text = ''
    return trace_text.join(formatted_lines)

class Utils:
    """Helpers that prepare PDFs stored in Google Cloud Storage for the pipeline.

    Paths are given as ``gs://<bucket>`` or ``gs://<bucket>/<directory>``.
    """

    def __init__(self):
        # One storage client is shared by every method of the instance.
        self.storage_client = storage.Client()

    def __dismantle_path(self, gcs_path):
        """Split a GCS path into ``(bucket_name, directory)``.

        Args
            gcs_path: ``gs://bucket`` or ``gs://bucket/some/dir`` (the ``gs://``
                scheme and surrounding slashes are optional).

        Returns
            Tuple ``(bucket_name, directory)``; ``directory`` is ``""`` when the
            path points at the bucket root and never has leading/trailing slashes.
        """
        scheme = "gs://"
        path = gcs_path[len(scheme):] if gcs_path.startswith(scheme) else gcs_path
        # Everything before the first slash is the bucket, the rest is the prefix.
        bucket_name, _, directory = path.partition("/")
        return bucket_name, directory.strip("/")

    def _list_pdf_blobs(self, bucket_name, directory):
        """Return the ``.pdf`` blobs stored under ``directory`` in ``bucket_name``."""
        # The trailing slash keeps "dir" from also matching "dir2/...".
        prefix = f"{directory}/" if directory else None
        bucket = self.storage_client.bucket(bucket_name)
        return [blob for blob in bucket.list_blobs(prefix=prefix)
                if blob.name.endswith(".pdf")]

    @staticmethod
    def _join_object_name(directory, name):
        """Join a destination directory and an object name with ``/`` (GCS separator)."""
        return "/".join(part for part in (directory, name) if part)

    def convert_pdf_to_png(self, src_path, dst_path):
        """Render the first page of every PDF under ``src_path`` as a PNG in ``dst_path``.

        Each image is stored as ``<dst directory>/<pdf blob name>.png``.

        Args
            src_path: GCS path (``gs://bucket[/dir]``) that holds the PDFs.
            dst_path: GCS path (``gs://bucket[/dir]``) that receives the PNGs.

        Returns
            True when every PDF was converted and uploaded, False when an
            error occurred (the error is logged, not raised).
        """
        try:
            logging.info("started conversion pdf -> png")

            src_bucket_name, src_directory = self.__dismantle_path(src_path)
            dst_bucket_name, dst_directory = self.__dismantle_path(dst_path)
            dst_bucket = self.storage_client.bucket(dst_bucket_name)

            blob_list = self._list_pdf_blobs(src_bucket_name, src_directory)
            total = len(blob_list)
            logging.info(f"found {total} pdfs in bucket  {src_bucket_name}")

            for b_idx, blob in enumerate(blob_list, start=1):
                # The temporary directory (and the downloaded PDF in it) is
                # removed as soon as the page has been rendered into memory.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    tmp_pdf = os.path.join(tmp_dir, "input.pdf")
                    blob.download_to_filename(tmp_pdf)
                    logging.info(f"downloaded {b_idx} of {total} files")
                    # Only the first page is going to be analyzed, so only
                    # that page is rendered.
                    pages = convert_from_path(tmp_pdf, first_page=1, last_page=1)
                if not pages:
                    raise ValueError(f"no page could be rendered from {blob.name}")
                logging.info(f"converted {b_idx} of {total} images")

                # pdf2image returns PIL images; encoding with PIL keeps the RGB
                # channel order and needs neither numpy nor OpenCV.
                png_buffer = io.BytesIO()
                pages[0].save(png_buffer, format="PNG")

                filename = self._join_object_name(dst_directory, blob.name + ".png")
                dst_bucket.blob(filename).upload_from_string(
                    png_buffer.getvalue(), content_type="image/png"
                )
                logging.info(f"saved {b_idx} of {total} images with filename {filename}")

            return True

        except Exception as e:
            logging.error(f"Error in method convert_pdf_to_png: {to_trace_str(e)}")
            return False

    def ocr(self, src_path, dst_path):
        """Perform optical character recognition in pdf files.

        One asynchronous annotate request is built per PDF found under
        ``src_path``; all requests are submitted as a single batch.

        Args
            src_path: GCS path (``gs://bucket[/dir]``) that holds the PDFs.
            dst_path: GCS path (``gs://bucket[/dir]``) under which the OCR
                output is written, using each PDF's blob name as URI prefix.

        Returns
            google.api_core.operation.Operation
            To check if done use method .done()
            None is returned when no PDF was found or an error occurred
            (the error is logged, not raised).

        Link to documentation:
            https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig
            https://cloud.google.com/vision/docs/pdf

        """
        try:
            logging.info("started optical character recognition")

            src_bucket_name, src_directory = self.__dismantle_path(src_path)
            dst_bucket_name, dst_directory = self.__dismantle_path(dst_path)

            blob_list = self._list_pdf_blobs(src_bucket_name, src_directory)
            total = len(blob_list)
            if not blob_list:
                # Do not submit an empty batch to the Vision API.
                logging.warning(f"found no pdfs in bucket {src_bucket_name}")
                return None

            client = vision.ImageAnnotatorClient()
            feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

            async_requests = []

            for b_idx, blob in enumerate(blob_list, start=1):
                # blob.name is already the full object name inside the bucket,
                # so the URI is built from the bucket name, not from src_path.
                gcs_source_uri = f"gs://{src_bucket_name}/{blob.name}"
                dst_object = self._join_object_name(dst_directory, blob.name)
                gcs_destination_uri = f"gs://{dst_bucket_name}/{dst_object}"

                # source
                gcs_source = vision.GcsSource(uri=gcs_source_uri)
                input_config = vision.InputConfig(gcs_source=gcs_source, mime_type='application/pdf')

                # destination
                gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
                output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size=1)

                logging.info(f"started ocr for {b_idx} of {total} files")
                async_requests.append(
                    vision.AsyncAnnotateFileRequest(
                        features=[feature],
                        input_config=input_config,
                        output_config=output_config,
                    )
                )

            return client.async_batch_annotate_files(requests=async_requests)

        except Exception as e:
            logging.error(f"Error in method ocr: {to_trace_str(e)}")
            return None

    def convert_results_to_df(self, results):
        """Placeholder: not implemented yet; does nothing and returns None."""
        pass

    def upload_csv_to_storage(self, csv_path, gcs_path):
        """Placeholder: not implemented yet; does nothing and returns None."""
        pass

    def upload_csv_to_bigquery(self, csv_path, dataset_id, table_id, schema):
        """Placeholder: not implemented yet; does nothing and returns None."""
        pass

In [106]:
# Driver cell: run the OCR step with the same bucket as source and destination.
utils = Utils()
src_path = "gs://2021_08_16_tcn_dev"
dst_path = "gs://2021_08_16_tcn_dev"

# PDF -> PNG conversion is currently disabled; only OCR is run.
# utils.convert_pdf_to_png(src_path, dst_path)
# Holds whatever Utils.ocr returns (per its docstring, an Operation object).
operations = utils.ocr(src_path, dst_path)

01:19:29 INFO:started optical character recognition
01:19:29 INFO:started ocr for 0 of 3 files
01:19:29 INFO:started ocr for 1 of 3 files
01:19:29 INFO:started ocr for 2 of 3 files


In [104]:
# Utils.ocr returns a single Operation (or None on failure), not a list, so
# only index into the result when it really is a sequence of operations.
op = operations[0] if isinstance(operations, (list, tuple)) else operations
print(type(op))

<class 'google.api_core.operation.Operation'>


In [None]:
# Fetch the result of the OCR operation.
# NOTE(review): presumably blocks until the long-running operation finishes; confirm.
op.result()

True