In [1]:
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import re

from typing import Optional

import requests
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai
from google.cloud import storage

from datetime import datetime
from dotenv import load_dotenv
from google.cloud import documentai
from google.cloud.documentai_toolbox import gcs_utilities

In [None]:
# Load settings from a local .env file into the process environment.
load_dotenv()

# Document AI processor / project / bucket identifiers (None when unset).
DOC_PROCESSOR_ID = os.getenv('DOC_PROCESSOR_ID')
PROJECT_ID = os.getenv('PROJECT_ID')
GCS_BUCKET = os.getenv('GCS_BUCKET')

# Point the Google client libraries at the service-account key file.
# os.environ only accepts str values, so assigning a missing CRED_FILE (None)
# would fail with an opaque TypeError; guard against that instead.
_cred_file = os.getenv('CRED_FILE')
if _cred_file:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = _cred_file
elif 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
    print("Warning: CRED_FILE is not set and GOOGLE_APPLICATION_CREDENTIALS is undefined.")

In [None]:
def create_gcs_bucket(bucket_name, delete=False):
    """
    Return a handle to a Google Cloud Storage bucket, creating it when needed.

    :param bucket_name: name of the bucket to look up / create
    :param delete: when True and the bucket already exists, delete it
        (including its contents, via ``force=True``) and recreate it empty
    :return: bucket object for ``bucket_name``
    """
    storage_client = storage.Client()

    # lookup_bucket returns None (rather than raising) when the bucket is missing
    bucket = storage_client.lookup_bucket(bucket_name)

    if bucket is not None and delete:
        print(f"Bucket {bucket_name} already exists. Deleting it.")
        bucket.delete(force=True)
        print(f"Bucket {bucket_name} deleted.")
        bucket = None

    # Create the bucket whenever it does not exist at this point: either it
    # never existed, or it was just deleted above. Previously creation only
    # happened on the delete path, so a missing bucket was never created.
    if bucket is None:
        new_bucket = storage_client.create_bucket(bucket_name)
        print(f"Bucket {new_bucket.name} created.")

    return storage_client.bucket(bucket_name)


In [4]:
# Get a handle to the configured bucket; `delete` is left at its default (False),
# so an existing bucket is not deleted.
bucket = create_gcs_bucket(GCS_BUCKET)

In [5]:
import numpy as np
import cv2
import fitz
import io
import tempfile
from PIL import Image


def preprocess_image(pix: fitz.Pixmap, sharpen: bool = True) -> Image.Image:
    """
    Convert a PyMuPDF pixmap to a PIL.Image that is:
      • grayscale (smaller file, consistent contrast)
      • Otsu‑binarised (keeps faint strokes, kills background noise)
      • lightly sharpened (optional) to recover thin text lines
    Returns a 3‑channel RGB image ready for PDF embedding.

    :param pix: RGB or RGBA pixmap (e.g. from ``page.get_pixmap()``)
    :param sharpen: apply an unsharp mask after binarisation
    :raises ValueError: if the pixmap is not RGB/RGBA (e.g. gray or CMYK)
    """
    channels = 4 if pix.alpha else 3
    if pix.n != channels:
        raise ValueError(
            f"Expected an RGB pixmap with {channels} components per pixel, got {pix.n}"
        )

    # pixmap → ndarray. PyMuPDF stores samples as R, G, B(, A) — not OpenCV's
    # default BGR — so the RGB* conversion codes must be used; with the BGR
    # codes the red and blue luminance weights are swapped.
    arr = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, channels)
    if pix.alpha:
        arr = cv2.cvtColor(arr, cv2.COLOR_RGBA2RGB)

    gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)

    # Otsu gives per‑page threshold; safer than “fixed 180”
    _, bin_ = cv2.threshold(gray, 0, 255,
                            cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    if sharpen:
        # unsharp mask: 1.5 * image - 0.5 * blurred image
        blur = cv2.GaussianBlur(bin_, (0, 0), 3)
        bin_ = cv2.addWeighted(bin_, 1.5, blur, -0.5, 0)

    # back to RGB so PIL→PDF works
    rgb = cv2.cvtColor(bin_, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(rgb)

In [6]:
def download_and_upload_pdf(bucket, blob_name, pdf_url, page_count=1, timeout=60):
    """
    Download a PDF from a URL, rasterise and clean up its first pages, and
    upload the result to GCS as a new image-only PDF.

    Failures are reported with a printed message rather than raised, so one
    bad URL does not abort a larger run.

    :param bucket: GCS bucket object
    :param blob_name: name of the PDF file (e.g., the PDF's ID)
    :param pdf_url: URL of the PDF file
    :param page_count: number of pages to download
    :param timeout: seconds to wait for the HTTP server before giving up
    """
    try:
        # A timeout keeps an unresponsive server from blocking a worker forever.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        # Render the first page_count pages at 300 dpi; the context manager
        # closes the document once the pages have been rasterised.
        with fitz.open(stream=response.content, filetype='pdf') as input_pdf:
            pages_to_render = min(page_count, len(input_pdf))
            images = [
                preprocess_image(input_pdf[page_num].get_pixmap(dpi=300), sharpen=True)
                for page_num in range(pages_to_render)
            ]

        if not images:
            # Otherwise images[0] below fails with an unhelpful
            # "list index out of range".
            raise ValueError(f"no pages to upload (page_count={page_count})")

        # Assemble the processed pages into an in-memory multi-page PDF.
        pdf_buffer = io.BytesIO()
        images[0].save(pdf_buffer, format="PDF", save_all=True, append_images=images[1:])
        pdf_buffer.seek(0)

        # upload to GCS
        blob = bucket.blob(blob_name)
        blob.upload_from_file(pdf_buffer, content_type='application/pdf')
        print(f"Uploaded {blob_name} to GCS.")
    except Exception as e:
        print(f"Failed to process {pdf_url}: {e}")


In [7]:
# Load the test set of PDF links; the first CSV column (program_id) becomes the index.
pdf_links = pd.read_csv("test/pdf_links_test.csv", index_col=0)
pdf_links.head()

Unnamed: 0_level_0,program_name,pdf,Address,City,Zip
program_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2190020840,"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING EN...",https://childcaresearch.ohio.gov//pdf/00219002...,2627 PARK AVE,CINCINNATI,45206
2190020063,1ST CHOICE CHILD CARE,https://childcaresearch.ohio.gov//pdf/00219002...,4303 CLEVELAND AVE,COLUMBUS,43224
300864,1ST FRIENDS LEARNING ACADEMY,https://childcaresearch.ohio.gov//pdf/00000030...,1930 PEARL RD,BRUNSWICK,44212
2190019999,3MB AFTERSCHOOL,https://childcaresearch.ohio.gov//pdf/00219001...,18316 ST. CLAIR AVENUE,CLEVELAND,44110
201048,3T LEARNING ACADEMY LLC 11,https://childcaresearch.ohio.gov//pdf/00000020...,7523 READING ROAD,CINCINNATI,45237


In [8]:
# Inspect a single row (position 10) of the test links.
pdf_links.iloc[10]

program_name                                A 2 Z PRESCHOOL, INC.
pdf             https://childcaresearch.ohio.gov//pdf/00000010...
Address                                        190 WOODROW AVENUE
City                                              SAINT CLAIRSVIL
Zip                                                         43950
Name: 100361, dtype: object

In [13]:
# Look up the PDF URL for program 200979 in one step; .iloc[0] takes the first
# matching row.
fail_pdf = pdf_links.loc[pdf_links.index == 200979, "pdf"].iloc[0]
fail_pdf

'https://childcaresearch.ohio.gov//pdf/000000200979_2024-10-02_ANNUAL.pdf'

In [14]:
# Number of entries in the `pdf` column of the full link list.
len(pd.read_csv("data/pdf_links_final.csv",index_col=0)['pdf'])

3746

In [None]:
# Upload a slice (rows 1-49) of the full link list using the helper from
# document_AI_run. NOTE(review): the helper is not defined in this notebook —
# presumably batch_size / num_workers control batching and parallelism and
# num_pages limits pages per PDF; confirm against document_AI_run.
from document_AI_run import upload_files_to_gcs
upload_files_to_gcs(
    bucket, 
    pd.read_csv("data/pdf_links_final.csv", index_col=0)['pdf'].iloc[1:50],
    batch_size=10,
    num_workers=5,
    num_pages=1
)

Uploaded input/200644.pdf to GCS.
Uploaded input/107193.pdf to GCS.
Uploaded input/2180016960.pdf to GCS.
Uploaded input/2190020297.pdf to GCS.
Uploaded input/2190020063.pdf to GCS.
Uploaded input/2220026701.pdf to GCS.
Uploaded input/207607.pdf to GCS.
Uploaded input/2180017710.pdf to GCS.
Uploaded input/300864.pdf to GCS.
Uploaded input/107064.pdf to GCS.
Uploaded input/300001.pdf to GCS.
Uploaded input/400468.pdf to GCS.
Uploaded input/2190019999.pdf to GCS.
Uploaded input/2200022316.pdf to GCS.
Uploaded input/2210025267.pdf to GCS.
Uploaded input/401180.pdf to GCS.
Uploaded input/306543.pdf to GCS.
Uploaded input/201048.pdf to GCS.
Uploaded input/2210024741.pdf to GCS.
Uploaded input/2180018327.pdf to GCS.
Uploaded input/2220026936.pdf to GCS.
Uploaded input/2190019520.pdf to GCS.
Uploaded input/2220025784.pdf to GCS.
Failed to process https://childcaresearch.ohio.gov//pdf/002210025291_2025-01-09_ANNUAL.pdf: 404 Client Error: Not Found for url: https://childcaresearch.ohio.gov//pdf

In [56]:
# URL of the first PDF in the test links, used for single-document experiments.
test_link = pdf_links.iloc[0]['pdf']
test_link

'https://childcaresearch.ohio.gov//pdf/002190020840_2023-10-25_ANNUAL.pdf'

In [68]:
# download_and_upload_pdf(bucket, "input/fail.pdf", fail_pdf)

Uploaded input/fail.pdf to GCS.


In [69]:
# download_and_upload_pdf(bucket, "input/002190020840.pdf", test_link)

Uploaded input/002190020840.pdf to GCS.


In [22]:
def get_fields(document):
    """
    Flatten a Document AI document's entities into a ``{label: text}`` dict.

    Entities without properties map ``type_`` to ``mention_text``. Entities
    with properties (tables come through as grouped entities) contribute one
    entry per property; a property that has properties of its own contributes
    ``"<property type>-<nested type>"`` entries instead. Later duplicates
    overwrite earlier ones.

    :param document: object exposing ``entities``; each entity / property
        exposes ``type_``, ``mention_text`` and ``properties``
    :return: dict of extracted field labels to their text
    """
    extracted = {}

    for entity in document.entities:
        # plain entity: record it directly
        if not entity.properties:
            extracted[entity.type_] = entity.mention_text
            continue

        # grouped entity: walk its children
        for child in entity.properties:
            child_label = child.type_

            if not child.properties:
                extracted[child_label] = child.mention_text
                continue

            # second level of nesting: prefix with the parent's label
            for grandchild in child.properties:
                extracted[f"{child_label}-{grandchild.type_}"] = grandchild.mention_text

    return extracted

# get_fields(processed)
def process_batch(client: documentai.DocumentProcessorServiceClient, request, timeout, verbose=False):
    """
    Run one Document AI batch request and collect the extracted fields.

    Submits ``request``, waits up to ``timeout`` seconds for the operation,
    then reads the JSON output written to Cloud Storage and flattens each
    source document's entities into one DataFrame row.

    :param client: Document AI processor service client
    :param request: batch process request to submit
    :param timeout: seconds to wait for the long-running operation
    :param verbose: print progress information
    :return: DataFrame with one row per source document, indexed by the
        document ID parsed from the output file name
    :raises ValueError: if the batch did not succeed, or if no output
        document could be read
    """
    operation = client.batch_process_documents(request)

    # wait until the batch process is complete
    try:
        if verbose:
            print(f"Waiting for operation {operation.operation.name} to complete...")
        operation.result(timeout=timeout)
    # Report (but do not re-raise) retry / internal-server errors; the state
    # check below decides whether the batch actually succeeded.
    except (RetryError, InternalServerError) as e:
        print(e.message)

    # After the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    documents = []

    if verbose:
        print("Output files:")
    # One process per Input Document
    for process in list(metadata.individual_process_statuses):
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file;
        # their fields are merged into a single row.
        current_doc_fields = {}
        doc_ID = None
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS
            if blob.content_type != "application/json":
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            if doc_ID is None:
                # e.g. ".../100361-0.json" -> "100361"
                doc_ID = blob.name.split("/")[-1].split(".")[0].split("-")[0]

            # Download JSON File as bytes object and convert to Document Object
            if verbose:
                print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            current_doc_fields.update(get_fields(document))

        if doc_ID is None:
            # No JSON output for this source document: skip it rather than
            # appending a row indexed by None.
            print(f"No JSON output found under {process.output_gcs_destination}; skipping.")
            continue

        documents.append(pd.DataFrame(current_doc_fields, index=[doc_ID]))

    if not documents:
        # pd.concat([]) would raise an unhelpful "No objects to concatenate".
        raise ValueError("Batch Process produced no readable output documents.")

    return pd.concat(documents)
from concurrent.futures import ThreadPoolExecutor, as_completed

def extract_from_pdfs(
        gcs_bucket_name: str,
        gcs_prefix: str,
        batch_size: int = 50,
        location='us',
        field_mask: Optional[str] = None,
        timeout: int = 400,
        threads: Optional[int] = None,
        verbose=True
):
    """
    Run Document AI batch processing over PDFs stored under a GCS prefix.

    The input files are split into batches, each batch is processed in a
    worker thread via ``process_batch``, and the per-batch DataFrames are
    concatenated. A failing batch is reported and skipped.

    NOTE: output is always written to ``gs://{GCS_BUCKET}/output/`` (the
    module-level bucket), regardless of ``gcs_bucket_name``.

    :param gcs_bucket_name: bucket holding the input PDFs
    :param gcs_prefix: prefix ("folder") of the input PDFs inside the bucket
    :param batch_size: number of documents per batch request
    :param location: Document AI location, used for both the API endpoint
        and the processor resource name
    :param field_mask: optional field mask applied to the output documents
    :param timeout: seconds to wait for each batch operation
    :param threads: max worker threads (None = ThreadPoolExecutor default)
    :param verbose: print progress information
    :return: DataFrame of extracted fields, one row per document; empty if no
        batch produced results
    """
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Cloud Storage URI for the Output Directory
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=f"gs://{GCS_BUCKET}/output/", field_mask=field_mask
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    batches = gcs_utilities.create_batches(
        gcs_bucket_name=gcs_bucket_name,
        gcs_prefix=gcs_prefix,
        batch_size=batch_size
    )

    # Use the caller's location here too: it was hard-coded to "us", which
    # disagreed with the endpoint above for any other location.
    proc_name = client.processor_path(PROJECT_ID, location=location, processor=DOC_PROCESSOR_ID)

    def process_doc_batch(batch):
        """Build the request for one batch and process it."""
        request = documentai.BatchProcessRequest(
            name=proc_name,
            input_documents=batch,
            document_output_config=output_config
        )
        return process_batch(client, request, timeout, verbose)

    if verbose:
        print(f"{len(batches)} batch(es) created.")

    processed_docs = []

    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(process_doc_batch, batch) for batch in batches]

        for future in as_completed(futures):
            try:
                processed_docs.append(future.result())
            except Exception as e:
                # best effort: report the failed batch and keep the others
                print(f"Batch failed: {e}")

    if not processed_docs:
        # No batches, or every batch failed: pd.concat([]) would raise.
        return pd.DataFrame()

    return pd.concat(processed_docs)
# Run extraction on the PDFs under the "test/" prefix: batches of 2, a single
# worker thread, a 900 s timeout per batch, and field_mask="entities".
processed = extract_from_pdfs(
        gcs_bucket_name=GCS_BUCKET,
        gcs_prefix="test/",
        batch_size=2,
        field_mask="entities", 
        threads=1,
        timeout=900
)
processed

1 batch(es) created.
Waiting for operation projects/415387208113/locations/us/operations/15685490811490539594 to complete...
Output files:
Fetching output/15685490811490539594/0/100361-0.json
Fetching output/15685490811490539594/1/201048-0.json


Unnamed: 0,Begin-Time,End-Time,Fire-Inspection-Approval-Date,Food-Service-Risk-Level,Inspection-Date,Inspection-Notice,Inspection-Scope,Infant-Full-Time,Infant-Part-Time,Infant-Total,...,Maximum-Under-2,No-Low-Risk,No-Moderate-Risk,No-Rules-with-Non-compliances,No-Serious-Risk,Occupancy-Limit,Program-Name,Reviewer,Use-Group-Code,Building-Approval-Date
100361,10:20 AM,1:58 PM,03/19/2024,Level II,12/10/2024,Unannounced,Full,11,0,11,...,26,2,0,2,0,100,"A 2 Z PRESCHOOL, INC.",JENNIFER COPE,E,
201048,10:05 AM,4:10 PM,08/29/2024,Level III,10/29/2024,Unannounced,Full,4,0,4,...,22,9,2,9,0,62,3T LEARNING ACADEMY LLC 11,ZIBUTE OSGOOD,E,03/30/2020


In [28]:
# Show the first two rows of the link table.
pdf_links.iloc[0:2]

Unnamed: 0_level_0,program_name,pdf,Address,City,Zip
program_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2190020840,"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING EN...",https://childcaresearch.ohio.gov//pdf/00219002...,2627 PARK AVE,CINCINNATI,45206
2190020063,1ST CHOICE CHILD CARE,https://childcaresearch.ohio.gov//pdf/00219002...,4303 CLEVELAND AVE,COLUMBUS,43224


In [48]:
# Cast the index to str so it can be matched against `processed`, whose index
# holds document IDs parsed from output file names (strings).
# NOTE: this mutates pdf_links in place.
pdf_links.index = pdf_links.index.astype(str)

In [52]:
# Outer-join the link metadata with the extracted fields on the index; rows
# present on only one side get NaN for the other side's columns.
pdf_links.merge(processed, how='outer', left_index=True, right_index=True)

Unnamed: 0,program_name,pdf,Address,City,Zip,Begin-Time,End-Time,Fire-Inspection-Approval-Date,Food-Service-Risk-Level,Inspection-Date,...,Maximum-Under-2,No-Low-Risk,No-Moderate-Risk,No-Rules-with-Non-compliances,No-Serious-Risk,Occupancy-Limit,Program-Name,Reviewer,Use-Group-Code,Building-Approval-Date
100361,"A 2 Z PRESCHOOL, INC.",https://childcaresearch.ohio.gov//pdf/00000010...,190 WOODROW AVENUE,SAINT CLAIRSVIL,43950,10:20 AM,1:58 PM,03/19/2024,Level II,12/10/2024,...,26.0,2.0,0.0,2.0,0.0,100.0,"A 2 Z PRESCHOOL, INC.",JENNIFER COPE,E,
104554,A PLACE TO LEARN AND GROW INCORPORATED,https://childcaresearch.ohio.gov//pdf/00000010...,407 SIXTH ST NW,CANTON,44702,,,,,,...,,,,,,,,,,
107064,A PLACE FOR EVERYONE LLC,https://childcaresearch.ohio.gov//pdf/00000010...,5192 DARROW RD,HUDSON,44236,,,,,,...,,,,,,,,,,
107193,A BEAUTIFUL CHILD'S PRESCHOOL & DAYCARE,https://childcaresearch.ohio.gov//pdf/00000010...,1449 BRITTAIN RD,AKRON,44310,,,,,,...,,,,,,,,,,
200644,A GREAT START PRESCHOOL INC,https://childcaresearch.ohio.gov//pdf/00000020...,7001 FAR HILLS AVE,DAYTON,45459,,,,,,...,,,,,,,,,,
200979,A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239,,,,,,...,,,,,,,,,,
201048,3T LEARNING ACADEMY LLC 11,https://childcaresearch.ohio.gov//pdf/00000020...,7523 READING ROAD,CINCINNATI,45237,10:05 AM,4:10 PM,08/29/2024,Level III,10/29/2024,...,22.0,9.0,2.0,9.0,0.0,62.0,3T LEARNING ACADEMY LLC 11,ZIBUTE OSGOOD,E,03/30/2020
203941,A & D DAY CARE AND LEARNING CENTER INC,https://childcaresearch.ohio.gov//pdf/00000020...,1049 INFIRMARY RD,DAYTON,45418,,,,,,...,,,,,,,,,,
205547,A CHILD'S GARDEN,https://childcaresearch.ohio.gov//pdf/00000020...,5427 JULMAR DRIVE,CINCINNATI,45238,,,,,,...,,,,,,,,,,
207607,A BETTER CHILD CARE CORP,https://childcaresearch.ohio.gov//pdf/00000020...,6945 HARRISON AVENUE,CINCINNATI,45247,,,,,,...,,,,,,,,,,
