In [3]:
"""OCR with PDF/TIFF as source files on GCS"""
import os
import json
import re
from google.cloud import vision
from google.cloud import storage

# Supported mime_types are: 'application/pdf' and 'image/tiff'
mime_type = "Slides/145-0204.pdf"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'savvy-hybrid-389017-984cabbf64b0.json'

# How many pages should be grouped into each json output file.
batch_size = 2

client = vision.ImageAnnotatorClient()

feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

gcs_source = vision.GcsSource(uri=gcs_source_uri)
input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
output_config = vision.OutputConfig(
    gcs_destination=gcs_destination, batch_size=batch_size
)

async_request = vision.AsyncAnnotateFileRequest(
    features=[feature], input_config=input_config, output_config=output_config
)

operation = client.async_batch_annotate_files(requests=[async_request])

print("Waiting for the operation to finish.")
operation.result(timeout=420)

# Once the request has completed and the output has been
# written to GCS, we can list all the output files.
storage_client = storage.Client()

match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
bucket_name = match.group(1)
prefix = match.group(2)

bucket = storage_client.get_bucket(bucket_name)

# List objects with the given prefix, filtering out folders.
blob_list = [
    blob
    for blob in list(bucket.list_blobs(prefix=prefix))
    if not blob.name.endswith("/")
]
print("Output files:")
for blob in blob_list:
    print(blob.name)

# Process the first output file from GCS.
# Since we specified batch_size=2, the first response contains
# the first two pages of the input file.
output = blob_list[0]

json_string = output.download_as_bytes().decode("utf-8")
response = json.loads(json_string)

# The actual response for the first page of the input file.
first_page_response = response["responses"][0]
annotation = first_page_response["fullTextAnnotation"]

# Here we print the full text from the first page.
# The response contains more information:
# annotation/pages/blocks/paragraphs/words/symbols
# including confidence scores and bounding boxes
print("Full text:\n")
print(annotation["text"])


NameError: name 'gcs_source_uri' is not defined

In [2]:
pip install --upgrade google-cloud-documentai

Collecting google-cloud-documentai
  Downloading google_cloud_documentai-2.18.0-py2.py3-none-any.whl (292 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.0/292.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: google-cloud-documentai
Successfully installed google-cloud-documentai-2.18.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from google.cloud import documentai
import os

# Set up Document AI client
documentai_client = documentai.DocumentProcessorServiceClient()

# Folder path containing PDFs 
pdf_folder = 'test-Set'

# Loop through all PDFs in folder
for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        
        # Read PDF file into memory
        pdf_path = os.path.join(pdf_folder, filename)
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
            
        # Configure the process request
        request = documentai.ProcessRequest(
            name=processor.name, # Processor name
            raw_document=documentai.RawDocument(
                content=pdf_bytes,
                mime_type="application/pdf"
            )
        )
        
        # Call the Document AI API to process the PDF
        result = documentai_client.process_document(request=request)
        
        # Extract text from response
        pdf_text = result.document.text
        
        # Save text to output text file
        output_path = os.path.join(pdf_folder, os.path.splitext(filename)[0] + ".txt")
        with open(output_path, "w") as text_file:
            text_file.write(pdf_text)
            
print("OCR complete for all PDFs in folder.")


ImportError: cannot import name 'documentai' from 'google.cloud' (unknown location)

In [None]:

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

# TODO(developer): Uncomment these variables before running the sample.
# project_id = "YOUR_PROJECT_ID"
# location = "YOUR_PROCESSOR_LOCATION"  # Format is "us" or "eu"
# file_path = "/path/to/local/pdf"
# processor_display_name = "YOUR_PROCESSOR_DISPLAY_NAME" # Must be unique per project, e.g.: "My Processor"


def quickstart(
    project_id: str,
    location: str,
    file_path: str,
    processor_display_name: str = "My Processor",
):
    # You must set the `api_endpoint`if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the location, e.g.:
    # `projects/{project_id}/locations/{location}`
    parent = client.common_location_path(project_id, location)

    # Create a Processor
    processor = client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            type_="OCR_PROCESSOR",  # Refer to https://cloud.google.com/document-ai/docs/create-processor for how to get available processor types
            display_name=processor_display_name,
        ),
    )

    # Print the processor information
    print(f"Processor Name: {processor.name}")

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load binary data
    raw_document = documentai.RawDocument(
        content=image_content,
        mime_type="application/pdf",  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
    )

    # Configure the process request
    # `processor.name` is the full resource name of the processor, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}`
    request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Read the text recognition output from the processor
    print("The document contains the following text:")
    print(document.text)