# Vertex AI MLOps Book - Chapter 12 - GenAI - Document Data Extraction

In [1]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#### **Objective:** In this notebook we will use Veretx AI PALM Text model to extract ententities from a scanned PDF containing Patent Information

### Set up and import dependencies

In [None]:
#install dependencies
!pip install google-cloud-aiplatform --upgrade
!pip install google-cloud-documentai
#!pip install google-cloud-storage


In [3]:
#import libraries

#from google.api_core.client_options import ClientOptions
from google.cloud import documentai
import vertexai
from vertexai.preview.language_models import TextGenerationModel

#from PIL import Image, ImageDraw
#import os
import pandas as pd

#from pdf2image import convert_from_path, convert_from_bytes

2023-12-18 06:46:42.839955: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 06:46:42.886715: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-18 06:46:42.886752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-18 06:46:42.888293: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-18 06:46:42.895680: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 06:46:42.896464: I tensorflow/core/platform/cpu_feature_guard.cc:1

### Authentication 

In [38]:
# Uncomment and run this cell if running notebook locally
#! gcloud auth login

#### Uncomment and run the following section only if using Google Colab Notebooks

In [22]:

#from google.colab import auth as google_auth
#google_auth.authenticate_user()

### Provide GCP Project ID

In [50]:
PROJECT_ID = "jsb-alto"

### Define path to the pdf file

In [23]:
file_path='./sample_data/US_PTO_Sample.pdf'


## Display and review the PDF File

In [24]:

from IPython.display import IFrame
IFrame(file_path, width=800, height=700)

# Use GCP Document AI to OCR the PDF

## Stop if you have not already created/provisioned a Doc AI OCR parser in GCP Document AI
See "Chapter 13, Document AI – an End-to-End Solution for Processing Documents" for more details.

Key Steps to deploy Document AI - OCR Parser before you can proceed:
1. Navigate to GCP Console and in the search bar search for "Document AI" and click on it
2. Click on 'Processor Gallery'
3. Click on 'Create Processor' button under 'Document OCR'
4. Provide a new processor name and click 'Create'
5. Copy the 'Processor Id' under 'basic information section.

Enter the processor ID in the 'Doc AI configuration section below.


## Doc AI Configurations

#### Use this config. for PDF Files

In [51]:
#For PDF Docs
ocr_output = process_document_sample(
  project_id=PROJECT_ID, # Replace with your own processor ID
  location="us",
  processor_id="2fb6b1be15c7f2d", # Replace with your own processor ID
    mime_type = 'application/pdf',
    field_mask = None,
  file_path= file_path
)

#### Use this config.for TIFF files

In [52]:
##For TIFF docs uncomment the below section and run
#ocr_output = process_document_sample(
#  project_id="398507275014",
#  location="us",
#  processor_id="2fb6b1be15c7f2d",
#    mime_type = 'image/tiff',
#    field_mask = None,
#  file_path="./genai_demo_data/demo_data.tiff"
#)

In [53]:
# Define function to OCR the PDF using Document AI
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: str = None,
):

    client = documentai.DocumentProcessorServiceClient()

    name = client.processor_path(project_id, location, processor_id)

    # Import the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load the image content
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)


    request = documentai.ProcessRequest(
        name=name, raw_document=raw_document
    )

    result = client.process_document(request=request)


    document = result.document

    # Read the text recognition output from the processor
    return(document.text)



In [54]:
#Print the first 1000 characters of the OCR output
print(ocr_output[:32000])

(12) United States Patent
Lethin et al.
(54) METHODS AND APPARATUS FOR LOCAL
MEMORY COMPACTION
(75) Inventors: Richard A. Lethin, New York, NY (US);
Allen K. Leung, New York, NY (US);
Benoit J. Meister, New York, NY (US);
Nicolas T. Vasilache, New York, NY
(US); David E. Wohlford, Portland, OR
(US)
(73) Assignee: Reservoir Labs, Inc., New York, NY
(US)
(*) Notice:
(21) Appl. No.: 12/365,780
(22) Filed:
Feb. 4, 2009
(65)
Prior Publication Data
US 2010/0192138 A1 Jul. 29, 2010
Related U.S. Application Data
(60) Provisional application No. 61/065,294, filed on Feb.
8, 2008.
(51) Int. Cl.
(56)
Subject to any disclaimer, the term of this
patent is extended or adjusted under 35
U.S.C. 154(b) by 1287 days.
G06F 9/45
(52) U.S. CI.
USPC
(58)
5,442,699 A
5,442,797 A
5,613,136 A
(2006.01)
Field of Classification Search
USPC
See application file for complete search history.
References Cited
U.S. PATENT DOCUMENTS
717/151
8/1995 Arnold et al.
8/1995 Casavant et al.
3/1997 Casavant et al.
717/151
Sta

### Run the OCR results above through the Vertex AI GenAI/PALM Model to extact entities

In [55]:
# Define the function to process OCR output through Vertex AI GenAI Model

def predict_large_language_model_sample(
    project_id: str,
    model_name: str,
    temperature: float,
    max_decode_steps: int,
    top_p: float,
    top_k: int,
    content: str,
    location: str = "us-central1",
    tuned_model_name: str = "",
    ) :
    """Predict using a Large Language Model."""
    vertexai.init(project=project_id, location=location)
    model = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name:
      model = model.get_tuned_model(tuned_model_name)
    response = model.predict(
        content,
        temperature=temperature,
        max_output_tokens=max_decode_steps,
        top_k=top_k,
        top_p=top_p,)
    print(f"Response from Model: {response.text}")
    return(response.text)


### Enter the prompt to be used for entity etxraction from the document

In [56]:
prompt_suffix = '''What is the name of the inventor(s) and their respective locations. Provide answer in form of a table with first column providing Inventor's name, second column providing their location and third column providing Patent Number??'''



### Combine the OCR output and the prompt/question above to create full input text to be fed to the model

In [57]:
ocr_text = ocr_output+prompt_suffix
print(ocr_text[5000:20000]) #Limiting to 20K characters in teh notebook. Model can handle 8K Tokens = ~32K Characters

arte and Vivien's Algorithm, "Chapter 5: Parallelism Detection In
Nested Loops", pp. 193-226.
"The Cell Roadmap", Published on PPCNUX at http://www.ppcnux.
com/?q=print/6666.
"ClearSpeed™ Introductory Programming Manual The
ClearSpeed Software Development Kit", ClearSpeed Technology
Inc. 2007.
"ClearSpeed™ ClearSpeed Programming Model: An introduction",
ClearSpeed Technology Inc. 2007.
"ClearSpeed™ ClearSpeed Programming Model: Card-side Librar-
ies", ClearSpeed Technology Inc. 2007.
"ClearSpeed™ ClearSpeed Programming Model: Optimizing Per-
formance", ClearSpeed Technology Inc. 2007.
Ayers et al, Aggressive inlining, PLDI '92 Las Vegas, NV, USA.
Bastoul, "Efficient Code Generation for Automatic Parallelization
and Optimization", Proceedings of the Second International Sympo-
sium on Parallel and Distributed Computing, 2003.
Bastoul, "Code Generation in the Polyhedral Model Is Easier Than
You Think", Proceedings of the 13th International Conference on
Parallel Architecture and Compilat

### Feed the input prompt to the LLM

In [58]:
# Process the full Input Text through the GenAI Model
llm_output = predict_large_language_model_sample(PROJECT_ID, #GCP Project
                                                 "text-bison@001", #LLM Model 
                                                 0.2, #Temperature
                                                 256, #Max output tokens
                                                 0.8, #Top K
                                                 40,  #Top P
                                                 ocr_text, 
                                                 "us-central1")

Response from Model: | Inventor | Location | Patent Number |
|---|---|---|
| Richard A. Lethin | New York, NY (US) | 8,661,422 B2 |
| Allen K. Leung | New York, NY (US) | 8,661,422 B2 |
| Benoit J. Meister | New York, NY (US) | 8,661,422 B2 |
| Nicolas T. Vasilache | New York, NY (US) | 8,661,422 B2 |
| David E. Wohlford | Portland, OR (US) | 8,661,422 B2 |


### Print the response

In [59]:
#Print the answer received from LLM. 
#In this Patent document use case, answer should the name of the inventors
print(llm_output)

| Inventor | Location | Patent Number |
|---|---|---|
| Richard A. Lethin | New York, NY (US) | 8,661,422 B2 |
| Allen K. Leung | New York, NY (US) | 8,661,422 B2 |
| Benoit J. Meister | New York, NY (US) | 8,661,422 B2 |
| Nicolas T. Vasilache | New York, NY (US) | 8,661,422 B2 |
| David E. Wohlford | Portland, OR (US) | 8,661,422 B2 |


## Response from the LLM should look like the table below:

| Inventor | Location |
|---|---|
| Richard A. Lethin | New York, NY (US) |
| Allen K. Leung | New York, NY (US) |
| Benoit J. Meister | New York, NY (US) |
| Nicolas T. Vasilache | New York, NY (US) |
| David E. Wohlford | Portland, OR (US) |

In [60]:
import io

output = pd.read_csv(io.StringIO(llm_output), sep='|')

In [61]:
output = output.dropna(axis=1, how='all')
output = output.dropna(axis=0, how='all')
# remove special character
output.columns = output.columns.str.replace(' ', '')

In [62]:
output

Unnamed: 0,Inventor,Location,PatentNumber
0,---,---,---
1,Richard A. Lethin,"New York, NY (US)","8,661,422 B2"
2,Allen K. Leung,"New York, NY (US)","8,661,422 B2"
3,Benoit J. Meister,"New York, NY (US)","8,661,422 B2"
4,Nicolas T. Vasilache,"New York, NY (US)","8,661,422 B2"
5,David E. Wohlford,"Portland, OR (US)","8,661,422 B2"


### Push to BQ

In [None]:
import datetime

from google.cloud import bigquery
import pandas
import pytz

# Construct a BigQuery client object.
client = bigquery.Client()

## Modify the table_id in next cell to table name you want to use
At the very least replace the 'jsb-alto' to your project name

In [64]:
#Set table_id to the ID of the table to create.
table_id = "jsb-alto.entity_extract.patent_data_extract"

In [65]:





dataframe = output


job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("Inventor", bigquery.enums.SqlTypeNames.STRING),
        # Indexes are written if included in the schema by name.
        bigquery.SchemaField("Location", bigquery.enums.SqlTypeNames.STRING),
        # Indexes are written if included in the schema by name.
        bigquery.SchemaField("PatentNumber", bigquery.enums.SqlTypeNames.STRING),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

job = client.load_table_from_dataframe(
    dataframe, table_id, job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

Loaded 6 rows and 3 columns to jsb-alto.entity_extract.patent_data_extract


### Query BQ

In [66]:
%%bigquery
SELECT * FROM jsb-alto.entity_extract.patent_data_extract

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Inventor,Location,PatentNumber
0,---,---,---
1,Richard A. Lethin,"New York, NY (US)","8,661,422 B2"
2,Allen K. Leung,"New York, NY (US)","8,661,422 B2"
3,Benoit J. Meister,"New York, NY (US)","8,661,422 B2"
4,Nicolas T. Vasilache,"New York, NY (US)","8,661,422 B2"
5,David E. Wohlford,"Portland, OR (US)","8,661,422 B2"
