# RAG using Document AI

In [1]:
import os
import openai
import io
import uuid
import base64
import time 
import json
from base64 import b64decode
import numpy as np
from PIL import Image
import re
from pydantic import BaseModel


class PromptModel(BaseModel):
    """
    Request payload describing a single user question.

    Attributes:
        prompt (str): The user's question text.
        model (str): Name of the model backend. Default is "openai".
            NOTE(review): no code visible in this notebook reads this
            field -- confirm whether it is still needed.
    """
    prompt: str
    model: str = "openai"

## 1. Import LlamaIndex Dependent Libraries

In [2]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.anthropic import Anthropic
from llama_index.core.memory.chat_memory_buffer import ChatMemoryBuffer
from llama_index.core.node_parser import JSONNodeParser

## 2. Import Google Cloud Dependent Libraries

In [3]:
from google.cloud import documentai
from google.api_core.client_options import ClientOptions

In [4]:
from PyPDF2 import PdfReader
from google.cloud import storage
from google.protobuf.json_format import MessageToJson
from google.cloud.documentai_v1 import Document


In [23]:
# Conversation buffer capped at 8496 tokens.
# NOTE(review): doc_chat has its `memory=` argument commented out, so this
# buffer is currently not attached to any chat engine.
memory = ChatMemoryBuffer.from_defaults(token_limit=8496)

## Read Environment Variables

In [24]:
# Every setting is read with os.environ[...], so a missing variable raises
# KeyError in this cell instead of failing later inside a client call.

# API keys: OpenAI key is used for the embedding model, Anthropic key for the chat LLM.
OPENAI_API_KEY_TURBO = os.environ["OPENAI_API_KEY_TURBO"]
ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]

# Document AI processor coordinates; create_vector uses them to build the
# regional API endpoint and the processor path.
DOCUMENTAI_LOCATION = os.environ["DOCUMENTAI_LOCATION"]
DOCUMENTAI_PROJECT_ID = os.environ["DOCUMENTAI_PROJECT_ID"]
DOCUMENTAI_PROCESSOR_ID = os.environ["DOCUMENTAI_PROCESSOR_ID"]

# Postgres connection settings passed to PGVectorStore.from_params.
# Values stay strings, exactly as read from the environment.
POSTGRES_HOST=os.environ["POSTGRES_HOST"]
POSTGRES_PORT=os.environ["POSTGRES_PORT"]
POSTGRES_DB = os.environ["POSTGRES_DB"]
POSTGRES_USER = os.environ["POSTGRES_USER"]
POSTGRES_PASSWORD = os.environ["POSTGRES_PASSWORD"]

In [25]:
# Embedding model shared by indexing (create_vector) and querying (doc_chat),
# so both sides embed text with the same model.
embed_model = OpenAIEmbedding(api_key=OPENAI_API_KEY_TURBO)

In [26]:
# Chat LLM handed to the chat engine in doc_chat.
llm = Anthropic(model="claude-3-opus-20240229", api_key=ANTHROPIC_API_KEY)

## Read the content in the file

In [27]:
def read_pdf(bucket_name, file_name):
    """
    Download an object from a Google Cloud Storage bucket as raw bytes.

    Args:
        bucket_name (str): Name of the bucket holding the object.
        file_name (str): Object name (path inside the bucket) of the PDF.

    Returns:
        bytes: The object's content exactly as stored; it is not parsed here.
    """
    gcs_client = storage.Client()
    pdf_blob = gcs_client.bucket(bucket_name).blob(file_name)
    return pdf_blob.download_as_bytes()

## Save the Generated JSON File to Google Cloud Storage

In [28]:
def upload_json(bucket_name, json_data, output_filename):
    """
    Uploads JSON content to ``output/<output_filename>`` in a Google Cloud Storage bucket.

    Args:
        bucket_name (str): The name of the Google Cloud Storage bucket.
        json_data (dict | list | str): The JSON data to be uploaded. Python
            objects are serialized with ``json.dumps``. A string that already
            holds valid JSON is uploaded verbatim so it is not encoded a
            second time; any other string is serialized as a JSON string.
        output_filename (str): The name of the output file (stored under
            the ``output/`` prefix).

    Returns:
        None

    Raises:
        TypeError: If ``json_data`` is an object ``json.dumps`` cannot serialize.
    """
    if isinstance(json_data, str):
        try:
            json.loads(json_data)
        except ValueError:
            # Not JSON text: keep the old behavior and encode it as a JSON string.
            payload = json.dumps(json_data)
        else:
            # Already serialized (e.g. the output of Document.to_json); dumping
            # it again would wrap the whole document in quotes.
            payload = json_data
    else:
        payload = json.dumps(json_data)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(f'output/{output_filename}')
    blob.upload_from_string(data=payload, content_type='application/json')

In [29]:
## Document AI
### 1. Create an OCR Processor
### 2. Get Document Processor ID

In [30]:
def create_vector(bucket_name='the_ai_connect', file_name='medical_anatomy.pdf'):
    """
    Runs a PDF through Document AI and indexes the extracted text in Postgres.

    Steps:
        1. Download the PDF from Google Cloud Storage.
        2. Send it to the configured Document AI processor.
        3. Upload the full processor output as JSON to ``output/`` in the bucket.
        4. Write a reduced JSON file (mimeType, text, uri) to ``./uploads``.
        5. Load that file, split it with JSONNodeParser, embed the nodes and
           store them through PGVectorStore.

    Args:
        bucket_name (str): Bucket that holds the PDF and receives the JSON
            output. Defaults to the bucket this notebook was written against.
        file_name (str): Object name of the PDF inside the bucket. The JSON
            file names are derived from its base name.

    Returns:
        dict: ``{"message": "Vector created"}`` on success, or
        ``{"message": "Something went wrong"}`` if any step raised; the
        exception is printed rather than propagated.
    """
    try:
        opts = ClientOptions(api_endpoint=f"{DOCUMENTAI_LOCATION}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)
        name = client.processor_path(project=DOCUMENTAI_PROJECT_ID, location=DOCUMENTAI_LOCATION, processor=DOCUMENTAI_PROCESSOR_ID)

        # Download PDF content from storage
        pdf_content = read_pdf(bucket_name, file_name)

        raw_document = documentai.RawDocument(
            content=pdf_content,
            mime_type="application/pdf",
        )

        request = documentai.ProcessRequest(
            name=name, raw_document=raw_document)
        result = client.process_document(request=request)
        document = result.document

        # "medical_anatomy.pdf" -> "medical_anatomy.json"
        json_filename = os.path.splitext(os.path.basename(file_name))[0] + ".json"

        # Document.to_json returns a str. Parse it back into a dict so that
        # upload_json serializes it exactly once instead of uploading a
        # JSON-encoded string of JSON.
        upload_json(bucket_name, json.loads(Document.to_json(document)), json_filename)

        # Only these fields are indexed; the full output lives in the bucket.
        obj = {
            "mimeType": document.mime_type,
            "text": document.text,
            "uri": document.uri,
        }

        # Save the reduced document locally. The directory may not exist on a
        # fresh checkout, and ensure_ascii=False writes raw non-ASCII text, so
        # the encoding is pinned instead of relying on the platform default.
        uploads_dir = os.path.join(os.getcwd(), "uploads")
        os.makedirs(uploads_dir, exist_ok=True)
        local_json_path = os.path.join(uploads_dir, json_filename)
        with open(local_json_path, "w", encoding="utf-8") as json_file:
            json.dump(obj, json_file, ensure_ascii=False, indent=4)

        # Load data from the JSON file
        documents = SimpleDirectoryReader(input_files=[local_json_path]).load_data()

        # Create a vector store and index the documents
        storage_context = StorageContext.from_defaults(
            vector_store=PGVectorStore.from_params(
                host=POSTGRES_HOST,
                port=POSTGRES_PORT,
                database=POSTGRES_DB,
                user=POSTGRES_USER,
                password=POSTGRES_PASSWORD,
            )
        )

        # Constructing the index embeds the nodes and writes them to the store;
        # the index object itself is not needed afterwards.
        VectorStoreIndex(
            JSONNodeParser().get_nodes_from_documents(documents),
            storage_context=storage_context,
            embed_model=embed_model,
        )

        return {"message": "Vector created"}
    except Exception as e:
        # Deliberate best-effort: callers receive a status dict, not an exception.
        print('Error:', e)
        return {"message": "Something went wrong"}

In [31]:
# def extract_json_from_string(input_string: str):
#     # Find all occurrences of the pattern
#     pattern = r"\$\$(.*?)\*\*(.*?)\*\*\$\$"
#     matches = re.findall(pattern, input_string)
#     content = input_string

#     # Process each match and store it in the JSON structure
#     references = {
#         f"reference_{i}": {
#             "label": match[0].split("##")[1],
#             "pageNumber": int(match[1].split(":")[1].strip().split("-")[0]),
#         }
#         for i, match in enumerate(matches, start=1)
#     }

#     # Replace the pattern in the content string
#     for i, match in enumerate(matches, start=1):
#         reference_string = f"##references.reference_{i}.label##"
#         content = content.replace(match[0], reference_string, 1)

#     # Form the final JSON structure
#     return references, content


In [50]:
# Baseline system prompt: no instruction about citing the document.
system_prompt_1 = """
            Provide the appropriate response for the user prompt.
            """

In [None]:

# Asks for a detailed, referenced answer without prescribing a citation format.
system_prompt_2 = """
            Assist the user to give the detailed response from the document with the references.
            """



In [51]:

# Prescribes a citation format (section / table / figure plus page number)
# by showing sample response lines.
system_prompt_3 = """
            You are a helpful document assistant. In your response, please use references with page numbers or
            headings or figures to the Medical Anatomy PDF and return in below format.
            Sample Response:
                1. This is the content 1 (**Section: 1.1.1 (Page: 1)**)
                2. This is the content 2 (**Table:1.2.1 (Page: 1)**)
                3. This is the content 3 (**Figure: 1.3.1 (Page: 1)**)
            """


In [57]:
def doc_chat(prompt: PromptModel, system_prompt):
    """
    Answers a question against the documents indexed in the Postgres vector store.

    A new vector-store connection, index and chat engine are built on every
    call, and no memory buffer is attached, so calls do not share
    conversation history.

    Args:
        prompt (PromptModel | dict): The user question. Either a PromptModel
            (its ``prompt`` attribute is used) or a mapping with a
            ``"prompt"`` key, which is what the cells in this notebook pass.
        system_prompt (str): System prompt given to the chat engine.

    Returns:
        str | dict: The response text on success, or
        ``{"message": "Something went wrong"}`` if any step raised; the
        exception is printed rather than propagated.
    """
    try:
        # The annotation promises a PromptModel, but pydantic models are not
        # subscriptable, so prompt["prompt"] alone fails for them. Prefer the
        # attribute and fall back to key access for plain dicts.
        user_text = getattr(prompt, "prompt", None)
        if user_text is None:
            user_text = prompt["prompt"]

        vector_store = PGVectorStore.from_params(
            host=POSTGRES_HOST,
            port=POSTGRES_PORT,
            database=POSTGRES_DB,
            user=POSTGRES_USER,
            password=POSTGRES_PASSWORD,
        )

        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store, embed_model=embed_model
        )

        chat_engine = index.as_chat_engine(
            chat_mode="context",
            llm=llm,
            # memory=memory,  ## Memory is used to persist the conversation history.
            system_prompt=system_prompt,
        )

        response = chat_engine.chat(user_text)
        # references, content = extract_json_from_string(response.response)
        return response.response
    except Exception as e:
        # Deliberate best-effort: callers receive a status dict, not an exception.
        print('Error: ', e)
        return {"message": "Something went wrong"}


In [53]:
# Build the index: runs Document AI on the PDF and stores the embeddings in Postgres.
create_vector()

{'message': 'Vector created'}

In [54]:
# Ask the question with the baseline prompt (system_prompt_1).
# doc_chat reads prompt["prompt"], so a plain dict is passed here.
prompt = {"prompt": "What is joint receptors"}
result = doc_chat(prompt, system_prompt_1)
print("Result:", result)

Result: Joint receptors are sensory receptors located in the joints that provide information about joint position, movement, and forces acting on the joint. There are four main types of joint receptors:

1. Type 1 (Ruffini endings): These are slowly adapting receptors that signal joint position and the direction, speed, and amplitude of joint movements. They are located in the joint capsule, ligaments, and menisci.

2. Type 2 (Pacinian corpuscles): These are rapidly adapting receptors that detect acceleration and deceleration of joint movements. They are located in the joint capsule, ligaments, and fat pads.

3. Type 3 (Golgi tendon organ-like endings): These receptors are located in the ligaments and respond to tension forces acting on the joint.

4. Type 4 (Free nerve endings): These are nociceptors that respond to potentially harmful mechanical and chemical stimuli. They signal pain associated with joint damage or inflammation.

Joint receptors work together with muscle spindles and

In [55]:
# Same question with system_prompt_2, which asks for a referenced answer.
prompt = {"prompt": "What is joint receptors"}
result = doc_chat(prompt, system_prompt_2)
print("Result:", result)

Result: According to the passage, joint receptors are low-threshold mechanoreceptors that signal different characteristics of joint function. Specifically:

"The joint receptors are low-threshold mechanoreceptors and have been divided into four groups. They signal different characteristics of joint function (position, movements, direction and speed of movements)." 

The passage also mentions that "The free receptors or type 4 joint receptors are nociceptors."

In summary, joint receptors are sensory receptors located in joints that detect and signal information about joint position, movement, direction, and speed. They are mostly low-threshold mechanoreceptors, with one type (type 4) being nociceptors that detect pain.


In [56]:
# Same question with system_prompt_3, which prescribes the citation format.
prompt = {"prompt": "What is joint receptors"}
result = doc_chat(prompt, system_prompt_3)
print("Result:", result)

Result: Joint receptors are sensory receptors located in the joints that provide information about joint position, movement, and direction. They are important for proprioception, which is the sense of the position and movement of our limbs and body.

According to the information provided (**Section: Joint receptors (Page: 4)**):

"The joint receptors are low-threshold mechanoreceptors and have been divided into four groups. They signal different characteristics of joint function (position, movements, direction and speed of movements). The free receptors or type 4 joint receptors are nociceptors."

So in summary, joint receptors are sensory receptors in the joints that detect position, movement, direction and speed of the joints. They are classified into four types, with type 4 being nociceptors that detect pain.
