In [75]:
import os
import openai
import io
import uuid
import base64
import time 
from base64 import b64decode
import numpy as np
from PIL import Image

In [76]:
!pip install langchain-community langchain-core

[0m

In [77]:
from unstructured.partition.pdf import partition_pdf

from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

from operator import itemgetter

In [78]:
# Take the OpenAI key from the environment rather than hard-coding it;
# this raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [79]:
def doc_partition(path, file_name):
  """
  Partition a PDF document into title-based chunks with ``partition_pdf``.

  Args:
    path (str): Directory that contains the PDF. The same directory is
      passed as ``image_output_dir_path``. A trailing path separator is
      optional.
    file_name (str): The name of the PDF file inside ``path``.

  Returns:
    list: The raw PDF elements returned by ``partition_pdf``.

  """
  raw_pdf_elements = partition_pdf(
    # os.path.join works whether or not ``path`` ends with a separator;
    # plain concatenation silently produced "dirfile.pdf" when it did not.
    filename=os.path.join(path, file_name),
    image_output_dir_path=path,
    extract_images_in_pdf=True,
    infer_table_structure=True,
    # Chunk by section title with the size limits below (in characters).
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
  )

  return raw_pdf_elements

In [80]:
# The PDF is read from ./content/ under the current working directory;
# the same directory is handed to doc_partition as the image output location.
path = os.path.abspath(os.getcwd()) + "/content/"
poppler_path = path  # same value as `path`; not referenced by the other visible cells
file_name = "medical_anatomy.pdf"

raw_pdf_elements = doc_partition(path, file_name)

In [81]:
def data_category(raw_pdf_elements):
    """
    Sort partitioned PDF elements into text chunks and tables.

    An element is classified by looking for a class path inside
    ``str(type(element))``; elements matching neither path are dropped.

    Args:
        raw_pdf_elements (list): Elements produced by partitioning the PDF.

    Returns:
        list: ``[texts, tables]`` where each entry is a list of the
        ``str()`` form of the matching elements, in input order.
    """
    table_marker = "unstructured.documents.elements.Table"
    text_marker = "unstructured.documents.elements.CompositeElement"

    buckets = {"texts": [], "tables": []}
    for item in raw_pdf_elements:
        type_repr = str(type(item))
        # The table check takes precedence, as in an if/elif chain.
        if table_marker in type_repr:
            buckets["tables"].append(str(item))
            continue
        if text_marker in type_repr:
            buckets["texts"].append(str(item))

    return [buckets["texts"], buckets["tables"]]

In [82]:
# Categorize once and unpack both lists, instead of running the
# categorization twice over the same elements.
texts, tables = data_category(raw_pdf_elements)

In [83]:
# Summarize the extracted tables with an LLM; text chunks are not summarized.
def tables_summarize(data_category):
    """
    Summarize table chunks with a chat model.

    Args:
        data_category: The tables to summarize. Accepted forms:
            - a flat list (or tuple) of table strings;
            - a ``[texts, tables]`` pair of lists, in which case only the
              second entry (the tables) is summarized;
            - any other object (for example a function passed by mistake):
              the module-level ``tables`` list is used instead, which is
              what this function always did before it honored its argument.

    Returns:
        list: One concise summary per table, in input order. An empty list
        is returned, without building the model, when there are no tables.

    """
    if isinstance(data_category, (list, tuple)):
        is_texts_tables_pair = len(data_category) == 2 and all(
            isinstance(part, (list, tuple)) for part in data_category
        )
        table_chunks = list(data_category[1]) if is_texts_tables_pair else list(data_category)
    else:
        # Backward compatibility: fall back to the notebook-level `tables`.
        table_chunks = list(tables)

    # Nothing to summarize: avoid constructing the model for no work.
    if not table_chunks:
        return []

    prompt_text = """You are an assistant tasked with summarizing tables and text. \
                    Give a concise summary of the table or text. Table or text chunk: {element} """

    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0, model="gpt-4")
    # Each input chunk is fed to the prompt as {element}.
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    table_summaries = summarize_chain.batch(table_chunks, {"max_concurrency": 5})

    return table_summaries

In [84]:
# Pass the extracted table strings, not the `data_category` function object.
table_summaries = tables_summarize(tables)
# Text chunks are indexed as-is, so they serve as their own "summaries".
text_summaries = texts

In [85]:
def encode_image(image_path):
    ''' Read the file at ``image_path`` and return its bytes as a base64 text string '''
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

def image_captioning(img_base64, prompt):
    '''
    Ask an OpenAI vision chat model to describe a single image.

    The image is sent inline as a ``data:image/jpeg;base64,...`` URL next to
    the text prompt, in one human message.

    Parameters:
    - img_base64 (str): Base64-encoded image data.
    - prompt (str): Instruction text sent together with the image.

    Returns:
    - The ``content`` of the model's reply (the caption text).

    Example:
    >>> caption = image_captioning(encode_image("figure.jpg"), "Describe the image.")
    '''
    data_url = f"data:image/jpeg;base64,{img_base64}"
    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}

    vision_model = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024)
    reply = vision_model.invoke([HumanMessage(content=[text_part, image_part])])
    return reply.content

In [86]:
import time

# Instruction sent along with every image
prompt = "Describe the image in detail. Be specific about graphs, such as bar plots."

# Parallel lists: base64 payload of each image and the caption generated for it
img_base64_list = []
image_summaries = []

# Caption every .jpg found in `path`, visiting the files in sorted name order
for img_file in sorted(os.listdir(path)):
    if not img_file.endswith('.jpg'):
        continue
    img_path = os.path.join(path, img_file)
    base64_image = encode_image(img_path)
    img_base64_list.append(base64_image)
    img_cap = image_captioning(base64_image, prompt)
    # Pause between requests — presumably to stay under the API rate limit.
    time.sleep(10)
    image_summaries.append(img_cap)


In [87]:
# Show the text chunks; they are used unsummarized as their own summaries.
text_summaries

['Anatomy of the Somatosensory System\n\n1 FROM WIKIBOOKS\n\nOur somatosensory system consists of sensors in the skin and sensors in our muscles, tendons, and joints. The re- ceptors in the skin, the so called cutaneous receptors, tell us about temperature (thermoreceptors), pressure and sur- face texture (mechano receptors), and pain (nociceptors). The receptors in muscles and joints provide information about muscle length, muscle tension, and joint angles.\n\nThis is a sample document to showcase page-based formatting. It contains a chapter from aWikibook calledSensory Systems. None of the content has been changed in this article, but some content has been removed.\n\nCutaneous receptors\n\nSensory information from Meissner corpuscles and rapidly adapting afferents leads to adjustment of grip force when objects are lifted. These afferents respond with a brief burst of action potentials when objects move a small dis- tance during the early stages of lifting. In response to\n\nSebaceou

In [88]:
# Show the model-generated summaries of the extracted tables.
table_summaries

['The table describes different types of skin receptors and their functions. Hair receptors and Meissner’s corpuscles are rapidly adapting surface receptors with small receptive fields, used for detecting insects, fine vibrations, and recognizing texture. Pacinian corpuscles are deep receptors with large receptive fields, used for detecting diffuse vibrations like tapping with a pencil. Merkel’s receptors are slowly adapting and used for detecting spatial details like a round surface edge or brail. Ruffini’s corpuscles are used for detecting skin stretch and determining joint position in fingers.']

In [90]:
# Show the model-generated descriptions of the extracted images.
image_summaries

['The image is a detailed illustration of a cross-section of human skin, showing the differences between hairy skin and glabrous skin (which is hairless, like the skin on the palms of our hands or soles of our feet). The skin is depicted as a layered structure with various components labeled to show their position and relationship to each other.\n\nOn the left side, labeled as "Hairy skin," there is a hair shaft protruding from the surface, with a hair receptor at the base, indicating a sensory function related to the hair. Below the hair receptor are other structures such as a sebaceous gland, which secretes oils for hair and skin lubrication, and a Pacinian corpuscle, which is a nerve receptor that detects pressure and vibration.\n\nTransitioning to the right, the skin changes to "Glabrous skin," where there are no hairs. Instead, the surface shows papillary ridges which are typical of the fingerprint patterns found on fingertips. Within the glabrous skin, we can see a Meissner\'s co

In [91]:
def split_image_text_types(docs):
    ''' 
    Split retrieved items into base64-encoded images and plain texts.

    An item counts as an image when it is non-empty and decodes as *strict*
    base64. Strict validation rejects any character outside the base64
    alphabet (spaces, punctuation, newlines), so ordinary prose is classified
    as text. A short whitespace-free ASCII token can still look like base64
    and be classified as an image.

    Args:
        docs (list): Strings that are either base64-encoded images or texts.

    Returns:
        dict: A dictionary containing two lists - "images" and "texts". The "images" list contains the base64-encoded images, and the "texts" list contains the texts. Input order is preserved within each list.
    '''
    images = []
    texts = []
    for doc in docs:
        try:
            decoded = b64decode(doc, validate=True)
        except ValueError:
            # Covers binascii.Error (a ValueError subclass) raised for invalid
            # base64, and the ValueError raised for non-ASCII strings.
            texts.append(doc)
            continue
        if decoded:
            images.append(doc)
        else:
            # An empty string decodes to b"" but is not an image.
            texts.append(doc)
    return {
        "images": images,
        "texts": texts
    }

In [92]:
# Chroma collection that holds the embedded summaries; embeddings come from OpenAI.
vectorstore = Chroma(collection_name="multi_modal_rag",
                     embedding_function=OpenAIEmbeddings())

In [93]:
# In-memory store for the original content (texts, tables, base64 images), keyed by id.
store = InMemoryStore()
# Metadata key under which each summary Document records the id of its original content.
id_key = "doc_id"

In [94]:
# Retriever that ties the summary vector store to the original-content docstore;
# `id_key` names the metadata field linking the two.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [95]:
def _index_summaries(retriever, summaries, originals, id_key):
    """Index summaries in the vector store and keep the originals in the docstore.

    Args:
        retriever: Object exposing ``vectorstore.add_documents`` and ``docstore.mset``.
        summaries (list): Text to embed, one entry per original.
        originals (list): Content to store, linked to its summary by a shared id.
        id_key (str): Metadata key under which the shared id is stored.

    Returns:
        tuple: ``(ids, summary_docs)`` - the generated ids and the summary Documents.
    """
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: ids[i]})
        for i, summary in enumerate(summaries)
    ]
    # Skip empty batches (e.g. a PDF with no tables or images): Chroma can
    # raise when asked to add an empty list of documents.
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
    if ids:
        retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# Add texts (the text chunks serve as their own summaries)
doc_ids, summary_texts = _index_summaries(retriever, text_summaries, texts, id_key)

# Add tables
table_ids, summary_tables = _index_summaries(retriever, table_summaries, tables, id_key)

# Add image summaries (the docstore keeps the base64 image itself)
img_ids, summary_img = _index_summaries(retriever, image_summaries, img_base64_list, id_key)

In [96]:
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

def prompt_func(dict):
    """
    Build the multimodal prompt message for the answering model.

    Args:
        dict: Mapping with
            - "context": mapping holding "texts" (list of str) and, optionally,
              "images" (list of base64-encoded images);
            - "question": the user question.
            (The parameter name shadows the builtin ``dict`` inside this
            function; it is kept for interface compatibility.)

    Returns:
        list: A single-element list with one HumanMessage. Its content is the
        text prompt, followed by the first retrieved image when there is one.
    """
    context = dict["context"]
    question = dict["question"]
    format_texts = "\n".join(context["texts"])

    content = [
        {"type": "text", "text": f"""Answer the question based only on the following context, which can include text, tables, and the below image:
Question: {question}

Text and tables:
{format_texts}
"""},
    ]

    # Attach an image only when one was retrieved; indexing [0] on an empty
    # list used to raise IndexError for text-only retrievals.
    images = context.get("images") or []
    if images:
        content.append(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{images[0]}"}}
        )

    return [HumanMessage(content=content)]

# Vision-capable chat model used to answer questions (temperature 0, replies capped at 1024 tokens).
model = ChatOpenAI(temperature=0, model="gpt-4-vision-preview", max_tokens=1024)

# RAG pipeline:
#   1. "context": the question goes to the retriever and the retrieved items are
#      split into images and texts; "question": the input is passed through unchanged.
#   2. prompt_func turns both into a single multimodal message.
#   3. The model answers and StrOutputParser returns the reply as a string.
chain = (
    {"context": retriever | RunnableLambda(split_image_text_types), "question": RunnablePassthrough()}
    | RunnableLambda(prompt_func)
    | model
    | StrOutputParser()
)

In [97]:
# Run the full retrieve-then-answer pipeline for one question.
chain.invoke(
    "Explain about the feedback loops"
)

'The image depicts a schematic representation of the neuromuscular feedback loops involved in muscle control. Feedback loops are systems in which outputs of a process are used as inputs to control the behavior of the process itself, often leading to regulation of the function.\n\nIn the context of the image, there are two primary feedback mechanisms illustrated:\n\n1. Muscle Spindle Feedback Loop:\n   - Muscle spindles are sensory receptors within the muscle that detect changes in muscle length and the rate of change in length (velocity).\n   - When a muscle stretches, the spindles are also stretched and send information (length and velocity feedback) to the central nervous system (CNS) via afferent nerve fibers.\n   - The CNS processes this information and sends a response back to the muscle via α motor neurons (driving signal) to adjust the contraction and control muscle length.\n   - Additionally, γ motor neurons (gamma bias) adjust the sensitivity of the muscle spindles.\n\n2. Golg