In [None]:
from unstructured.partition.pdf import partition_pdf

#### `partition_pdf` reads your PDF and extracts:
1. Text
2. Images (as files, if present)
3. Tables (as files, if present)
    
 The extracted images and tables are saved in the extracted_data directory.

 The result, raw_pdf_elements, is a list of elements (text, image, table, etc.) that you can further process for RAG (Retrieval-Augmented Generation).


 This code sets up and runs advanced PDF parsing, extracting not just text but also images and tables from your PDF, and saves them for further use in a RAG pipeline.


In [None]:
# Install poppler if not already installed
# For Windows, downloads are available at: http://blog.alivate.com.au/poppler-windows/
# For Linux/macOS, you can use: sudo apt-get install poppler-utils or brew install poppler

# In Jupyter, you can try:
#%pip install poppler-utils

# Install pytesseract (Python wrapper for Tesseract)
#%pip install pytesseract

# NOTE: You must also install the Tesseract binary on your system.
# For Windows: Download and install from https://github.com/tesseract-ocr/tesseract
# https://github.com/UB-Mannheim/tesseract/wiki
# For Linux: sudo apt-get install tesseract-ocr
# For macOS: brew install tesseract

# Parse the report with the "hi_res" strategy and ask unstructured to save a
# crop of every Image and Table block into the "extracted_data" directory.
raw_pdf_elements = partition_pdf(
    filename="./Data/GenAI_Report_2023_011124.pdf",  # PDF to parse (required)
    strategy="hi_res",  # required for image/table block extraction
    extract_images_in_pdf=True,  # required: turn image extraction on
    extract_image_block_types=["Image", "Table"],  # optional: block types to extract
    extract_image_block_to_payload=False,  # optional
    # Optional; only takes effect when extract_image_block_to_payload is False.
    extract_image_block_output_dir="extracted_data",
)

In [None]:
raw_pdf_elements

In [None]:
# Sort the text-like elements into one list per unstructured element class.
Header = []
Footer = []
Title = []
Text = []
ListItem = []

# (class name, target list) pairs, checked in this order; the first class whose
# dotted path appears in the element's type repr receives the element's text.
# NarrativeText elements are collected in ``Text``.
_buckets_by_class = (
    ("Header", Header),
    ("Footer", Footer),
    ("Title", Title),
    ("NarrativeText", Text),
    ("ListItem", ListItem),
)

for element in raw_pdf_elements:
    type_repr = str(type(element))
    for class_name, bucket in _buckets_by_class:
        if f"unstructured.documents.elements.{class_name}" in type_repr:
            bucket.append(str(element))
            break



In [None]:
# String form of every element whose type repr names the Image element class.
img = [
    str(element)
    for element in raw_pdf_elements
    if "unstructured.documents.elements.Image" in str(type(element))
]

In [None]:
# String form of every element whose type repr names the Table element class.
tab = [
    str(element)
    for element in raw_pdf_elements
    if "unstructured.documents.elements.Table" in str(type(element))
]

In [None]:
tab


In [None]:
img

### Table Summary

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser




In [None]:
# Prompt used to turn one raw table into a retrieval-friendly summary.
# The {element} placeholder is filled with the table content.
prompt_text = (
    "You are an assistant tasked with summarizing tables for retrieval. "
    "    These summaries will be embedded and used to retrieve the raw table elements. "
    "    Give a concise summary of the table that is well optimized for retrieval. Table:{element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)
prompt

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
# Load environment variables (including OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Fail fast with a readable message when the key is missing. The previous
# `os.environ[key] = os.getenv(key)` round-trip did nothing when the key was
# set and raised an opaque TypeError (environment values must be str) when it
# was not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# Table summary chain: wrap the input as {"element": ...}, fill the prompt
# defined in the previous cell, call the chat model, and return the reply as a
# plain string.
model = ChatOpenAI(temperature=0, model="gpt-4")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [None]:
# Summarize every extracted table, with at most five requests in flight.
# (No placeholder `[]` assignment first: it was immediately overwritten, and if
# the batch call failed it left an empty list behind that hid the failure.)
table_summaries = summarize_chain.batch(tab, {"max_concurrency": 5})


In [None]:
table_summaries

### Text Summary

In [None]:
Text

In [None]:
# Prompt for summarizing narrative text.
# Dedicated names are used in this cell so that it does not overwrite the
# table-summary `prompt`, `model` and `summarize_chain` defined earlier;
# re-running the table-summary cell after this one would otherwise silently
# use the text prompt and the text model.
text_prompt_text = """You are an assistant tasked with summarizing text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text elements. \
    Give a concise summary of the table or text that is well optimized for retrieval.text: {element} """
text_prompt = ChatPromptTemplate.from_template(text_prompt_text)

# Text summary chain
text_model = ChatOpenAI(temperature=0, model="gpt-4.1-mini")
text_summarize_chain = (
    {"element": lambda x: x} | text_prompt | text_model | StrOutputParser()
)

# Summarize every narrative-text element, at most five requests in flight.
text_summaries = text_summarize_chain.batch(Text, {"max_concurrency": 5})
text_summaries



### Image Summary

In [None]:
import base64
from langchain_core.messages import HumanMessage

In [None]:
"""Usage: call encode_image("path/to/image.png") to get a base64 string of that image, 
which you can then include in prompts or send to a multi-modal LLM."""

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The file content, base64-encoded and decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [None]:
"""
Purpose:
This function sends an image (as a base64 string) 
and a prompt to a multi-modal LLM (GPT-4 Vision) and returns the models response. 
It is used to generate a summary or description of the image."""

def image_summarize(img_base64, prompt):
    """Ask the chat model to describe one image.

    Args:
        img_base64: Base64 string of the image; it is sent as a
            ``data:image/jpeg;base64,...`` URL.
        prompt: Instruction text sent alongside the image.

    Returns:
        The ``content`` of the model's reply.
    """
    chat = ChatOpenAI(model="gpt-4.1-mini", max_tokens=1024)

    # One human message with two parts: the instruction and the image.
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
    }
    request = HumanMessage(content=[text_part, image_part])

    reply = chat.invoke([request])
    return reply.content

In [None]:
"""Purpose:
This function processes all .jpg images in a given directory, encodes them to base64, 
and generates a summary for each image using a multi-modal LLM (like GPT-4 Vision)."""

def generate_img_summaries(path):
    """
    Encode and summarize every ``.jpg`` file in a directory.

    Files are processed in sorted filename order; entries whose name does not
    end with ``.jpg`` are ignored.

    path: Directory containing the .jpg files extracted by Unstructured

    Returns a tuple ``(img_base64_list, image_summaries)`` of two parallel
    lists: the base64 string of each image and the summary generated for it.
    """
    # Instruction sent to the model together with each image
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Give a concise summary of the image that is well optimized for retrieval."""

    jpg_names = [name for name in sorted(os.listdir(path)) if name.endswith(".jpg")]

    encoded_images = []
    summaries = []
    for name in jpg_names:
        encoded = encode_image(os.path.join(path, name))
        encoded_images.append(encoded)
        summaries.append(image_summarize(encoded, prompt))

    return encoded_images, summaries

In [None]:
# Encode and summarize the .jpg files in the extraction output directory.
# Both returned lists are parallel: entry i of one describes entry i of the other.
fpath="./extracted_data"
img_base64_list, image_summaries = generate_img_summaries(fpath)


In [None]:
# Full base64 payloads are very large, so show only the count and a short
# prefix of each string instead of dumping the whole list into the output.
print(f"{len(img_base64_list)} base64-encoded images")
[encoded[:60] for encoded in img_base64_list]

In [None]:
image_summaries

#### Multi-Vector Retriever

In [None]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings


In [None]:
def create_multi_vector_retriever(vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images):
    """
    Create a retriever that indexes summaries but returns the raw content.

    Each summary is added to ``vectorstore`` as a ``Document`` whose metadata
    carries a generated ``doc_id``; the matching raw item is stored in an
    in-memory docstore under the same id.

    Args:
        vectorstore: Vector store that receives the summary documents.
        text_summaries: Summaries of ``texts`` (same length, same order).
        texts: Raw text items.
        table_summaries: Summaries of ``tables`` (same length, same order).
        tables: Raw table items.
        image_summaries: Summaries of ``images`` (same length, same order).
        images: Raw image items (the caller passes base64 strings).

    Returns:
        The populated ``MultiVectorRetriever``.

    Raises:
        ValueError: If a non-empty summaries list and its content list differ
            in length, since the pairing between them would be wrong.
    """

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    def add_documents(retriever, doc_summaries, doc_contents):
        """Index the summaries and store the raw contents under shared ids."""
        if len(doc_summaries) != len(doc_contents):
            raise ValueError(
                f"Got {len(doc_summaries)} summaries for {len(doc_contents)} "
                "documents; each document needs exactly one summary."
            )

        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for summary, doc_id in zip(doc_summaries, doc_ids)
        ]

        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images. These calls belong to the outer function
    # (not inside the helper, where they were never reached) and must use the
    # function's own parameters rather than notebook globals. A group with no
    # summaries is skipped.
    for summaries, contents in (
        (text_summaries, texts),
        (table_summaries, tables),
        (image_summaries, images),
    ):
        if summaries:
            add_documents(retriever, summaries, contents)

    return retriever

# Vector store that holds the embedded summaries
vectorstore = Chroma(
    collection_name="mm_rag",
    embedding_function=OpenAIEmbeddings(),
)

# Create retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore=vectorstore,
    text_summaries=text_summaries,
    texts=Text,
    table_summaries=table_summaries,
    tables=tab,
    image_summaries=image_summaries,
    images=img_base64_list,
)

In [None]:
retriever_multi_vector_img


In [None]:
import io
import re

from IPython.display import HTML, display
from PIL import Image



In [None]:
def plt_img_base64(img_base64):
    """Display a base64-encoded string as an inline image.

    The string is embedded in an HTML ``<img>`` tag as a
    ``data:image/jpeg;base64,...`` source and rendered in the notebook.
    """
    data_uri = f"data:image/jpeg;base64,{img_base64}"
    display(HTML(f'<img src="{data_uri}" />'))


In [None]:
plt_img_base64(img_base64_list[1])

In [None]:
image_summaries[1]

In [None]:
def looks_like_base64(sb):
    """Return True when ``sb`` is shaped like a base64 string.

    The check is purely textual: one or more characters from the base64
    alphabet followed by at most two ``=`` padding characters. It does not
    attempt to decode the data.
    """
    base64_shape = "^[A-Za-z0-9+/]+[=]{0,2}$"
    found = re.match(base64_shape, sb)
    return found is not None

In [None]:
def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data.

    Recognized formats: JPEG, PNG, GIF and WebP. WebP is only accepted when the
    RIFF container also carries the ``WEBP`` form type at bytes 8-11, so other
    RIFF files (for example WAV or AVI) are no longer mistaken for images.

    Args:
        b64data: Base64-encoded data (``str`` or bytes-like).

    Returns:
        True if the decoded data starts with a known image signature, False
        otherwise, including when the input cannot be base64-decoded.
    """
    magic_prefixes = (
        b"\xFF\xD8\xFF",  # jpg
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",  # png
        b"\x47\x49\x46\x38",  # gif ("GIF8")
    )
    try:
        # 12 bytes are enough for the longest check (RIFF....WEBP).
        header = base64.b64decode(b64data)[:12]
    except (ValueError, TypeError):
        # binascii.Error (bad padding) is a ValueError; TypeError covers
        # inputs that are neither str nor bytes-like.
        return False

    if header.startswith(magic_prefixes):
        return True

    # webp: "RIFF", a 4-byte size field, then "WEBP"
    return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"

In [None]:
def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize a base64-encoded image and return it base64-encoded again.

    Args:
        base64_string: Base64 string of the source image.
        size: Target ``(width, height)`` passed to ``resize``.

    Returns:
        Base64 string of the resized image, saved in the source image's format.
    """
    source = Image.open(io.BytesIO(base64.b64decode(base64_string)))
    scaled = source.resize(size, Image.LANCZOS)

    # Write the resized image to memory using the format of the source image.
    out_buffer = io.BytesIO()
    scaled.save(out_buffer, format=source.format)

    return base64.b64encode(out_buffer.getvalue()).decode("utf-8")

In [None]:
def split_image_text_types(docs):
    """
    Separate retrieved items into base64 images and texts.

    ``Document`` items are unwrapped to their ``page_content`` first. An item
    counts as an image when it both looks like base64 and decodes to a known
    image signature; such items are resized to 1300x600 before being returned.
    Everything else is returned unchanged as text.

    Returns a dict with the keys ``"images"`` and ``"texts"``.
    """
    sorted_items = {"images": [], "texts": []}

    for item in docs:
        content = item.page_content if isinstance(item, Document) else item

        if not (looks_like_base64(content) and is_image_data(content)):
            sorted_items["texts"].append(content)
            continue

        sorted_items["images"].append(resize_base64_image(content, size=(1300, 600)))

    return sorted_items

In [None]:
def img_prompt_func(data_dict):
    """
    Build the multi-modal prompt message from the retrieved context.

    Args:
        data_dict: Mapping with ``"context"`` (a dict holding the ``"texts"``
            and ``"images"`` lists) and ``"question"`` (the user question).

    Returns:
        A one-element list containing a ``HumanMessage`` whose content is one
        ``image_url`` part per image followed by a single text part with the
        instructions, the question and the newline-joined texts.
    """
    context = data_dict["context"]
    formatted_texts = "\n".join(context["texts"])

    # Image parts come first; an empty or missing image list adds nothing.
    content_parts = []
    for b64_image in context["images"] or []:
        content_parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"},
            }
        )

    # Instructions, question and textual context go last, as one text part.
    question = data_dict["question"]
    instructions = (
        "You are a helpful assistant.\n"
        "You will be given a mixed info(s) .\n"
        "Use this information to provide relevant information to the user question. \n"
        f"User-provided question: {question}\n\n"
        "Text and / or tables:\n"
        f"{formatted_texts}"
    )
    content_parts.append({"type": "text", "text": instructions})

    return [HumanMessage(content=content_parts)]

In [None]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

In [None]:
def multi_modal_rag_chain(retriever):
    """
    Assemble the multi-modal RAG chain around ``retriever``.

    The chain's input is used twice: it is sent to the retriever, whose
    results are split into images and texts, and it is passed through
    unchanged as the question. The resulting dict is turned into a prompt
    message, sent to the chat model, and the reply is parsed to a string.
    """
    # Multi-modal LLM
    llm = ChatOpenAI(temperature=0, model="gpt-4.1-mini", max_tokens=1024)

    # Retrieval branch: fetch documents, then separate images from texts.
    fetch_context = retriever | RunnableLambda(split_image_text_types)
    chain_inputs = {
        "context": fetch_context,
        "question": RunnablePassthrough(),
    }
    build_prompt = RunnableLambda(img_prompt_func)

    # RAG pipeline
    return chain_inputs | build_prompt | llm | StrOutputParser()

In [None]:
# Create RAG chain
chain_multimodal_rag = multi_modal_rag_chain(retriever_multi_vector_img)

In [None]:
chain_multimodal_rag

In [None]:
# Check retrieval
query = "categories"
docs = retriever_multi_vector_img.invoke(query)

In [None]:
docs

In [None]:
# Sanity check: print the size of each summaries list next to the size of the
# raw-content list it is paired with (Text, tab, img_base64_list). The
# retriever pairs summaries and contents by position, so each pair of counts
# should match.
print("Text summaries:", len(text_summaries))
print("Table summaries:", len(table_summaries))
print("Image summaries:", len(image_summaries))
print("Text:", len(Text))
print("Tab:", len(tab))
print("Images:", len(img_base64_list))

In [None]:

chain_multimodal_rag.invoke("What are the categories of AI?")