In [None]:
%pip install -Uq "unstructured[all-docs]"
%pip install -Uq langchain_chroma
%pip install -Uq langchain langchain_community langchain_openai
%pip install -Uq python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install -Uq groq
%pip install -Uq pymupdf


Note: you may need to restart the kernel to use updated packages.


In [48]:
!pip show python-dotenv

Name: python-dotenv
Version: 1.2.1
Summary: Read key-value pairs from a .env file and set them as environment variables
Home-page: 
Author: 
Author-email: Saurabh Kumar <me+github@saurabh-kumar.com>
License-Expression: BSD-3-Clause
Location: /Users/parineetaborah/anaconda3/envs/RAG_QA/lib/python3.11/site-packages
Requires: 
Required-by: pydantic-settings


In [1]:
import json
from typing import List

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv
from groq import Groq

import os

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
openai_api_key = os.getenv("OPENAI_API_KEY")


In [50]:
class MultimodalRAG:
    def __init__(self, pdf_path:str):
        self.pdf_path = pdf_path
        self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.pdf_images_dir = f"./pdf_images/{self.pdf_name}/"
    
    def partition_pdf(self):
        elements = partition_pdf(filename=self.pdf_path,
                                 strategy = "hi_res",
                                 infer_table_structure=True,
                                 extract_image_block_types=["image"],
                                 extract_image_block_to_payload=True)
        
        print(f"Extracted {len(elements)} elements from the PDF.")
        return elements
    
    def create_chunks_by_title(self, elements):
        chunks = chunk_by_title(elements,
                                max_characters=3000,
                                new_after_n_chars=2400,
                                combine_text_under_n_chars=500)
        print(f"Chunked into {len(chunks)} sections based on titles.")
        return chunks
    
    def separate_chunk_contents(self, chunk):
        content_dict = {
            "text": chunk.text,
            "images":[],
            "tables":[],
            "types":["text"], 
            "page_number": [],
            "filename": self.pdf_name
            }
        
        if hasattr(chunk, "metadata") and hasattr(chunk.metadata, "orig_elements"):
            for el in chunk.metadata.orig_elements:
                element_type = type(el).__name__
                if el.category == "Table":
                    table_html = getattr(el.metadata, 'text_as_html', el.text)
                    content_dict["types"].append("table")
                    content_dict["tables"].append(table_html)

                elif element_type == "Image":
                    if hasattr(el, "metadata") and hasattr(el.metadata, "image_base64"):
                        image_data = el.metadata.image_base64
                        content_dict["images"].append(image_data)
                        content_dict["types"].append("image")

        content_dict["page_number"] = list(set(self.pdf_images_dir+ f"page_{e.metadata.page_number}.png" for e in chunk.metadata.orig_elements))
        
        content_dict["types"] = list(set(content_dict["types"]))
        return content_dict
    
    def ai_summary(self, text, tables, images):

        try:
            llm = ChatOpenAI(model="gpt-4o", temperature=0)

            prompt = f'''You are creating a searchable description for a document chunk. 
            
            CONTENTS:
            TEXT: {text}
'''

            if tables:
                prompt+= "TABLE:\n"
                for i, table in enumerate(tables):
                    prompt+=f'''Table{i+1}:\n{table}\n\n'''

            prompt+='''Generate a comprehensive, searchable description from the text, tables and images provided that covers:
            - Key topics and concepts discussed
            - Questions that this conetnt could answer
            - Important data points from the tables
            - Any notable relationships or insights
            - Visual content analysis(charts, diagrams, patterns in images)
            - Alternative search terms users might use

            Make it detailed and searchable - prioritize findability over brevity.

            SEARCHABLE DESCRIPTION:
            '''

            message_content = [{"type":"text", "text": prompt}]

            for image_base64 in images:
                message_content.append({"type":"image_url", 
                                        "image_url": {"url":f"data:image/jpeg;base64,{image_base64}"}})
                
            message = HumanMessage(content=message_content)
            response = llm.invoke([message])

            return response.content
        
        except Exception as e:
            print(f"Error generating summary: {e}")
            return f"{text[300:]}..."
            
    def create_document_langchain(self, chunks):
        langchain_document = []
        for idx, chunk in enumerate(chunks):
            print(f"Processing {idx}/{len(chunks)} chunk...")
            content_dict = self.separate_chunk_contents(chunk)

            if content_dict["images"] or content_dict["tables"]:
                try: 
                    print(f"Generating AI summary for mixed content chunk ...")
                    content = self.ai_summary(content_dict["text"],
                                            content_dict["tables"],
                                            content_dict["images"])
                    
                    print(f"Successfully generated AI summary")
                
                except Exception as e:
                    content = content_dict["text"]
                    print(f"Falling back to text content due to error: {e}")

            
            else:
                print("No mixed content - using text only.")
                content = content_dict["text"]

            doc = Document(
                page_content=content,
                metadata={
                "original_content": json.dumps({
                    "text": content_dict["text"],
                    "tables": content_dict["tables"],
                    "images": content_dict["images"],
                    "page_numbers": content_dict["page_number"],
                    "filename": content_dict["filename"]
                })
                })
            
            langchain_document.append(doc)
        print(f"Processed {len(langchain_document)} chunks.")
        return langchain_document
    
    def create_vector_store(self, documents, persist_directory = "./chroma_db"):
        embedding_model = OpenAIEmbeddings(model = 'text-embedding-3-small')

        print(f"CReatig vector store in directory: {persist_directory} ...")
        vector_store = Chroma.from_documents(
            documents,
            embedding=embedding_model,
            collection_name=self.pdf_name,
            persist_directory=persist_directory,
            collection_metadata= {"hnsw:space":"cosine"}
        )
        print("__Finished storing to vector datastore.__")
        return vector_store
    
    def retriever(self, query, vector_store, k=5):
        retriever = vector_store.as_retriever(search_kwargs={"k": k})
        chunks = retriever.invoke(query)
        return chunks
    
    def get_page_links(self, chunks):
        page_links=[]
        for chunk in chunks:
            text_dict = json.loads(chunk.metadata["original_content"])
            page_links.extend(text_dict["page_numbers"])
        return list(set(page_links))
    
    def generate_answer(self, chunks, query):
        try:
            llm = ChatOpenAI(model="gpt-4o", temperature=0)

            prompt = f'''Based on the following documents, generate a concise and accurate answer:\n\n
            
            CONTEXT:
            '''

            for i, chunk in enumerate(chunks):
                prompt+=f"--------------- Document {i+1} --------------\n"
                if "original_content" in chunk.metadata:
                    original_content = json.loads(chunk.metadata["original_content"])
                    prompt+=f"TEXT:\n{original_content['text']}\n\n"
                    if original_content["tables"]:
                        prompt+="TABLES:\n"
                        for j, table in enumerate(original_content["tables"]):
                            prompt+=f"Table {j+1}:\n{table}\n\n"

            prompt+="\n"

            prompt+='''Please provide a clear, comprehensive answer using the text, tables, and images above. If the documents don't contain sufficient information to answer the question, say "I don't have enough information to answer that question based on the provided documents."

            ANSWER:'''

            message_content = [{"type":"text", "text": prompt}]

            for chunk in chunks:
                if "original_content" in chunk.metadata:
                    original_content = json.loads(chunk.metadata["original_content"])
                    for image_base64 in original_content["images"]:
                        message_content.append({"type":"image_url", 
                                                "image_url": {"url":f"data:image/jpeg;base64,{image_base64}"}})


            message = HumanMessage(content=message_content)
            response = llm.invoke([message])
            page_links = self.get_page_links(chunks)
            answer = {"final_answer": response.content,
                      "page_links": page_links}

            return answer
        
        except Exception as e:
            print(f"Error generating answer: {e}")
            return "Sorry, couldn't generate an answer due to an error."

                

In [33]:
import io
import fitz


def convert_pdf_to_images(pdf_path: str) -> List[bytes]:
    images = []
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    os.makedirs(f"pdf_images/{pdf_name}/", exist_ok=True)
    print(pdf_name)
    with fitz.open(pdf_path) as doc:
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                pix = page.get_pixmap()
                img_bytes = pix.tobytes("png")
                images.append(img_bytes)

                with open(f"pdf_images/{pdf_name}/page_{page_num + 1}.png", "wb") as img_file:
                    img_file.write(img_bytes)
        except Exception as e:
            print(f"Error converting page {page_num} to image: {e}")
    return images


In [5]:
pdf_path = "pdf/Attention-is-all-you-need.pdf"
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
convert_pdf_to_images(pdf_path)

Attention-is-all-you-need


[b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02d\x00\x00\x03\x18\x08\x02\x00\x00\x00\xea\xe4l\xe5\x00\x00\x00\tpHYs\x00\x00\x0e\xc4\x00\x00\x0e\xc4\x01\x95+\x0e\x1b\x00\x01~MIDATx\x9c\xec\x9d\x07\x98\xd5D\xdb\x86\xd5\xcf\xfe\x7fX\xc1^\xc1^\xb0\xa2X@\x05\xa5\x8a"\x88\x82\xa2\xa0\x88\xd8\x10\x10PA@z\x93\xa2\x88\xf4*E\x10\xa4\x08\xd2{g\xa9"\xbdI\xefH\x11\x04\x04\xcc\x7f\xe7\xcc~c\xc89{\xb2{\xb6\x02\xcf}qq%9\xc9d\xe6\x9d\x99\xf7\x997\x9b\xcc\x9c\xe1\x08!\x84\x10"*g\xa4w\x06\x84\x10B\x88\x8c\x8e\xc4R\x08!\x84\x08@b)\x84\x10B\x04 \xb1\x14B\x08!\x02\x90X\n!\x84\x10\x01H,\x85\x10B\x88\x00$\x96B\x08!D\x00\x12K!\x84\x10"\x00\x89\xa5\x10B\x08\x11\x80\xc4R\x08!\x84\x08@b)\x84\x10B\x04 \xb1\x14B\x08!\x02\x90X\n!\x84\x10\x01H,\x85\x10B\x88\x00$\x96B\x08!D\x00\x12K!\x84\x10"\x00\x89\xa5\x10B\x08\x11\x80\xc4R\x08!\x84\x08@b)\x84\x10B\x04 \xb1\x14B\x08!\x02\x90X\n!\x84\x10\x01H,\x85\x10B\x88\x00$\x96B\x08!D\x00\x12K!\x84\x10"\x00\x89\xa5\x10B\x08\x11\x80\xc4R\x08!\x84\x08@b)\x84\x10B\x04 \xb1\x14B\x

In [48]:
rag = MultimodalRAG(pdf_path="pdf/Attention-is-all-you-need.pdf")

In [7]:

elements = rag.partition_pdf()



The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extracted 220 elements from the PDF.


In [8]:
elements

[<unstructured.documents.elements.Text at 0x32c2fec90>,
 <unstructured.documents.elements.Text at 0x34c701b50>,
 <unstructured.documents.elements.Text at 0x32c0e5c90>,
 <unstructured.documents.elements.Text at 0x32be80090>,
 <unstructured.documents.elements.Text at 0x32be82650>,
 <unstructured.documents.elements.Header at 0x32c34cfd0>,
 <unstructured.documents.elements.Text at 0x31f9eadd0>,
 <unstructured.documents.elements.Text at 0x32c302290>,
 <unstructured.documents.elements.Text at 0x32c47e9d0>,
 <unstructured.documents.elements.Text at 0x34c703650>,
 <unstructured.documents.elements.Text at 0x34c703a10>,
 <unstructured.documents.elements.Text at 0x34c703c10>,
 <unstructured.documents.elements.Text at 0x34c700f90>,
 <unstructured.documents.elements.Text at 0x34c700d10>,
 <unstructured.documents.elements.Text at 0x34c703490>,
 <unstructured.documents.elements.NarrativeText at 0x32c4069d0>,
 <unstructured.documents.elements.Title at 0x32c2fe150>,
 <unstructured.documents.elements.Te

In [9]:
categories = set([el.category for el in elements])
categories

{'FigureCaption',
 'Footer',
 'Formula',
 'Header',
 'Image',
 'ListItem',
 'NarrativeText',
 'Table',
 'Title',
 'UncategorizedText'}

In [10]:
title = [el for el in elements if el.category == "Image"]
title[4].to_dict()

{'type': 'Image',
 'element_id': 'df2c354e8d29a4971237044723aa2304',
 'text': '<ped> <ped> <SOa> — 70” UOIUIdO == = uoluldo Aw — Aw ul ul Bulssiw Bulssiw ale « ae aM = aM yeum = yeum S| v2 S| sy si ysnf ysnf 3q° 3q Pinoys Pinoys uojeojdde Ss}! nq jopied 3q JO@AoU me) aul <ped> <SOa> uojuido Aw ul Bulssiw oe aM yeum S| su} ysnf 3q Pinoys uojeoydde si! ynq yooped 3q JOABU meq au <ped> <SOa> uoluldo Aw ul Bulssiw oe OM yeum S| Siu} ysnf 3q Pinoys uoyeodde si! ynq yooped 3q aul',
 'metadata': {'detection_class_prob': 0.8002366423606873,
  'coordinates': {'points': ((np.float64(339.7367858886719),
     np.float64(416.0276794433594)),
    (np.float64(339.7367858886719), np.float64(1635.7767333984375)),
    (np.float64(1386.38525390625), np.float64(1635.7767333984375)),
    (np.float64(1386.38525390625), np.float64(416.0276794433594))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-12-08T11:21:10',
  'filetype': 'application/pdf',
  'l

In [11]:
chunks = rag.create_chunks_by_title(elements)

Chunked into 25 sections based on titles.


In [12]:
chunks[0].to_dict()

{'type': 'CompositeElement',
 'element_id': '2d892f76-822f-4eae-a2ae-e2c648e87ec5',
 'text': '3\n\n2023\n\n2\n\n0\n\n2\n\ng u A 2 ] L C . s c [ 7 v 2 6 7 3 0 . 6 0\n\n7\n\n1\n\n:\n\nv\n\narXiv\n\ni\n\nX\n\nr\n\na\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani∗\n\nGoogle Brain\n\navaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗\n\nGoogle Research usz@google.com\n\nLlion Jones∗\n\nGoogle Research llion@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com',
 'metadata': {'file_directory': 'pdf',
  'filename': 'Attention-is-all-you-need.pdf',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'last_modified': '2025-12-08T11:21:10',
  'page_number':

In [13]:
langchain_documents = rag.create_document_langchain(chunks)


Processing 0/25 chunk...
No mixed content - using text only.
Processing 1/25 chunk...
No mixed content - using text only.
Processing 2/25 chunk...
No mixed content - using text only.
Processing 3/25 chunk...
No mixed content - using text only.
Processing 4/25 chunk...
Generating AI summary for mixed content chunk ...
Successfully generated AI summary
Processing 5/25 chunk...
No mixed content - using text only.
Processing 6/25 chunk...
Generating AI summary for mixed content chunk ...
Successfully generated AI summary
Processing 7/25 chunk...
No mixed content - using text only.
Processing 8/25 chunk...
No mixed content - using text only.
Processing 9/25 chunk...
No mixed content - using text only.
Processing 10/25 chunk...
No mixed content - using text only.
Processing 11/25 chunk...
Generating AI summary for mixed content chunk ...
Successfully generated AI summary
Processing 12/25 chunk...
No mixed content - using text only.
Processing 13/25 chunk...
No mixed content - using text only

In [14]:
langchain_documents[4].metadata

{'original_content': '{"text": "3 Model Architecture\\n\\nMost competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 35]. Here, the encoder maps an input sequence of symbol representations (x1,...,xn) to a sequence of continuous representations z = (z1,...,zn). Given z, the decoder then generates an output sequence (y1,...,ym) of symbols one element at a time. At each step the model is auto-regressive [10], consuming the previously generated symbols as additional input when generating the next.\\n\\n2\\n\\nOutput Probabilities Add & Norm Feed Forward Add & Norm Multi-Head Attention a, Add & Norm Add & Norm Feed Forward Nx | Cag Norm) Add & Norm Masked Multi-Head Multi-Head Attention Attention Se a, Lt Positional Positional Encoding @ \\u00a9 OY Encoding Input Output Embedding Embedding Inputs Outputs (shifted right)\\n\\nFigure 1: The Transformer - model architecture.\\n\\nThe Transformer follows this overall architecture using stacked self-attention 

In [36]:
db = rag.create_vector_store(langchain_documents)

CReatig vector store in directory: ./chroma_db ...
__Finished storing to vector datastore.__


In [49]:
q = "what is the attention dot product formula?"
retrieved_chunks = rag.retriever(query = q, vector_store=db)
answer  = rag.generate_answer(retrieved_chunks, query=q)   

answer

{'page_links': ['./pdf_images/Attention-is-all-you-need/page_5.png',
  './pdf_images/Attention-is-all-you-need/page_3.png',
  './pdf_images/Attention-is-all-you-need/page_4.png'],
 'final_answer': 'The provided documents describe two types of attention mechanisms used in neural networks: Scaled Dot-Product Attention and Multi-Head Attention.\n\n1. **Scaled Dot-Product Attention**:\n   - It involves queries, keys, and values with dimensions \\(d_k\\) and \\(d_v\\).\n   - The attention is computed by taking the dot product of queries and keys, scaling by \\(\\frac{1}{\\sqrt{d_k}}\\), and applying a softmax function to obtain weights. These weights are then used to compute a weighted sum of the values.\n   - This method is efficient due to optimized matrix multiplication and is faster than additive attention, especially for small \\(d_k\\).\n\n2. **Multi-Head Attention**:\n   - Instead of a single attention function, it uses multiple attention heads.\n   - Queries, keys, and values are li

In [31]:
text_dict = json.loads(retrieved_chunks[0].metadata["original_content"])
text_dict["page_numbers"]

['./pdf_images/Attention-is-all-you-need/page_4.png']

In [13]:
def export_chunks_to_json(chunks):
    export_list = []
    for idx, chunk in enumerate(chunks):
        content_dict = {
            "enhanced_data": chunk.page_content,
            "metadata": {"original_content" : json.loads(chunk.metadata.get("original_content", "{}"))}
        }
        export_list.append(content_dict)
    
    with open("retrieved_chunks.json", "w") as f:
        json.dump(export_list, f, indent=4)
    
    print(f"Exported {len(chunks)} chunks to retrieved_chunks.json")


In [14]:
export_chunks_to_json(langchain_documents)

Exported 314 chunks to retrieved_chunks.json


In [30]:
table = [el for el in elements if el.category == "Table"]

In [31]:
table[0].to_dict()

{'type': 'Table',
 'element_id': '6523cf5671e0ed47bfbad236ec646135',
 'text': 'What is an approved document? How is construction regulated in England? How do you comply with the Building Regulations? What do the Building Regulations cover? When must a building control body be notified? How to use this approved document Where to get further help Summary Arrangement of sections Management of premises Property protection Inclusive design Alternative approaches Purpose groups Mixed use buildings Intention General provisions Large dwellinghouses Extensions and material alterations Blocks of flats Student accommodation Sheltered housing Design and installation of systems Escape from the ground storey Escape from upper storeys a maximum of 4.5m above ground level Escape from upper storeys more than 4.5m above ground level General provisions Work on existing dwellinghouses Introduction General provisions Flats with upper storeys a maximum of 4.5m above ground level i i i i ii ii iii iv 1 1 1 2

In [43]:
!which python

/Users/parineetaborah/anaconda3/envs/RAG_QA/bin/python
