# Multimodal RAG (PDF with Images)

In [2]:
import fitz
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain_classic.chat_models import init_chat_model
from langchain_classic.prompts import PromptTemplate
from langchain_classic.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()

# One checkpoint id shared by the model and its processor so the two stay in sync.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# Switch to inference mode; as the last expression this also displays the model.
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
def embed_image(image_data):
    """Embed an image with CLIP and return a unit-length feature vector.

    Args:
        image_data: Either a filesystem path (``str`` or any ``os.PathLike``
            object such as ``pathlib.Path``) or an already-loaded PIL image.

    Returns:
        numpy.ndarray: The CLIP image features, L2-normalised along the last
        dimension and with singleton dimensions squeezed out.
    """
    # Accept path-like objects as well as plain strings; previously a
    # pathlib.Path fell through to the "already an image" branch below.
    if isinstance(image_data, (str, os.PathLike)):
        image = Image.open(image_data).convert("RGB")
    else:  # treated as an already-loaded PIL image
        image = image_data

    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalise to unit length so a dot product equals cosine similarity.
        features = features / features.norm(dim=-1, keepdim=True)
    return features.squeeze().numpy()
    
def embed_text(text):
    """Return the unit-normalised CLIP text embedding of ``text`` as a NumPy array.

    The input is padded and truncated to at most 77 tokens before encoding.
    """
    tokenized = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,  # CLIP's max token length
    )
    with torch.no_grad():
        raw_features = clip_model.get_text_features(**tokenized)
        # Scale to unit length along the feature dimension.
        unit_features = raw_features / raw_features.norm(dim=-1, keepdim=True)
    return unit_features.squeeze().numpy()

In [5]:
# Source PDF and its open PyMuPDF handle.
pdf_path = "multimodal_sample.pdf"
doc = fitz.open(pdf_path)

# Accumulators filled during extraction: documents and their embedding vectors.
all_docs, all_embeddings = [], []
# image_id -> base64-encoded image, kept so the raw image can be passed to the LLM.
image_data_store = dict()

# Chunker applied to each page's text.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

In [6]:
# Display the opened PDF handle as a quick check that the file was loaded.
doc

Document('multimodal_sample.pdf')

In [7]:
# Walk every page, collecting CLIP embeddings for text chunks and embedded images.
# `i` is the 0-based page index from enumerate(); it is what ends up in metadata["page"].
for i, page in enumerate(doc):
    # --- Text: split the page text into chunks and embed each chunk with CLIP ---
    text = page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)

    # --- Images ---
    # 1. Convert the PDF image to PIL format
    # 2. Encode it as base64 PNG so it can be sent to the multimodal LLM
    # 3. Create a CLIP embedding for retrieval
    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to a PIL image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Unique identifier tying the placeholder document to the stored image
            image_id = f"page_{i}_img_{img_index}"

            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Embed using CLIP
            embedding = embed_image(pil_image)

            image_doc = Document(
                page_content=f"[Image ={image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
        except Exception as e:
            print(f"error processing {img_index} on page {i}: {e}")
            continue

        # Commit only after every step succeeded, so a failure part-way through
        # cannot leave an orphaned entry in image_data_store or push
        # all_embeddings out of step with all_docs.
        image_data_store[image_id] = img_base64
        all_embeddings.append(embedding)
        all_docs.append(image_doc)

doc.close()

In [8]:
# Inspect the collected text-chunk and image-placeholder documents.
# NOTE(review): this displays the entire list; consider a slice such as all_docs[:5] to keep the output short.
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='BASICS \nOf  \nANTENNAS\nSHUBHENDU JOARDAR\nB.Tech. (Electronics, NIT Calicut)\nM.S. (Microwaves, IIT Madras) \nF.I.E.T.E. (IETE, India)\nPh.D. (Physics, University of Kalyani)'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image =page_0_img_0]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_1'}, page_content='[Image =page_0_img_1]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_2'}, page_content='[Image =page_0_img_2]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_3'}, page_content='[Image =page_0_img_3]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_4'}, page_content='[Image =page_0_img_4]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_5'}, page_content='[Image =page_0_img_5]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'p

In [9]:
# Combine the collected embedding vectors into a single NumPy array and display it.
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.00935077,  0.03095548,  0.00706517, ...,  0.0014922 ,
         0.03092174, -0.03133981],
       [-0.01013921, -0.01773097,  0.023527  , ...,  0.09584088,
        -0.01513974, -0.0133103 ],
       [ 0.01367843, -0.03702171,  0.02704564, ...,  0.08849805,
        -0.02249452,  0.02201668],
       ...,
       [ 0.00104644,  0.01290469, -0.01449649, ...,  0.05046619,
        -0.00209042,  0.00692558],
       [-0.03101584, -0.002242  , -0.01521717, ...,  0.09071334,
        -0.00698164, -0.02486183],
       [-0.01882173, -0.02554945,  0.00750832, ...,  0.07116207,
         0.01088675, -0.0119858 ]], shape=(712, 512), dtype=float32)

In [10]:
# Pair each document's text with its precomputed CLIP vector, and collect the
# metadata in the same order.
text_embedding_pairs = [
    (item.page_content, vector) for item, vector in zip(all_docs, embeddings_array)
]
item_metadatas = [item.metadata for item in all_docs]

# Build the FAISS index from the precomputed vectors; no embedding object is supplied.
vector_store = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,
    embedding=None,
    metadatas=item_metadatas,
)

vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x2329b60cd90>

In [11]:
# Create the chat model used for answer generation and display it.
# NOTE(review): presumably relies on a Groq API key being present in the environment (e.g. via load_dotenv()) — confirm.
llm= init_chat_model("groq:meta-llama/llama-4-maverick-17b-128e-instruct")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x00000232A1EF4650>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000023308261090>, model_name='meta-llama/llama-4-maverick-17b-128e-instruct', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [12]:
def retrieve_multimodal(query, k=5):
    """Return the ``k`` stored documents closest to ``query``.

    The query is embedded with ``embed_text`` and the resulting vector is
    searched against ``vector_store``.
    """
    return vector_store.similarity_search_by_vector(
        embedding=embed_text(query),
        k=k,
    )

In [13]:
def create_multimodal_message(query, retrieved_docs):
    """Build a single HumanMessage containing the question, text excerpts and images.

    Args:
        query: The user's question.
        retrieved_docs: Documents whose ``metadata["type"]`` is ``"text"`` or
            ``"image"``; image documents carry an ``"image_id"`` that is looked
            up in ``image_data_store``.

    Returns:
        HumanMessage: Its content is a list of parts: the question, an optional
        block of text excerpts, a caption plus base64 PNG data URL for each
        image found in ``image_data_store``, and a closing instruction.
    """
    content = []

    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type")=="text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type")=="image"]

    if text_docs:
        text_content ="\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])

        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_content}\n"
            })

    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        # Skip image documents whose payload was never stored.
        if image_id and image_id in image_data_store:
            # Index with the plain key 'page'; the previous [['page']] used a
            # list as the dict key and raised TypeError (unhashable type: 'list').
            content.append({
            "type": "text",
            "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type":"image_url",
                "image_url": {
                    "url":f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    content.append({
        "type":"text",
        "text":"\n\nPlease answer the question based on the provided text and images"
    })

    return HumanMessage(content=content)


In [14]:
def multimodal_pdf_rag_pipeline(query):
    """Answer ``query`` by retrieving context, prompting the LLM and logging the hits.

    Retrieves five documents, builds one multimodal message from them, invokes
    ``llm``, prints a short summary of each retrieved document, and returns the
    content of the model's response.
    """
    context_docs = retrieve_multimodal(query, k=5)
    response = llm.invoke([create_multimodal_message(query, context_docs)])

    print(f"\nRetrieved {len(context_docs)} documents:")
    for retrieved in context_docs:
        page = retrieved.metadata.get("page", "?")
        if retrieved.metadata.get("type", "unknown") != "text":
            print(f"    - Image from page {page}")
            continue
        body = retrieved.page_content
        # Show at most the first 100 characters of a text chunk.
        preview = body if len(body) <= 100 else body[:100] + "..."
        print(f"    - Text from page {page}: {preview}")
    print("\n")

    return response.content

In [15]:
if __name__ == "__main__":
    # Sample questions run through the pipeline one after another.
    queries = [
        "What does the picture on page 8 tell about radiating near and far fields?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?",
    ]

    thin_rule = "-" * 50
    thick_rule = "=" * 70
    for query in queries:
        print(f"\nQuery: {query}")
        print(thin_rule)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"\nAnswer: {answer}")
        print(thick_rule)


Query: What does the picture on page 8 tell about radiating near and far fields?
--------------------------------------------------

Retrieved 5 documents:
    - Text from page 35: Assignment Problems-I
1. Explain the mechanism of radiation and reception using an antenna. 
2. Expl...
    - Text from page 1: should not aim to use this for destructive, 
non-scientific or non-educational purposes.
           ...
    - Text from page 39: THANK YOU
    - Text from page 38: Assignment Problems-IV
18. Explain the antenna reciprocity theorem and its implications 
towards rad...
    - Text from page 7: Radiating Near and Far Fields 
The field patterns generated by a 
radiating 
antenna 
vary 
with 
di...



Answer: ## Step 1: Understand the context of the question
The question is asking to interpret the information given on page 8 about radiating near and far fields, but the content of page 8 is not directly provided. However, we have information from page 7 that discusses the radiating near a