In [None]:
# Core dependencies
!pip install PyMuPDF
!pip install faiss-cpu
!pip install pillow torch numpy sklearn

# LangChain (core + community)
!pip install langchain langchain-core langchain-community langchain_google_genai

# Hugging Face Transformers
!pip install transformers

# Optional: if you want Hugging Face Hub access
!pip install huggingface_hub


[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m


In [24]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [11]:
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

In [12]:
def embed_image(image_data):
    """Embed image using SigLip"""
    if isinstance(image_data, str):
        image = Image.open(image_data).convert("RGB")
    else:
        image = image_data

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        features = model.get_image_features(**inputs)
        # Normalize embedding to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

def embed_text(text_data):
    """Embed image using SigLip"""
    inputs = processor(
        text=text_data,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=64 # Max token length
    )

    with torch.no_grad():
        features = model.get_text_features(**inputs)
        # Normalize embedding to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [14]:
pdf_path = "multimodal_sample.pdf"
doc = fitz.open(pdf_path)
# Storage for all docs and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [15]:
doc

Document('multimodal_sample.pdf')

In [16]:
for i,page in enumerate(doc):
    text = page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text, metadata={"page":i, "type":"text"})
        text_chunk = splitter.split_documents([temp_doc])

        for chunk in text_chunk:
            embeddings = embed_text(chunk.page_content)
            all_embeddings.append(embeddings)
            all_docs.append(chunk)

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)

        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

In [17]:
all_embeddings

[array([-4.80444171e-03, -4.43842337e-02,  3.40856686e-02, -5.87923545e-03,
         4.45601577e-03, -9.58190206e-03,  1.05223861e-02, -1.28613897e-02,
        -2.10767500e-02, -3.67622077e-02,  7.33973039e-03, -3.27888094e-02,
        -2.34855395e-02,  2.66038105e-02,  2.10805666e-02,  1.47640733e-02,
        -3.07694357e-02,  2.64615752e-02,  2.60461681e-02, -7.36000715e-04,
         8.61731917e-03, -2.55103000e-02, -1.43605620e-02, -9.90797020e-03,
        -3.99926677e-03,  1.93253197e-02, -4.21913603e-04, -2.57719215e-03,
         7.26880180e-03, -1.63431037e-02, -2.57167080e-03, -1.73817645e-03,
         4.20846939e-02,  3.65897678e-02, -2.46989913e-02,  2.12527383e-02,
         1.63387991e-02, -2.87898332e-02,  3.31795439e-02, -2.16645319e-02,
         1.50887528e-03, -2.78855301e-02, -2.20149066e-02, -1.64730623e-02,
        -1.94181781e-02, -2.26013698e-02,  3.04779466e-02,  2.12310888e-02,
         6.43275585e-03,  7.67302467e-03, -3.10828933e-03,  1.59335788e-02,
        -8.9

In [18]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [19]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.00480444, -0.04438423,  0.03408567, ..., -0.00219321,
         0.05540271, -0.01406366],
       [ 0.0145873 , -0.01813984, -0.03447667, ..., -0.00179562,
         0.02144864,  0.00956382]], dtype=float32)

In [28]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x7d2843ec6db0>

In [40]:
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3)
llm

ChatGoogleGenerativeAI(model='models/gemini-2.5-flash', google_api_key=SecretStr('**********'), temperature=0.3, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x7d27f32fc9b0>, default_metadata=(), model_kwargs={})

In [41]:
def retrieve_multimodal(query, k=5):
  """Unified retrival using Google embedding for both text and images"""
  # Embed query using SigLip
  query_embedding = embed_text(query)

  # Search in unified vector store
  results = vector_store.similarity_search_by_vector(
      embedding=query_embedding,
      k=k
)

  return results

In [42]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for Gemini."""
    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })

    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)

In [43]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)

    # Create multimodal message
    message = create_multimodal_message(query, context_docs)

    # Get response from LLM
    response = llm.invoke([message])

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response.content

In [44]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What does the chart on page 1 show about revenue trends?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What does the chart on page 1 show about revenue trends?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The chart on page 0 (which is likely referred to as page 1 in the question) shows a steady increase in revenue across three periods (implied as Q1, Q2, and Q3 by the accompanying text). The first bar (blue) is the shortest, the second bar (green) is taller, and the third bar (red) is the tallest, visually representing that revenue grew, with the highest revenue recorded in the third period.

Query: Summarize the main findings from the document
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The document summariz