Multi Model Rag(Pdf+Images)

In [1]:
pip install PyMuPDF


Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install langchain



In [4]:
pip install --upgrade langchain langchain-core


Note: you may need to restart the kernel to use updated packages.


In [5]:
!pip install langchain_community



In [6]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


In [7]:
pip install grpcio


Note: you may need to restart the kernel to use updated packages.


In [8]:
import google.generativeai as genai

In [9]:
## CLIP MODEL
import os
from dotenv import load_dotenv
load_dotenv()

# set the environment
os.environ["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")
model = genai.GenerativeModel("gemini-1.5-flash")

## initialize CLIP model for unified text and image embeddings
## this model is responsible for generating embeddings for both text and images
clip_model= CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()  # Set the model to evaluation mode



CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [10]:
### Embedding functions
def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data, str):  # If image is given aspath
        image = Image.open(image_data).convert("RGB")
    else:  # If PIL Image(if it is image or base64 data)
        image = image_data
    
    inputs=clip_processor(images=image,return_tensors="pt")# Process the image for CLIP and convert to tensors
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector,every image dimensions will be diffrent we are normalizing it to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()# Convert to numpy array for consistency with text embeddings
    
def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text, 
        return_tensors="pt", 
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length 
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()


In [11]:
## Process PDF
pdf_path="C:\\Users\\my\\Downloads\\multimodal_sample.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)


In [12]:

doc

Document('C:\Users\my\Downloads\multimodal_sample.pdf')

In [13]:
# In the below codes,we are processing the text and images in the pdf document and then creating chunks of text and images,and then embedding them using CLIP for retrieval of each chunk.
# and we will store the image data as base64 for later use with gemini-1.5-flash
for i,page in enumerate(doc):
    ## process text
    text=page.get_text()
    if text.strip():# remove empty text pages
        ##create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        #Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)



    ## process images
    ##Three Important Actions:

    ##Convert PDF image to PIL format
    ##Store as base64 for GPT-4V (which needs base64 images)
    ##Create CLIP embedding for retrieval

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            
            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"# Unique ID for each image based on page and index
            
            # Store image as base64 for later use with gemini-1.5-flash
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            
            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)
            
            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
            
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()


In [14]:
all_embeddings


[array([-2.67246389e-03,  1.28299538e-02, -5.18313907e-02,  4.14879471e-02,
        -2.33942270e-02, -7.55863590e-03, -3.67659368e-02,  1.19710743e-01,
         8.52081031e-02,  2.05426919e-03, -1.11534707e-02, -1.29592344e-02,
         5.25014661e-02, -3.65395262e-03,  4.76078540e-02,  1.58372913e-02,
         2.03388222e-02,  4.35362346e-02, -3.29168839e-03,  2.03181524e-02,
         1.88023748e-03, -4.23493721e-02,  5.44101652e-03,  3.70935909e-02,
        -1.65623091e-02,  6.48648385e-03, -4.78012115e-02,  8.67485069e-03,
         5.88859469e-02, -3.21393870e-02,  4.32440080e-02,  9.65301227e-03,
        -4.47925227e-03, -1.94858033e-02, -3.63502689e-02, -1.23471674e-02,
        -2.17929296e-02, -1.99016202e-02,  8.09619948e-02, -3.32986712e-02,
        -2.38901097e-02, -3.96138914e-02, -1.27280056e-02,  3.50381061e-02,
        -2.52217092e-02,  2.00030603e-03,  1.49660362e-02, -2.31976788e-02,
        -6.86791390e-02, -5.25758544e-04, -2.22545750e-02, -1.04103908e-02,
        -1.9

In [15]:

all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [16]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array


array([[-0.00267246,  0.01282995, -0.05183139, ..., -0.00385086,
         0.02977718, -0.00010684],
       [ 0.01732344, -0.01327689, -0.0242703 , ...,  0.08994047,
        -0.00272156,  0.03253041]], dtype=float32)

In [17]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')],
 array([[-0.00267246,  0.01282995, -0.05183139, ..., -0.00385086,
          0.02977718, -0.00010684],
        [ 0.01732344, -0.01327689, -0.0242703 , ...,  0.08994047,
         -0.00272156,  0.03253041]], dtype=float32))

In [18]:
pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [19]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings,we already have embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x1a3f2977260>

In [20]:
import google.generativeai as genai
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
llm = genai.GenerativeModel("gemini-1.5-flash")
llm


genai.GenerativeModel(
    model_name='models/gemini-1.5-flash',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

In [22]:
# it is just like a simple retrieval function that uses CLIP embeddings for both text and images
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    
    return results


In [23]:
# here we are creating a message that can be used with GPT-4V or Gemini-1.5-Flash
# This message will contain both text and images, formatted appropriately for multimodal models.
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for GPT-4V."""
    content = []
    
    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
    
    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    return HumanMessage(content=content)


In [25]:
# Convert LangChain human message to Gemini parts
# This function extracts text and image parts from a LangChain human message
# storing images as base64 for Gemini-1.5-Flash compatibility
import base64

def langchain_to_gemini_parts(human_message):
    parts = []
    for item in human_message.content:
        if item["type"] == "text":
            parts.append(item["text"])
        elif item["type"] == "image_url":
            # Extract base64 part from data URI
            b64_data = item["image_url"]["url"].split(",")[1]
            img_bytes = base64.b64decode(b64_data)
            parts.append({"mime_type": "image/png", "data": img_bytes})
    return parts


In [26]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)
    
    # Create multimodal message (LangChain HumanMessage)
    message = create_multimodal_message(query, context_docs)
    
    # Convert to Gemini format
    gemini_input = langchain_to_gemini_parts(message)
    
    # Call Gemini
    response = llm.generate_content(gemini_input)
    
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response.text


In [27]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [28]:
pip install grpcio --upgrade


Note: you may need to restart the kernel to use updated packages.


In [29]:
pip install google-generativeai --upgrade


Note: you may need to restart the kernel to use updated packages.


In [30]:
pip install protobuf --upgrade


Collecting protobuf
  Using cached protobuf-6.32.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Using cached protobuf-6.32.1-cp310-abi3-win_amd64.whl (435 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.5
    Uninstalling protobuf-5.29.5:
      Successfully uninstalled protobuf-5.29.5
Successfully installed protobuf-6.32.1
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
googleapis-common-protos 1.66.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 6.32.1 which is incompatible.
google-ai-generativelanguage 0.6.15 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 6.32.1 which is incompatible.
grpcio-status 1.70.0 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 6.32.1 which is incompatible.
proto-plus 1.26.0 requires protobuf<6.0.0dev,>=3.19.0, but you have protobuf 6.32.1 which is incompatible.
streamlit 1.32.0 requires packaging<24,>=16.8, but you have packaging 24.2 which is incompatible.
streamlit 1.32.0 requires protobuf<5,>=3.20, but you have protobuf 6.32.1 which is incompatible.
tensorboard 2

In [31]:
pip install google-api-core google-auth google-cloud-core --upgrade


Collecting protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 (from google-api-core)
  Using cached protobuf-5.29.5-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Using cached protobuf-5.29.5-cp310-abi3-win_amd64.whl (434 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.32.1
    Uninstalling protobuf-6.32.1:
      Successfully uninstalled protobuf-6.32.1
Successfully installed protobuf-5.29.5
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.32.0 requires packaging<24,>=16.8, but you have packaging 24.2 which is incompatible.
streamlit 1.32.0 requires protobuf<5,>=3.20, but you have protobuf 5.29.5 which is incompatible.
tensorboard 2.17.0 requires protobuf!=4.24.0,<5.0.0,>=3.19.6, but you have protobuf 5.29.5 which is incompatible.
tensorflow-intel 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.5 which is incompatible.


In [32]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What does the chart on page 1 show about revenue trends?"
    ]
for i, query in enumerate(queries, 1):
    print(f"\n[{i}/{len(queries)}] Query: {query}")
    print("-" * 50)
    answer = multimodal_pdf_rag_pipeline(query)
    print(f"Answer: {answer}")
    print("=" * 70)

    



[1/1] Query: What does the chart on page 1 show about revenue trends?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The chart on page 0 shows a steady increase in revenue across three quarters (Q1, Q2, and Q3).  Revenue was lowest in Q1, increased in Q2, and saw the highest growth in Q3.



In [33]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "Summarize the main findings from the document?"
   ]
for i, query in enumerate(queries, 1):
    print(f"\n[{i}/{len(queries)}] Query: {query}")
    print("-" * 50)
    answer = multimodal_pdf_rag_pipeline(query)
    print(f"Answer: {answer}")
    print("=" * 70)

    



[1/1] Query: Summarize the main findings from the document?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: Revenue showed steady growth across Q1, Q2, and Q3, with the highest growth occurring in Q3.  Q1 saw moderate growth attributed to new product lines. Q2 exceeded Q1's performance due to successful marketing campaigns.  Q3 experienced exponential growth, driven by global expansion.



In [37]:
if __name__ == "__main__":
    # Example queries
    queries = [
        
        "What visual elements are present in the document?"
    ]
for i, query in enumerate(queries, 1):
    print(f"\n[{i}/{len(queries)}] Query: {query}")
    print("-" * 50)
    answer = multimodal_pdf_rag_pipeline(query)
    print(f"Answer: {answer}")
    print("=" * 70)

    



[1/1] Query: What visual elements are present in the document?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The document contains a bar chart.  The chart uses three vertically-oriented bars of different heights and colors to represent data: blue, green, and red.  The bars increase in height from left to right.

