In [2]:
# from frontend import *
# import tools
import fitz # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_ollama import OllamaLLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# !pip install frontend
# !pip install tools




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Loading CLIP Model
import os
from dotenv import load_dotenv
load_dotenv()

# os.environ

# Initializing the CLIP Model for Unified Embeddings from
# This model is responsible for converting text and images to embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
# Embedding functions
# For Images
def embed_image(image_data):
    """Embed image using CLIP Model"""
    if isinstance(image_data, str):
        image=Image.open(image_data).convert("RGB")
    else:
        image=image_data

    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()
    
# For Texts
def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text, 
        return_tensors="pt", 
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [5]:
# Process PDF

# pdf_path = "C:\Saurabh\Nakul_T4\Multi-Modal_RAG\multimodal_sample.pdf"
pdf_path = "multimodal_sample.pdf"
doc=fitz.open(pdf_path)

all_docs = []
all_embeddings = []
image_data_store = {} # for storing actual image data for LLM

# Text Splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [6]:
doc

Document('multimodal_sample.pdf')

In [7]:
for i, page in enumerate(doc):
    # process texts
    text = page.get_text()
    if text.strip():
        # create a temporary documnet for splitting
        temp_doc = Document(page_content=text, metadata={"page":i, "type":"text"})
        text_chunks = splitter.split_documents([temp_doc])

    # Embed each chunk using CLIP
    for chunk in text_chunks:
        embedding = embed_text(chunk.page_content)
        all_embeddings.append(embedding)
        all_docs.append(chunk)

    # process images
    # Three(3) important actions:
    # Convert PDF image to PIL Format
    # Store in base64 format for multi-modal LLM(which need base64 images)
    # Create CLIP embeddings for retrieval

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref=img[0]
            base_image=doc.extract_image(xref)
            image_bytes=base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"
            
            # Store image as base64 for later use with Multi-Modal LLM
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            
            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)
            
            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
            
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

In [8]:
all_embeddings

[array([-2.67243292e-03,  1.28300078e-02, -5.18314131e-02,  4.14879359e-02,
        -2.33941767e-02, -7.55864056e-03, -3.67659293e-02,  1.19710699e-01,
         8.52080807e-02,  2.05426570e-03, -1.11534707e-02, -1.29592167e-02,
         5.25014512e-02, -3.65391700e-03,  4.76078540e-02,  1.58372968e-02,
         2.03388296e-02,  4.35362011e-02, -3.29169002e-03,  2.03181449e-02,
         1.88025483e-03, -4.23493870e-02,  5.44100394e-03,  3.70935723e-02,
        -1.65623091e-02,  6.48645870e-03, -4.78012003e-02,  8.67485628e-03,
         5.88859506e-02, -3.21394131e-02,  4.32440154e-02,  9.65301972e-03,
        -4.47924202e-03, -1.94857828e-02, -3.63503024e-02, -1.23471608e-02,
        -2.17929389e-02, -1.99016184e-02,  8.09619799e-02, -3.32986601e-02,
        -2.38901339e-02, -3.96138802e-02, -1.27279945e-02,  3.50380838e-02,
        -2.52217259e-02,  2.00031837e-03,  1.49660185e-02, -2.31976416e-02,
        -6.86791167e-02, -5.25787182e-04, -2.22545844e-02, -1.04104038e-02,
        -1.9

In [9]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [10]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.00267243,  0.01283001, -0.05183141, ..., -0.00385086,
         0.02977715, -0.00010682],
       [ 0.01732343, -0.0132769 , -0.0242703 , ...,  0.08994047,
        -0.00272156,  0.03253041]], dtype=float32)

In [11]:
(all_docs, embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')],
 array([[-0.00267243,  0.01283001, -0.05183141, ..., -0.00385086,
          0.02977715, -0.00010682],
        [ 0.01732343, -0.0132769 , -0.0242703 , ...,  0.08994047,
         -0.00272156,  0.03253041]], dtype=float32))

In [12]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None, # Since we are using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [13]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x25fbe54e310>

In [14]:
# Initialize Multi-Modal LLM

llm = OllamaLLM(model="qwen3-vl:8b")
llm

OllamaLLM(model='qwen3-vl:8b')

In [15]:
def retrieve_multimodal(query, k=3):
    """Unified retrieval using CLIP embeddings for both text and images"""
    # Embed the user query using CLIP
    query_embedding = embed_text(query)

    # Search in unfied vector_store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    return results

In [None]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for Multi-Modal LLM."""
    content = []
    
    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
    
    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    return HumanMessage(content=content)

In [None]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=3)
    
    # Create multimodal message
    message = create_multimodal_message(query, context_docs)
    
    # Get response from Multi-Modal LLM
    response = llm.invoke([message])
    
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response

In [18]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What does the chart on page 1 show about revenue trends?"
        # "Summarize the main findings from the document",
        # "What visual elements are present in the document?"
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What does the chart on page 1 show about revenue trends?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: Based on the provided context, the chart referenced in **Page 0** (not Page 1, as the context explicitly labels the document section as "Page 0") shows the following revenue trends:  
- **Steady overall growth** across Q1, Q2, and Q3.  
- **Highest growth in Q3**, described as "exponential" due to global expansion.  
- **Q1**: Moderate increase driven by new product lines.  
- **Q2**: Outperformed Q1 due to successful marketing campaigns.  

**Note**: The context specifies that the chart is on **Page 0** (labeled as "Page 0" in the text), not Page 1. If the document includes a "Page 1," it is **not referenced** in the provided context, so no information about Page 1 is available. The answer is