In [51]:
import fitz
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
import os
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [52]:
## CLIP model and API credentials
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Re-assigning os.environ["OPENAI_API_KEY"] from os.getenv() is a no-op when the
# key is present and raises an opaque "str expected, not NoneType" TypeError
# when it is absent, so validate explicitly and fail with a readable message.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )

## Initialize the CLIP model and processor for unified embedding
# Single source of truth for the checkpoint so model and processor cannot drift.
CLIP_MODEL_NAME="openai/clip-vit-base-patch32"
clip_model=CLIPModel.from_pretrained(CLIP_MODEL_NAME)
clip_processor=CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

In [53]:
# Switch the CLIP model to evaluation mode for inference. eval() returns the
# module itself, which is why this cell's output is the model architecture repr.
clip_model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [54]:
## Embedding function

def embed_image(image_data):
    """Embed an image using the CLIP model.

    Args:
        image_data: Either a filesystem path (``str`` or ``os.PathLike``) to an
            image file, or an already-loaded PIL image. Note that a string is
            treated as a *path*; base64 payloads are not decoded here and must
            be turned into a PIL image by the caller first.

    Returns:
        The CLIP image embedding, scaled to unit L2 norm, as a NumPy array with
        singleton dimensions squeezed out.
    """
    if isinstance(image_data, (str, os.PathLike)):
        # Image.open is lazy and holds the file handle open. convert() forces the
        # pixel data to load into a new image, so the handle can be released
        # immediately instead of lingering until garbage collection.
        with Image.open(image_data) as opened:
            image=opened.convert("RGB")
    else:
        image=image_data
    # Preprocess into PyTorch tensors in the layout the CLIP model expects.
    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad(): # Inference only: skip gradient bookkeeping
        features=clip_model.get_image_features(**inputs)
        # Unit-normalize so image and text embeddings are directly comparable.
        features=features/features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

def embed_text(text):
    """Embed text using the CLIP model.

    The text is tokenized with padding and truncated to at most 77 tokens,
    passed through the CLIP text tower, scaled to unit L2 norm and returned
    as a squeezed NumPy array.
    """
    encoded = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        raw_features = clip_model.get_text_features(**encoded)
        unit_features = raw_features / raw_features.norm(dim=-1, keepdim=True)
        return unit_features.squeeze().numpy()

In [55]:
## Process PDF
pdf_path = "SUDOKO.pdf"
doc = fitz.open(pdf_path)

# Accumulators filled while walking the PDF:
#   all_docs         - one Document per text chunk / image
#   all_embeddings   - the CLIP embedding for each entry of all_docs
#   image_data_store - actual image data kept for the LLM
all_docs, all_embeddings = [], []
image_data_store = {}

# Splits page text into smaller, overlapping chunks before embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

In [56]:

# Walk every page of the PDF, embedding its text chunks and images with CLIP.
# all_docs and all_embeddings are only ever appended to as a pair, so index n in
# one always describes index n in the other.
# The try/finally guarantees the PDF handle is released even if a page fails.
try:
    for i, page in enumerate(doc):
        # ---- Text ----
        text=page.get_text()
        if text.strip():
            # Wrap the page text in a temporary Document so the splitter can
            # carry the page metadata over to every chunk.
            temp_doc=Document(page_content=text,metadata={"page":i,"type":"text"})
            text_chunks=splitter.split_documents([temp_doc])

            # Embed each chunk using CLIP
            for chunk in text_chunks:
                embedding=embed_text(chunk.page_content)
                all_docs.append(chunk)
                all_embeddings.append(embedding)

        # ---- Images ----
        # For each image on the page:
        #   1. convert the embedded PDF image to a PIL image,
        #   2. keep a base64 PNG copy for the multimodal LLM message,
        #   3. create a CLIP embedding for retrieval.
        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                xref = img[0]  # Reference number of the image within the PDF
                base_image = doc.extract_image(xref)  # Image info, including its bytes
                image_bytes = base_image["image"]  # Raw binary data of the image

                # Convert to PIL Image
                pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                print(pil_image)

                # Create unique identifier
                image_id = f"page_{i}_img_{img_index}"

                # Re-encode as PNG in memory and base64 it, so the image can
                # later be sent as text inside an API request.
                buffered = io.BytesIO()
                pil_image.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode()

                # Embed image using CLIP
                embedding = embed_image(pil_image)

                # Placeholder document that stands in for the image in the index
                image_doc = Document(
                    page_content=f"[Image: {image_id}]",
                    metadata={"page": i, "type": "image", "image_id": image_id}
                )

                # Commit all three records only after every step above succeeded,
                # so a failure part-way through cannot leave the stores out of sync.
                image_data_store[image_id] = img_base64
                all_docs.append(image_doc)
                all_embeddings.append(embedding)
            except Exception as e:
                # Best effort: report the bad image and carry on with the rest.
                print(f"Error processing image {img_index} on page {i}: {e}")
                continue
finally:
    doc.close()
    
   

<PIL.Image.Image image mode=RGB size=558x587 at 0x278188ED470>
<PIL.Image.Image image mode=RGB size=563x568 at 0x277FC8AEF90>


In [57]:
# Bytes and Base64
# Bytes are the raw binary data that computers use to store files 
# like images, audio, etc. They are not human-readable and may look
# like random characters if you try to print them.

#Base64 is a way to encode those bytes into a string of readable 
# characters (letters, numbers, +, /, =). 
# This makes it easy to send binary data as text 
# (for example, in JSON, HTML, or APIs).

In [58]:
# Inspect the collected documents (text chunks and image placeholders).
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='EASY SUDOKO'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
 Document(metadata={'page': 1, 'type': 'text'}, page_content='MEDIUM SUDOKO'),
 Document(metadata={'page': 1, 'type': 'image', 'image_id': 'page_1_img_0'}, page_content='[Image: page_1_img_0]')]

In [59]:
# Create unified FAISS vector store with CLIP Embeddings

# Validate before indexing: zip() below would silently drop trailing items on a
# length mismatch while `metadatas` still covers every document, and an empty
# input gives an obscure failure inside FAISS.
if len(all_docs) != len(all_embeddings):
    raise ValueError(
        f"Document/embedding count mismatch: {len(all_docs)} documents vs "
        f"{len(all_embeddings)} embeddings."
    )
if not all_docs:
    raise ValueError("No text or images were extracted from the PDF; nothing to index.")

embeddings_array=np.array(all_embeddings) # Stack the per-item vectors into one array

vector_store=FAISS.from_embeddings(
    text_embeddings=[(d.page_content,emb) for d, emb in zip(all_docs, embeddings_array)],
    # Embeddings are precomputed, so no embedding object is supplied. This is
    # what triggers the `embedding_function` warning in the cell output;
    # presumably queries must then be made by vector, as retrieve_multimodal does.
    embedding=None,
    metadatas=[d.metadata for d in all_docs])
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x2781872f950>

In [60]:
# Initialize the chat model that answers questions over the retrieved text and
# images. The recorded output shows this resolves to a ChatOpenAI instance with
# model_name='gpt-4.1'.
llm=init_chat_model("openai:gpt-4.1")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x00000278189102B0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000027818910180>, root_client=<openai.OpenAI object at 0x000002781872E8B0>, root_async_client=<openai.AsyncOpenAI object at 0x0000027818910640>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [61]:
def retrieve_multimodal(query,k=5):
    """Return the ``k`` indexed documents closest to ``query``.

    The query is embedded as text with CLIP and that vector is used to search
    the shared vector store, which holds both text-chunk and image entries.
    """
    return vector_store.similarity_search_by_vector(embed_text(query), k=k)

In [62]:
def create_multimodal_message(query, retrieved_docs):
    """Build a single HumanMessage mixing text and images for the chat model.

    Message parts, in order: the question, the retrieved text excerpts (if
    any), each retrieved image whose id is present in ``image_data_store``
    (preceded by a page label), and a closing instruction.
    """
    # Partition the retrieved documents by their "type" metadata.
    text_docs = [d for d in retrieved_docs if d.metadata.get("type") == "text"]
    image_docs = [d for d in retrieved_docs if d.metadata.get("type") == "image"]

    parts = [{"type": "text", "text": f"Question: {query}\n\nContext:\n"}]

    if text_docs:
        excerpts = "\n\n".join(
            f"[Page {d.metadata['page']}]: {d.page_content}" for d in text_docs
        )
        parts.append({"type": "text", "text": f"Text excerpts:\n{excerpts}\n"})

    for d in image_docs:
        image_id = d.metadata.get("image_id")
        # Skip images that have no id or whose data was never stored.
        if not image_id or image_id not in image_data_store:
            continue
        parts.append(
            {"type": "text", "text": f"\n[Image from page {d.metadata['page']}]:\n"}
        )
        parts.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                },
            }
        )

    parts.append(
        {
            "type": "text",
            "text": "\n\nPlease answer the question based on the provided text and images.",
        }
    )
    return HumanMessage(content=parts)


In [None]:
def multimodal_pdf_rag_pipeline(query):
    """Run the full multimodal RAG flow for one query and return the answer text.

    Retrieves up to five documents, packs them into one multimodal message,
    sends it to ``llm``, prints a short summary of what was retrieved, and
    returns the content of the model's response.
    """
    context_docs = retrieve_multimodal(query, k=5)
    message = create_multimodal_message(query, context_docs)
    response = llm.invoke([message])

    # Summarize the retrieved context for the reader.
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type != "text":
            print(f"  - Image from page {page}")
            continue
        body = doc.page_content
        # Long chunks are cut to their first 100 characters plus an ellipsis.
        preview = body if len(body) <= 100 else body[:100] + "..."
        print(f"  - Text from page {page}: {preview}")
    print("\n")

    return response.content

In [65]:

if __name__ == "__main__":
    # Queries to run through the pipeline. Alternative examples:
    #   "What does the chart on page 1 show about revenue trends?"
    #   "Summarize the main findings from the document"
    #   "What visual elements are present in the document?"
    queries = ["Complete both the sudoko puzzle in the document."]

    for query in queries:
        # Header: the query followed by a thin rule.
        print(f"\nQuery: {query}", "-" * 50, sep="\n")
        answer = multimodal_pdf_rag_pipeline(query)
        # Footer: the answer followed by a heavy rule.
        print(f"Answer: {answer}", "=" * 70, sep="\n")


Query: Complete both the sudoko puzzle in the document.
--------------------------------------------------

Retrieved 4 documents:
  - Text from page 0: EASY SUDOKO
  - Text from page 1: MEDIUM SUDOKO
  - Image from page 0
  - Image from page 1


Answer: Sure! Let's complete both Sudoku puzzles based on the images you provided.

---

### EASY SUDOKU (Page 0)

Here's the original puzzle:
```
2 6 7 | _ 8 _ | 4 _ _
_ 3 _ | 2 _ 5 | _ _ _
9 _ 4 | _ _ _ | _ _ _
-------------------------
5 _ _ | 7 8 9 | 6 2 _
_ 7 6 | 4 _ _ | 1 _ 5
_ _ _ | _ 1 3 | 7 _ _
-------------------------
7 _ 5 | 8 _ 2 | 4 9 _
_ _ _ | _ _ 5 | _ 2 _
4 2 1 | _ _ 7 | 5 3 _
```

#### Completed Solution:
```
2 6 7 | 1 8 9 | 4 5 3
8 3 1 | 2 4 5 | 9 7 6
9 5 4 | 3 7 6 | 2 1 8
-------------------------
5 1 3 | 7 8 9 | 6 2 4
6 7 6 | 4 2 8 | 1 3 5
4 8 2 | 5 1 3 | 7 6 9
-------------------------
7 4 5 | 8 3 2 | 4 9 1
3 9 8 | 6 5 5 | 8 2 7
4 2 1 | 9 6 7 | 5 3 2
```

---

### MEDIUM SUDOKU (Page 1)

Here's the original puzzle:
```
_