In [1]:
pip install langchain langchain-community langchain-openai langchain-google-genai chromadb pypdf unstructured sentence-transformers streamlit fastapi uvicorn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
# Set in an early cell so the variable is in place before the later cells import
# their libraries. Presumably this stops the Hugging Face `transformers` package
# from loading TensorFlow — TODO confirm against the library's documentation.
os.environ["TRANSFORMERS_NO_TF"] = "1"


In [None]:
import google.generativeai as genai
from PIL import Image

import os

# Read the API key from the environment instead of embedding it in the notebook.
# Falls back to the original empty string when GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
# Module-level model handle used by get_answer_from_visual_context.
model = genai.GenerativeModel('gemini-1.5-pro')

def get_answer_from_visual_context(page_image_path, user_query):
    """Answer a question about a single document page using its image.

    Args:
        page_image_path: Path to an image file of the page; opened with PIL.
        user_query: Question text interpolated into the prompt.

    Returns:
        The ``text`` attribute of the response returned by the module-level
        ``model`` for the prompt and the page image.
    """
    # The prompt instructs the model to look at the visual elements specifically
    prompt = f"""
    You are an expert document analyst. Use the provided image of the document page 
    to answer the question: {user_query}. 
    If there are charts, tables, or diagrams, analyze them carefully.
    """

    # The context manager closes the image's file handle even if the model call fails.
    with Image.open(page_image_path) as img:
        response = model.generate_content([prompt, img])
    return response.text

In [14]:
pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-win_amd64.whl (18.4 MB)
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   -- ------------------------------------- 1.3/18.4 MB 7.4 MB/s eta 0:00:03
   ------ --------------------------------- 3.1/18.4 MB 8.4 MB/s eta 0:00:02
   --------- ------------------------------ 4.2/18.4 MB 7.0 MB/s eta 0:00:03
   ---------- ----------------------------- 4.7/18.4 MB 5.9 MB/s eta 0:00:03
   ------------ --------------------------- 5.8/18.4 MB 5.7 MB/s eta 0:00:03
   ------------- -------------------------- 6.3/18.4 MB 5.2 MB/s eta 0:00:03
   ---------------- ----------------------- 7.6/18.4 MB 5.2 MB/s eta 0:00:03
   -------------------- ------------------- 9.4/18.4 MB 5.7 MB/s eta 0:00:02
   ------------------------- -------------- 11.5/18.4 MB 6.2 MB/s eta 0:00:02
   ----------------------------- ---------- 13.6/18.4 MB 6.5 MB/s eta 0:00:01
   --

In [16]:
import fitz  # PyMuPDF

def pdf_to_images(pdf_path):
    """Render every page of a PDF to a PNG file and return the file names.

    Files are written as ``page_<index>.png`` (relative paths), with the index
    starting at 0.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        List of the written PNG paths, in page order.
    """
    image_paths = []
    doc = fitz.open(pdf_path)
    try:
        for i in range(len(doc)):
            path = f"page_{i}.png"
            doc.load_page(i).get_pixmap().save(path)
            image_paths.append(path)
    finally:
        # Release the document even if rendering or saving a page fails.
        doc.close()
    return image_paths

In [27]:
import chromadb  # imported here so this cell does not depend on a later cell having run

client = chromadb.Client()

# Start from a clean slate: drop any previous "multimodal_rag" collection first.
try:
    client.delete_collection("multimodal_rag")
except Exception:
    # Best-effort cleanup: the delete is expected to fail when the collection does
    # not exist yet. Catching Exception (not a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    pass

collection = client.create_collection("multimodal_rag")


In [None]:
import fitz  # PyMuPDF
import io
from PIL import Image
import chromadb
import google.generativeai as genai

# =========================================================
# 1. Configure Gemini API
# =========================================================
import os

# Read the API key from the environment instead of embedding it in the notebook.
# Falls back to the original empty string when GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Fast multimodal summarization (used by generate_page_summaries)
indexing_model = genai.GenerativeModel("gemini-2.5-flash")

# Reasoning over retrieved pages (used by query_multimodal_rag);
# currently the same model name as the indexing model.
reasoning_model = genai.GenerativeModel("gemini-2.5-flash")

# =========================================================
# 2. PDF → Images (IN MEMORY, NO FILE SYSTEM)
# =========================================================
def pdf_to_images_in_memory(pdf_path, zoom=2):
    """
    Converts PDF pages to PIL Images in memory.
    No disk writes → no Windows permission errors.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        zoom: Scale factor applied to both axes of the render matrix.

    Returns:
        List of PIL images converted to RGB, one per page, in page order.
    """
    images = []
    render_matrix = fitz.Matrix(zoom, zoom)  # same for every page, so build it once

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            png_bytes = page.get_pixmap(matrix=render_matrix).tobytes("png")
            images.append(Image.open(io.BytesIO(png_bytes)).convert("RGB"))
    finally:
        # Release the document even if a page fails to render or decode.
        doc.close()

    return images

# =========================================================
# 3. Generate visual summaries for each page
# =========================================================
def generate_page_summaries(images):
    """Ask the indexing model for a text description of each page image.

    Args:
        images: Iterable of page images; each is sent to the module-level
            ``indexing_model`` together with a fixed description prompt.

    Returns:
        List of response texts, one per image, in input order.
    """
    # The prompt is identical for every page, so it is built once outside the loop.
    prompt = (
        "Describe this document page in detail. "
        "Focus on titles, headers, charts, tables, "
        "and important numeric or textual data."
    )

    summaries = []
    print("🔍 Generating visual summaries for indexing...")

    for idx, img in enumerate(images):
        response = indexing_model.generate_content([prompt, img])
        summaries.append(response.text)

        # idx is the 0-based position of the page in `images`
        print(f"✅ Page {idx} summarized")

    return summaries

# =========================================================
# 4. Create Chroma Vector Store
# =========================================================
def create_vector_store(summaries):
    """Store page summaries in the "multimodal_rag" Chroma collection.

    Each summary is stored under the id ``page_<index>`` with metadata
    ``{"page": <index>}``, where the index is the summary's 0-based position.

    Args:
        summaries: Sequence of summary strings, one per page.

    Returns:
        The Chroma collection holding the summaries.
    """
    client = chromadb.Client()
    collection = client.create_collection(
        name="multimodal_rag",
        get_or_create=True
    )

    summaries = list(summaries)
    if not summaries:
        # Nothing to index; skip the write, which would otherwise receive empty lists.
        return collection

    page_numbers = range(len(summaries))
    # One batched write instead of one call per page. Because get_or_create may
    # return a collection from an earlier run, upsert is used so entries with the
    # same ids are overwritten; Chroma's documentation states that `add` ignores
    # ids that already exist, which would leave stale summaries in place.
    collection.upsert(
        documents=summaries,
        metadatas=[{"page": i} for i in page_numbers],
        ids=[f"page_{i}" for i in page_numbers],
    )

    return collection


# =========================================================
# 5. Multimodal RAG Query
# =========================================================
def query_multimodal_rag(query, collection, images, top_k=2):
    """Retrieve the best-matching pages for a question and answer from their images.

    Args:
        query: Question text; used for retrieval and embedded in the prompt.
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            whose result metadata entries carry a ``"page"`` key.
        images: Page images indexable by the ``"page"`` values.
        top_k: Number of results requested from the collection.

    Returns:
        Tuple ``(answer_text, page_indices)``: the response text from the
        module-level ``reasoning_model`` and the retrieved page indices.
    """
    hits = collection.query(query_texts=[query], n_results=top_k)

    # Index [0] selects the results belonging to the single query text sent above.
    page_indices = []
    for meta in hits["metadatas"][0]:
        page_indices.append(meta["page"])

    retrieved_images = []
    for page_no in page_indices:
        retrieved_images.append(images[page_no])

    context_text = "\n".join(hits["documents"][0])

    prompt = f"""
User Question:
{query}

Context from Retrieved Page Summaries:
{context_text}

Instructions:
Use the attached document page images to answer precisely.
Carefully interpret any charts, tables, or numerical values.
"""

    # The model receives the prompt first, followed by every retrieved page image.
    model_input = [prompt]
    model_input.extend(retrieved_images)
    response = reasoning_model.generate_content(model_input)

    answer = response.text
    return answer, page_indices

# =========================================================
# 6. MAIN EXECUTION
# =========================================================
if __name__ == "__main__":
    # End-to-end demo: render the PDF, summarize each page, index the summaries,
    # then answer one question. The names assigned here are module-level, so they
    # stay available after this code runs.
    PDF_FILE = "your_report.pdf"  # <-- Your PDF file

    # Step 1: Convert PDF → Images
    images = pdf_to_images_in_memory(PDF_FILE)

    # Step 2: Generate page summaries
    summaries = generate_page_summaries(images)

    # Step 3: Create vector database
    vector_db = create_vector_store(summaries)

    # Step 4: Ask a question
    user_query = "What does the chart say about the year-over-year revenue growth?"

    # top_k is left at its default, so the function's default number of pages is retrieved
    answer, pages_used = query_multimodal_rag(
        user_query,
        vector_db,
        images
    )

    print("\n================ AI ANSWER ================\n")
    print(answer)

    # Page indices here are 0-based positions in `images`
    print("\n============ PAGES USED ============\n")
    print(pages_used)


🔍 Generating visual summaries for indexing...
✅ Page 0 summarized
✅ Page 1 summarized
✅ Page 2 summarized


The chart (presented as a table) shows that year-over-year revenue growth accelerated:
*   In **2023**, revenue grew by **+30%**.
*   In **2024**, revenue grew by **+35%**, which was the highest increase and showed a strong upward trend.


[1, 2]


In [18]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.7-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.7-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   -------------------------------- ------- 1.6/2.0 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 7.8 MB/s  0:00:00
Installing collected packages: reportlab
Successfully installed reportlab-4.4.7
Note: you may need to restart the kernel to use updated packages.


In [19]:
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch

def create_dummy_pdf(filename="your_report.pdf"):
    """Write a three-page sample revenue report PDF.

    Page 1 is a title page, page 2 is a year/revenue/growth table drawn as
    text columns, and page 3 is a bulleted list of key insights.

    Args:
        filename: Output path passed to ``canvas.Canvas``.

    Returns:
        None. The PDF is saved to ``filename`` and a confirmation is printed.
    """
    c = canvas.Canvas(filename, pagesize=A4)
    width, height = A4
    left = 1 * inch  # common left margin for all text

    # -------- Page 1 --------
    c.setFont("Helvetica-Bold", 18)
    c.drawString(left, height - 1 * inch, "Annual Revenue Report 2024")

    c.setFont("Helvetica", 12)
    c.drawString(left, height - 1.6 * inch, "Company: ABC Technologies")
    c.drawString(left, height - 2.0 * inch, "Prepared by: Finance Department")

    c.setFont("Helvetica", 11)
    c.drawString(left, height - 2.8 * inch,
                 "This report summarizes the year-over-year revenue growth.")

    c.drawString(left, height - 3.2 * inch,
                 "Overall revenue increased significantly compared to 2023.")

    c.showPage()

    # -------- Page 2 (Chart/Table Simulation) --------
    c.setFont("Helvetica-Bold", 16)
    c.drawString(left, height - 1 * inch, "Year-over-Year Revenue Growth")

    # Table header: three columns at fixed x positions
    c.setFont("Helvetica-Bold", 12)
    c.drawString(1 * inch, height - 1.8 * inch, "Year")
    c.drawString(3 * inch, height - 1.8 * inch, "Revenue (USD)")
    c.drawString(5 * inch, height - 1.8 * inch, "Growth")

    c.setFont("Helvetica", 12)
    rows = [
        ("2022", "$2.0M", "-"),
        ("2023", "$2.6M", "+30%"),
        ("2024", "$3.5M", "+35%"),
    ]

    # Each row is drawn half an inch below the previous one
    y = height - 2.4 * inch
    for year, revenue, growth in rows:
        c.drawString(1 * inch, y, year)
        c.drawString(3 * inch, y, revenue)
        c.drawString(5 * inch, y, growth)
        y -= 0.5 * inch

    c.drawString(
        left, y - 0.5 * inch,
        "The chart shows a strong upward trend with 35% growth in 2024."
    )

    c.showPage()

    # -------- Page 3 --------
    c.setFont("Helvetica-Bold", 16)
    c.drawString(left, height - 1 * inch, "Key Insights")

    c.setFont("Helvetica", 12)
    # Bullet characters restored from mis-decoded text ("‚Ä¢" was a garbled "•").
    c.drawString(left, height - 1.8 * inch,
                 "• Revenue growth accelerated year-over-year.")
    c.drawString(left, height - 2.2 * inch,
                 "• 2024 saw the highest increase at 35%.")
    c.drawString(left, height - 2.6 * inch,
                 "• Growth driven by new enterprise clients.")

    c.save()
    print(f"Dummy PDF created: {filename}")

# Generate the sample report with the default file name ("your_report.pdf").
if __name__ == "__main__":
    create_dummy_pdf()


Dummy PDF created: your_report.pdf


In [30]:
import streamlit as st
import os
import fitz
import google.generativeai as genai
from PIL import Image
import chromadb

# --- CONFIGURATION ---
st.set_page_config(page_title="Gemini Multimodal RAG", layout="wide")
# Emoji restored from mis-decoded text ("üìÑ" was a garbled "📄").
st.title("📄 Smart Research Assistant")
st.subheader("Analyze Text, Tables, and Charts using Gemini 1.5 Pro")

# Sidebar for API Key
with st.sidebar:
    api_key = st.text_input("Enter Google AI Studio API Key", type="password")
    if api_key:
        # The models are only created once a key has been entered, so any code
        # using them must first check that `api_key` is non-empty.
        genai.configure(api_key=api_key)
        indexing_model = genai.GenerativeModel('gemini-1.5-flash')
        reasoning_model = genai.GenerativeModel('gemini-1.5-pro')

# --- HELPER FUNCTIONS ---
def get_pdf_images(uploaded_file):
    """Render each page of an uploaded PDF to a PNG under ``temp_images/``.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF bytes.

    Returns:
        List of written paths (``temp_images/page_<index>.png``, index from 0),
        in page order.
    """
    image_paths = []
    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
    try:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs("temp_images", exist_ok=True)

        render_matrix = fitz.Matrix(2, 2)  # 2x scale on both axes, same for every page
        for i in range(len(doc)):
            path = f"temp_images/page_{i}.png"
            doc.load_page(i).get_pixmap(matrix=render_matrix).save(path)
            image_paths.append(path)
    finally:
        # Release the document even if rendering or saving a page fails.
        doc.close()
    return image_paths

# --- UI LOGIC ---
import hashlib

uploaded_file = st.file_uploader("Upload a complex PDF (Financial reports, Research papers)", type="pdf")

if uploaded_file and api_key:
    # Identify the current upload so that choosing a different file triggers
    # re-indexing instead of silently reusing the previous document's index.
    file_key = f"{uploaded_file.name}:{uploaded_file.size}"

    if st.session_state.get("indexed_file") != file_key:
        with st.status("Processing Document..."):
            # 1. Convert PDF to Images
            img_paths = get_pdf_images(uploaded_file)

            # 2. Indexing (Generating Summaries)
            summaries = []
            for path in img_paths:
                # Context manager closes each page image once it has been sent.
                with Image.open(path) as img:
                    res = indexing_model.generate_content(["Summarize this page accurately for a search engine.", img])
                summaries.append(res.text)

            # 3. Store in ChromaDB
            client = chromadb.Client()
            # A hash-derived name avoids putting raw filename characters (spaces,
            # punctuation) into the collection name; get_or_create avoids failing
            # when a collection with this name already exists.
            collection_name = "col_" + hashlib.sha256(file_key.encode("utf-8")).hexdigest()[:16]
            collection = client.get_or_create_collection(name=collection_name)
            if summaries:
                # Single batched write; upsert overwrites entries left by an earlier run.
                collection.upsert(
                    documents=summaries,
                    metadatas=[{"path": p} for p in img_paths],
                    ids=[str(i) for i in range(len(summaries))],
                )

            st.session_state.vector_db = collection
            st.session_state.indexed_file = file_key
            st.success("Document Indexed!")

    # --- CHAT INTERFACE ---
    query = st.chat_input("Ask about a chart, table, or specific data...")

    if query:
        with st.chat_message("user"):
            st.write(query)

        with st.chat_message("assistant"):
            # Retrieval: fetch the single best-matching page summary
            results = st.session_state.vector_db.query(query_texts=[query], n_results=1)
            matches = results['metadatas'][0]

            if not matches:
                # Guard against an empty index instead of raising IndexError below.
                st.warning("No indexed pages matched the question.")
            else:
                best_page_path = matches[0]['path']

                # Generation
                with Image.open(best_page_path) as img_to_analyze:
                    prompt = f"Using the provided document image, answer: {query}"
                    response = reasoning_model.generate_content([prompt, img_to_analyze])

                    # Display result
                    st.write(response.text)
                    st.divider()
                    st.caption("Reference Page Used:")
                    st.image(img_to_analyze, width=400)

elif not api_key:
    st.info("Please enter your API key in the sidebar to begin.")

2025-12-31 17:27:01.808 
  command:

    streamlit run c:\Users\NIKHIL\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-12-31 17:27:01.813 Session state does not function when running a script without `streamlit run`
