# Math-RAG: Interactive Research Paper Analysis
Navigating mathematics research papers often leads to a "citation rabbit hole." When a proof relies on a specific Lemma or Theorem from a cited work, researchers must manually find, download, and search through external papers—a process that breaks cognitive flow.

Math-RAG is an intelligent Retrieval-Augmented Generation solution designed to make the citation network interactive. By indexing both the primary paper and its references, it provides immediate access to the technical details you need, exactly when you need them.

In [None]:
# ==========================================
# 1. Standard Library Imports
# ==========================================
import json
import os
import re
from pathlib import Path
from typing import Optional, List

# ==========================================
# 2. Environment & Network Imports
# ==========================================
import requests
from httpx import get
from dotenv import load_dotenv

# ==========================================
# 3. Document Processing (PDFs & Parsing)
# ==========================================
import fitz  # PyMuPDF for basic PDF manipulation
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# ==========================================
# 4. AI & LangChain Framework
# ==========================================
from pydantic import BaseModel
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Vector Store & Embeddings
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# ==========================================
# 5. Specialized Research & Academic Tools
# ==========================================
import arxiv
from rapidfuzz import fuzz
from unpywall.utils import UnpywallCredentials
from unpywall import Unpywall

# ==========================================
# 6. Configuration & Initialization
# ==========================================

# Load environment variables (API keys, etc.) from a local .env file, if present.
load_dotenv()

# Set up Unpywall for open-access paper retrieval.
# NOTE(review): the contact e-mail is hard-coded; consider reading it from the
# environment so the notebook can be shared without editing this line.
UnpywallCredentials('sarvagya07jain@gmail.com')

# Constants
# Base URL of the Crossref REST API used for DOI and reference lookups.
CROSSREF_API = "https://api.crossref.org"

# Directory Setup
# NOTE(review): PROJECT_DIR is an absolute, machine-specific path, and the
# chdir below raises if that directory does not exist. The relative paths
# used further down ("./pdfs", "dbv2/chroma_db") depend on this chdir.
PROJECT_DIR = Path("/Users/sarvagyajain/Downloads/Programming/math-papers-rag")
os.chdir(PROJECT_DIR)

In [None]:
def extract_references_from_pdf(pdf_path):
    """
    Return the text that follows the bibliography header of a PDF.

    The text of every page is concatenated and searched (case-insensitively)
    for a line consisting solely of "References" or "Bibliography"; everything
    after the first such line is returned.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text following the matched header.

    Raises:
        ValueError: If neither header is found on a line of its own.
    """
    # Open through a context manager so the document handle is released even
    # if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)

    # The header must sit on a line by itself, which avoids matching the word
    # "references" in running prose.
    match = re.search(
        r"\nreferences\n|\nbibliography\n",
        text,
        flags=re.IGNORECASE
    )

    if not match:
        raise ValueError("References section not found - check if the PDF uses a different header.")

    # Keep only what comes after the header line.
    return text[match.end():]

def split_references(ref_text):
    """
    Break a bibliography text block into individual citation strings.

    Three label styles are tried in order: "[1]", "1." and "[SJ24]". The first
    style that splits the text into more than three parts is used, and parts
    of 50 characters or fewer (after stripping) are discarded as noise.
    If no style qualifies, the text is split on newlines and only lines longer
    than 80 characters are kept.
    """
    label_styles = (
        r"\n\s*\[\d+\]\s*",             # numeric brackets: [1], [2], ...
        r"\n\s*\d+\.\s*",               # numbered list: 1., 2., ...
        r"\n\s*\[[A-Z]{2,}\d{2}\]\s*",  # author/year keys: [SJ24]
    )

    for style in label_styles:
        pieces = re.split(style, ref_text)
        if len(pieces) <= 3:
            # Too few pieces: this label style is probably not the one in use.
            continue

        citations = []
        for piece in pieces:
            cleaned = piece.strip()
            if len(cleaned) > 50:
                citations.append(cleaned)
        return citations

    # No label style matched often enough: fall back to long single lines.
    long_lines = []
    for line in ref_text.split("\n"):
        cleaned = line.strip()
        if len(cleaned) > 80:
            long_lines.append(cleaned)
    return long_lines

def normalize_reference_text(ref):
    """
    Flatten a raw reference onto one line and trim stray punctuation.

    Every run of whitespace (newlines and tabs included) becomes a single
    space, then leading/trailing spaces, periods, semicolons and commas are
    stripped. Returns None when fewer than six words remain, since such
    fragments are treated as page artifacts rather than citations.
    """
    single_line = re.sub(r"\s+", " ", ref).strip(" .;,")

    word_total = len(single_line.split())
    return single_line if word_total >= 6 else None

In [None]:
def manual_title_extraction(reference):
    """
    Heuristically pick the paper title out of a raw bibliography entry.

    The entry is split on ". ", ", " and "; " into candidate segments. Each
    segment of at least three words is scored: a plausible title length,
    mathematical vocabulary and a leading "On/A/The/An" add points, while
    publication metadata (journal abbreviations, "vol", "pp", ...) and
    author-style initials subtract points.

    Args:
        reference: Raw reference string; may contain newlines and a leading
            citation number such as "[12]" or "12.".

    Returns:
        The best-scoring segment, or None if the input is empty or no segment
        reaches a positive score.
    """
    if not reference:
        return None

    # 1. PRE-PROCESSING
    # Join lines and undo hyphenated line breaks ("mag- nitude" -> "magnitude").
    ref = reference.replace('\n', ' ')
    ref = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', ref)

    # Drop a leading citation number like "[12]" or "12".
    ref = re.sub(r'^\[?\d+\]?\s*', '', ref)

    # 2. CANDIDATES: split on periods, commas and semicolons followed by space.
    candidates = re.split(r'[.,;]\s+', ref)

    best_candidate = None
    highest_score = -100

    # Markers suggesting a segment is publication metadata, not a title.
    # They are matched at the START of a word so that abbreviations still
    # cover their long forms ("proc" -> "Proceedings") without firing in the
    # middle of ordinary words: a plain substring test made "ann" match
    # "Riemann", which is itself one of the positive math markers below.
    prefix_markers = (
        'proc', 'vol', 'pages', 'journal', 'theory', 'press',
        'university', 'arxiv', 'preprint', 'appear', 'edited', 'eds', 'acta',
        'annals', 'math', 'soc', 'inst', 'conf', 'berkeley', 'cambridge', 'ann'
    )
    # Two-letter markers are too short to be safe even as prefixes
    # ("no" -> "notes", "norm"), so they only count as whole words.
    whole_word_markers = ('no', 'pp')
    forbidden_re = re.compile(
        r'\b(?:' + '|'.join(prefix_markers) + r')'
        r'|\b(?:' + '|'.join(whole_word_markers) + r')\b'
    )

    # Math vocabulary that raises confidence. Substring matching is kept here
    # so that plurals and inflections ("primes", "functions") also count.
    math_markers = {
        'integers', 'prime', 'factors', 'short', 'intervals', 'smooth',
        'analytic', 'riemann', 'hypothesis', 'elliptic', 'curves', 'asymptotic',
        'expansion', 'distribution', 'function'
    }

    for segment in candidates:
        segment = segment.strip().strip(".,()[]")
        word_count = len(segment.split())
        lower_seg = segment.lower()

        # One- and two-word fragments are never treated as titles.
        if word_count < 3:
            continue

        score = 0

        # Length: typical titles are 5-18 words; 3-4 words is possible but weak.
        if 5 <= word_count <= 18:
            score += 25
        elif 3 <= word_count < 5:
            score += 5

        # Content: reward each math marker found.
        matches = sum(1 for m in math_markers if m in lower_seg)
        score += (matches * 8)

        # Content: a single flat penalty if any metadata marker is present.
        if forbidden_re.search(lower_seg):
            score -= 40

        # Penalize author-like segments ("A. Balog", "H. W. Lenstra, Jr.").
        if re.search(r'\b[A-Z]\.\s', segment) or lower_seg.endswith(' jr'):
            score -= 30

        # Titles often start with "On ", "A ", "The ", "An ".
        if re.match(r'^(on|a|the|an)\s', lower_seg):
            score += 15

        # Strict ">" keeps the earliest segment on ties.
        if score > highest_score:
            highest_score = score
            best_candidate = segment

    # Only accept a candidate that scored positively overall.
    return best_candidate.strip() if best_candidate and highest_score > 0 else None

def title_cleanup_manual(raw_title):
    """
    Turn a metadata title into a lowercase, filename-safe string.

    Steps: drop <mml:math>...</mml:math> blocks together with their content,
    remove the characters \\ / : * ? " < > | (illegal in file names on common
    platforms), collapse all whitespace runs to one space, trim, lowercase.
    Returns None for an empty or missing title.
    """
    if not raw_title:
        return None

    # MathML fragments show up in titles returned by metadata services.
    without_mathml = re.sub(
        r'<mml:math[^>]*>.*?<\/mml:math>', '', raw_title, flags=re.DOTALL
    )

    # Strip characters that cannot appear in a file name.
    filename_safe = re.sub(r'[\\/:*?"<>|]', '', without_mathml)

    # Newlines are whitespace too, so a single collapse handles them as well.
    single_spaced = re.sub(r'\s+', ' ', filename_safe).strip()

    return single_spaced.lower()

In [None]:
# Optional: LLM-based cleanup of extracted references and extraction of title/author metadata.

# Load environment variables again.
# NOTE(review): presumably so this optional cell can be run on its own — confirm.
load_dotenv()

# Module-level Gemini chat model; temperature 0 keeps sampling randomness to a
# minimum and max_output_tokens=None leaves the output length uncapped here.
llm = ChatGoogleGenerativeAI(
    model="gemini-3-pro-preview",
    temperature=0,
    max_output_tokens=None,
)
# Schema handed to the JSON output parser of the reference-cleaning chain:
# one key, "references", holding the cleaned bibliography entries as strings.
class CleanReferences(BaseModel):
    references: List[str]

# Chat prompt for cleaning raw bibliography entries.
# The system message restricts the model to formatting changes and asks it to
# drop non-references; the user message carries the entries through the
# {refs} placeholder and requests a JSON object keyed by 'references'.
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are cleaning mathematical bibliography entries.\n"
     "You MUST NOT invent references.\n"
     "You MUST preserve all factual content.\n"
     "You may only normalize formatting, punctuation, and spacing.\n"
     "If an entry is not a real bibliographic reference, REMOVE it.\n"
     "Return references in MathSciNet-like style."
    ),
    ("user",
     "Here are raw references extracted from a math paper:\n\n"
     "{refs}\n\n"
     "Return a JSON object with key 'references'."
    )
])

def llm_clean_references_langchain(raw_refs, llm):
    """
    Clean a list of raw reference strings with the given chat model.

    The references are sent as a "- "-prefixed bullet list through the
    module-level cleaning prompt; the model's JSON reply is parsed and the
    value stored under its 'references' key is returned.
    """
    pipeline = prompt | llm | JsonOutputParser(pydantic_object=CleanReferences)

    # One bullet per reference, in the order given.
    bullet_list = "\n".join(f"- {r}" for r in raw_refs)
    parsed = pipeline.invoke({"refs": bullet_list})

    return parsed["references"]


# Schema handed to the JSON output parser of the metadata-extraction chain:
# the title and author list of a single reference, either of which may be null.
# NOTE(review): the fields are Optional but declare no default; under
# pydantic v2 that means "required but nullable" — confirm this is intended.
class ReferenceMetadata(BaseModel):
    title: Optional[str]
    authors: Optional[List[str]]

# Chat prompt for extracting title/author metadata from one reference.
# The system message tells the model to return null instead of guessing; the
# user message carries the reference through the {ref} placeholder.
metadata_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You extract bibliographic metadata from mathematical references.\n"
     "You MUST NOT invent missing information.\n"
     "If information is unclear, return null.\n"
     "Do NOT guess titles or authors.\n"
     "Return structured JSON only."
    ),
    ("user",
     "Reference:\n{ref}\n\n"
     "Extract metadata using the provided schema."
    )
])

def llm_extract_reference_metadata(ref, llm):
    """
    Extract title/author metadata from one reference with the given chat model.

    Runs the module-level metadata prompt through the model and returns the
    parsed JSON produced by the output parser.
    """
    json_parser = JsonOutputParser(pydantic_object=ReferenceMetadata)
    pipeline = metadata_prompt | llm | json_parser

    payload = {"ref": ref}
    return pipeline.invoke(payload)


In [None]:
def search_arxiv_pdf(title, path="./pdfs"):
    """
    Search arXiv for a paper by title and download the best confident match.

    The top five search results are compared with the requested title using a
    case-insensitive fuzzy ratio; only results scoring strictly above 90 are
    considered, and the highest-scoring one is downloaded.

    Args:
        title: Title to search for.
        path: Download directory, relative to PROJECT_DIR.

    Returns:
        The value returned by the arxiv download call (the saved file's path)
        when a match was downloaded, otherwise None.
    """
    # 1. Make sure the download directory exists.
    full_path = PROJECT_DIR / path
    full_path.mkdir(parents=True, exist_ok=True)

    # 2. Query arXiv; several results are fetched because titles can be similar.
    client = arxiv.Client()
    search = arxiv.Search(query=f"{title}", max_results=5)

    best_score = 0
    best_pdf = None
    wanted = title.lower()

    # 3. Keep the highest-scoring result above the confidence threshold.
    for result in client.results(search):
        score = fuzz.ratio(wanted, result.title.lower())
        if score > best_score and score > 90:
            best_score = score
            best_pdf = result

    if best_pdf is None:
        print(f"No high-confidence match found for: {title}")
        return None

    # 4. Build a filesystem-safe file name. If cleaning leaves nothing (e.g. a
    # title made only of stripped characters), fall back to the arXiv id so
    # the file is not saved as ".pdf"; old-style ids contain "/".
    stem = title_cleanup_manual(best_pdf.title) or best_pdf.get_short_id().replace("/", "_")
    safe_filename = f"{stem}.pdf"

    saved_path = best_pdf.download_pdf(dirpath=str(full_path), filename=safe_filename)
    print(f"Successfully downloaded: {safe_filename} (Score: {best_score})")
    return saved_path

In [None]:
def download_pdf(url, filename, path="./pdfs"):
    """
    Download a PDF from a direct URL (useful for Crossref/Unpywall links).

    Args:
        url: Direct link to the PDF.
        filename: Name to save the file under.
        path: Target directory, relative to PROJECT_DIR.

    Returns:
        The saved file's path as a string, or None if the request failed,
        returned a non-200 status, or did not deliver a PDF.
    """
    # 1. Resolve target directory
    full_path = PROJECT_DIR / path
    full_path.mkdir(parents=True, exist_ok=True)
    target_file = full_path / filename

    try:
        # 2. Fetch the content with a 20-second timeout
        r = requests.get(url, timeout=20)

        if r.status_code != 200:
            print(f"Failed to download. Status code: {r.status_code}")
            return None

        # 3. "PDF" links frequently answer 200 with an HTML landing or login
        # page. Saving that under a .pdf name would break later parsing, so
        # require the PDF signature near the start of the body.
        if b"%PDF" not in r.content[:1024]:
            print(f"Failed to download. Response from {url} is not a PDF.")
            return None

        # 4. Write binary content to file
        with open(target_file, "wb") as f:
            f.write(r.content)

        return str(target_file)

    # Deliberately broad: this helper is best-effort and callers rely on it
    # reporting failure through None rather than raising.
    except Exception as e:
        print(f"An error occurred during download: {e}")
        return None

In [None]:
def find_doi_by_title(title, rows=3):
    """
    Look up the most likely DOI for a paper title via the Crossref API.

    The title is sent as a `query.title` search limited to `rows` results and
    the DOI of the first (top-ranked) result is returned. Returns None when
    there are no results or when the request or parsing fails; failures are
    printed, not raised.
    """
    query = {"query.title": title, "rows": rows}

    try:
        response = requests.get(f"{CROSSREF_API}/works", params=query, timeout=10)
        # Turn 4xx/5xx answers into exceptions handled below.
        response.raise_for_status()

        hits = response.json()["message"]["items"]

        # Only the first hit is used; the order is whatever Crossref returned.
        return hits[0].get("DOI") if hits else None

    except Exception as e:
        print(f"Error finding DOI for '{title}': {e}")
        return None

def get_title_from_doi(doi):
    """
    Reverse lookup: fetch the title Crossref records for a DOI.

    Args:
        doi: DOI string of the work.

    Returns:
        The first entry of the record's title list, or None if the DOI is
        empty, the request does not return 200, the record has no title, or
        an error occurs (errors are printed, not raised).
    """
    # Without a DOI there is nothing to look up; skip the request that would
    # otherwise be sent to ".../works/None".
    if not doi:
        return None

    # Use the shared base-URL constant, like the other Crossref helpers.
    url = f"{CROSSREF_API}/works/{doi}"

    try:
        r = requests.get(url, timeout=10)

        if r.status_code != 200:
            return None

        data = r.json()["message"]

        # Crossref returns 'title' as a list of strings.
        titles = data.get("title", [])

        return titles[0] if titles else None

    except Exception as e:
        print(f"Error retrieving title for DOI {doi}: {e}")
        return None

In [None]:
def download_cited_dois(doi):
    """
    Download the works cited by a paper, using its Crossref reference list.

    For each reference that carries a DOI, an open-access PDF link is requested
    from Unpywall and, if one exists, downloaded. References without a DOI but
    with a title are searched on arXiv instead.

    Args:
        doi: DOI of the citing (parent) paper.

    Raises:
        requests.HTTPError: If the parent paper's Crossref record cannot be
            fetched. Failures for individual references are printed and
            skipped.
    """
    # 1. Fetch metadata for the parent paper from Crossref
    r = requests.get(
        f"{CROSSREF_API}/works/{doi}",
        timeout=10
    )
    r.raise_for_status()

    references = r.json()["message"].get("reference", [])

    for ref in references:
        ref_doi = ref.get("DOI")
        # Journal articles carry their title under 'article-title';
        # 'volume-title' is the book/volume name. Reading only the latter
        # skipped the arXiv fallback for most article references.
        ref_title = ref.get("article-title") or ref.get("volume-title")

        # One bad reference must not abort the remaining downloads.
        try:
            # CASE A: The citation has a DOI (most reliable)
            if ref_doi:
                url = Unpywall.get_pdf_link(doi=ref_doi)
                if not url:
                    # No open-access copy: nothing to download, so also skip
                    # the title lookup that would only name the file.
                    continue

                raw_title = get_title_from_doi(ref_doi)
                title = title_cleanup_manual(raw_title) if raw_title else None

                if title and title != "untitled":
                    filename = f"{title}.pdf"
                else:
                    # No usable title: a sanitized DOI keeps file names unique.
                    filename = f"{ref_doi.replace('/', '_')}.pdf"

                print(f"Downloading cited paper via Unpywall: {filename}")
                download_pdf(url, filename)

            # CASE B: No DOI, but we have a title (fallback to arXiv)
            elif ref_title:
                print(f"No DOI for citation. Searching arXiv for: {ref_title}")
                search_arxiv_pdf(ref_title)

        except Exception as e:
            print(f"Error downloading cited reference {ref_doi or ref_title}: {e}")

In [None]:
def _download_references_from_pdf(pdf_path):
    """Parse a local PDF's bibliography and search arXiv for each cited title."""
    ref_text = extract_references_from_pdf(pdf_path)
    raw_references = split_references(ref_text)

    # Normalize, then drop entries rejected as noise (None).
    final_references = [r for r in map(normalize_reference_text, raw_references) if r]

    # Extract each title once and keep only the successful extractions.
    titles = [t for t in map(manual_title_extraction, final_references) if t]

    for raw_title in titles:
        try:
            search_arxiv_pdf(raw_title)
        except Exception as e:
            print(f"Error downloading reference from arxiv: {raw_title}, Error: {e}")


def _download_paper_and_citations(title):
    """Resolve a title to a DOI, download the paper and its citations.

    Returns False if the DOI lookup itself raised (pipeline aborted),
    True otherwise.
    """
    # Step 2a: Locate the DOI for the paper
    try:
        ref_doi = find_doi_by_title(title)
    except Exception as e:
        print(f"Error finding DOI for title: {title}, Error: {e}")
        return False

    if not ref_doi:
        # Without a DOI neither Unpywall nor the Crossref citation list can be
        # used; the only remaining source is an arXiv search by title.
        print(f"No DOI found for title: {title}. Attempting arXiv search.")
        try:
            search_arxiv_pdf(title)
        except Exception as e:
            print(f"Error downloading main paper PDF from arXiv: {title}, Error: {e}")
        return True

    # Step 2b: Download the primary paper itself.
    # Initialised up front so the fallback below never sees unbound names.
    url = None
    clean_title = None
    saved_path = None
    try:
        url = Unpywall.get_pdf_link(doi=ref_doi)
        raw_title = get_title_from_doi(ref_doi)
        clean_title = title_cleanup_manual(raw_title) if raw_title else None

        if url:
            # Prefer the cleaned title as file name, else a sanitized DOI.
            stem = clean_title or ref_doi.replace('/', '_')
            saved_path = download_pdf(url, f"{stem}.pdf")
    except Exception as e:
        print(f"Error downloading main paper PDF: {title}, DOI: {ref_doi}, Error: {e}")

    # Fall back to arXiv when nothing was saved (no link, or download failed).
    if not saved_path:
        if not url:
            print(f"No Unpywall link found for DOI: {ref_doi}. Attempting arXiv search.")
        try:
            # Search with a real title, never a placeholder.
            search_arxiv_pdf(clean_title or title)
        except Exception as e:
            print(f"Error downloading main paper PDF from arXiv: {title}, Error: {e}")

    # Step 2c: Download the citations listed in the paper's metadata
    try:
        download_cited_dois(ref_doi)
    except Exception as e:
        print(f"Error downloading all cited DOIs for DOI: {ref_doi}, Error: {e}")

    return True


def full_reference_download(title=None, pdf_path=None):
    """
    The main pipeline for gathering research materials.

    Either or both of the following paths run, depending on the arguments:
    1. pdf_path: parse a LOCAL PDF's bibliography and search arXiv for each
       cited title.
    2. title: find the paper's DOI, download the paper itself and crawl the
       citations listed in its Crossref record.

    Args:
        title: Title of the primary paper (optional).
        pdf_path: Path to a local copy of the primary paper (optional).

    Raises:
        ValueError: If neither argument is given. This is checked before any
            work is done and before any progress message is printed.
        Exception: Errors from reading or parsing the local PDF propagate.
    """
    if not title and not pdf_path:
        raise ValueError("Must provide either a 'title' or a 'pdf_path' to initiate the pipeline.")

    # PATH 1: Processing a Local PDF File
    if pdf_path:
        _download_references_from_pdf(pdf_path)

    # PATH 2: Discovery via Paper Title
    if title and not _download_paper_and_citations(title):
        return  # DOI lookup failed; the error has already been reported

    print("‚úÖ Completed full reference download pipeline")

In [None]:
# Run both paths for the same paper: the local PDF's bibliography is parsed,
# and the title is used for the DOI/citation crawl. The PDF path is relative
# to the current working directory.
full_reference_download(title = "smooth numbers in short intervals", pdf_path="./pdfs/smooth numbers in short intervals.pdf")

In [None]:
def partition_documents(input_dir: str):
    """
    Partition every PDF in a directory into unstructured elements.

    Args:
        input_dir: Directory that is searched (non-recursively) for "*.pdf".

    Returns:
        A single list with the elements of all PDFs that could be partitioned,
        in file-name order. Files that fail to partition are reported and
        skipped.
    """
    input_dir = Path(input_dir)
    all_elements = []

    # sorted(): glob order depends on the filesystem; a fixed order makes
    # repeated runs produce the same element sequence.
    for pdf_file in sorted(input_dir.glob("*.pdf")):
        print(f"Processing: {pdf_file.name}...")

        # A single unreadable file (e.g. a non-PDF saved under a .pdf name)
        # must not abort the whole batch.
        try:
            elements = partition_pdf(
                filename=str(pdf_file),
                # 'hi_res' is the layout-aware strategy
                strategy="hi_res",
                # NOTE(review): this looks like a chunking option; confirm it
                # has any effect when no chunking strategy is passed here.
                combine_text_under_n_chars=500,
                # Images are not extracted at this stage
                extract_images_in_pdf=False
            )
        except Exception as e:
            print(f"Skipping {pdf_file.name}: could not be partitioned ({e})")
            continue

        all_elements.extend(elements)

    print(f"Total elements captured: {len(all_elements)}")
    return all_elements

In [None]:
def create_chunks_by_title(elements):
    """Group partitioned elements into chunks with the title-based chunker."""
    print("üî® Creating smart chunks...")

    chunking_options = {
        "max_characters": 3000,             # hard upper bound per chunk
        "new_after_n_chars": 2400,          # soft limit: prefer a new chunk past this
        "combine_text_under_n_chars": 500,  # merge very small pieces with neighbours
    }
    title_chunks = chunk_by_title(elements, **chunking_options)

    print(f"‚úÖ Created {len(title_chunks)} chunks")
    return title_chunks

In [None]:
def separate_content_types(chunk):
    """
    Split a chunk into its text, table and image content.

    Args:
        chunk: Object with a `.text` attribute and, optionally,
            `.metadata.orig_elements` listing the elements it was built from.
            Elements are classified by their class name ('Table' / 'Image').

    Returns:
        dict with keys:
            'text'   - the chunk's text,
            'tables' - one entry per table: its HTML if available, else its text,
            'images' - base64 payloads of images that actually carry one,
            'types'  - content kinds present, in first-seen order, starting
                       with 'text'.
    """
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text']
    }

    def _note_type(kind):
        # Keep 'types' duplicate-free and in a deterministic order.
        if kind not in content_data['types']:
            content_data['types'].append(kind)

    # Metadata objects may report an unset field as None instead of raising
    # AttributeError, so every lookup checks the value, not just presence.
    chunk_meta = getattr(chunk, 'metadata', None)
    orig_elements = getattr(chunk_meta, 'orig_elements', None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_meta = getattr(element, 'metadata', None)

        # Handle tables: prefer the HTML rendering, fall back to plain text.
        if element_type == 'Table':
            table_html = getattr(element_meta, 'text_as_html', None) or element.text
            _note_type('table')
            content_data['tables'].append(table_html)

        # Handle images: only keep those with an actual payload, otherwise a
        # None would later be embedded in a data URL.
        elif element_type == 'Image':
            image_base64 = getattr(element_meta, 'image_base64', None)
            if image_base64:
                _note_type('image')
                content_data['images'].append(image_base64)

    return content_data

In [None]:
def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """
    Create a searchable, AI-written description for mixed content.

    Args:
        text: Text of the chunk.
        tables: Table renderings (e.g. HTML) contained in the chunk.
        images: Base64-encoded images contained in the chunk.

    Returns:
        The model's response content. If anything fails, a fallback made of
        the first 300 characters of `text` plus table/image counts is
        returned instead; this function does not raise.
    """
    try:
        # Initialize LLM (needs vision model for images)
        # NOTE(review): timeout=120000 is very large if the unit is seconds — confirm.
        llm = ChatGoogleGenerativeAI(
                    model="gemini-2.5-flash",
                    temperature=0.0,
                    max_tokens=None,
                    timeout=120000,
                    max_retries=10
                )

        # Build the text prompt
        prompt_text = f"""You are creating a searchable description for document content retrieval.

        CONTENT TO ANALYZE:
        TEXT CONTENT:
        {text}

        """

        # Add tables if present
        if tables:
            prompt_text += "TABLES:\n"
            for number, table in enumerate(tables, start=1):
                prompt_text += f"Table {number}:\n{table}\n\n"

        # The task instructions belong to every request exactly once. They
        # used to be appended inside the table loop, which repeated them per
        # table and omitted them entirely for image-only chunks.
        prompt_text += """
                YOUR TASK:
                Generate a comprehensive, searchable description that covers:

                1. Key facts, numbers, and data points from text and tables
                2. Main topics and concepts discussed  
                3. Questions this content could answer
                4. Visual content analysis (charts, diagrams, patterns in images)
                5. Alternative search terms users might use

                Make it detailed and searchable - prioritize findability over brevity.

                SEARCHABLE DESCRIPTION:"""

        # Build message content starting with text
        message_content = [{"type": "text", "text": prompt_text}]

        # Add images to the message as data URLs
        for image_base64 in images:
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
            })

        # Send to AI and get response
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content

    except Exception as e:
        print(f"     ‚ùå AI summary failed: {e}")
        # Fallback to simple summary
        summary = f"{text[:300]}..."
        if tables:
            summary += f" [Contains {len(tables)} table(s)]"
        if images:
            summary += f" [Contains {len(images)} image(s)]"
        return summary

In [None]:
def summarise_chunks(chunks):
    """
    Convert chunks into LangChain Documents, summarising mixed content with AI.

    Chunks containing tables or images get an AI-written description as page
    content; plain-text chunks keep their raw text. The raw text, table HTML
    and base64 images are always stored as a JSON string under the
    "original_content" metadata key.
    """
    print("üß† Processing chunks with AI Summaries...")

    total = len(chunks)
    documents = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"   Processing chunk {position}/{total}")

        # Analyze chunk content
        parts = separate_content_types(chunk)
        raw_text = parts['text']
        tables = parts['tables']
        images = parts['images']

        # Debug prints
        print(f"     Types found: {parts['types']}")
        print(f"     Tables: {len(tables)}, Images: {len(images)}")

        if not (tables or images):
            print("     ‚Üí Using raw text (no tables/images)")
            page_content = raw_text
        else:
            print("     ‚Üí Creating AI summary for mixed content...")
            try:
                page_content = create_ai_enhanced_summary(raw_text, tables, images)
                print("     ‚Üí AI summary created successfully")
                print(f"     ‚Üí Enhanced content preview: {page_content[:200]}...")
            except Exception as e:
                # Any failure falls back to the chunk's own text.
                print(f"     ‚ùå AI summary failed: {e}")
                page_content = raw_text

        # Keep the untouched source material alongside the summary.
        original_payload = json.dumps({
            "raw_text": raw_text,
            "tables_html": tables,
            "images_base64": images
        })
        documents.append(
            Document(
                page_content=page_content,
                metadata={"original_content": original_payload}
            )
        )

    print(f"‚úÖ Processed {len(documents)} chunks")
    return documents

In [None]:
def export_chunks_to_json(chunks, filename="chunks_export.json"):
    """
    Write processed chunks to a JSON file and return the exported records.

    Each record holds a 1-based "chunk_id", the document's page content under
    "enhanced_content", and the decoded "original_content" metadata (an empty
    object when that metadata key is absent).
    """
    records = [
        {
            "chunk_id": number,
            "enhanced_content": doc.page_content,
            "metadata": {
                # Stored as a JSON string on the document; decode it here.
                "original_content": json.loads(doc.metadata.get("original_content", "{}"))
            }
        }
        for number, doc in enumerate(chunks, start=1)
    ]

    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(records, out_file, indent=2, ensure_ascii=False)

    print(f"‚úÖ Exported {len(records)} chunks to {filename}")
    return records

# Export your chunks
# json_data = export_chunks_to_json(processed_chunks)

In [None]:
def create_vector_store(documents, persist_directory="dbv1/chroma_db"):
    """
    Create and persist a ChromaDB vector store from LangChain documents.

    Args:
        documents: Documents to embed and store.
        persist_directory: Directory the store is written to; created if it
            does not exist.

    Returns:
        The Chroma vector store built from `documents`.
    """
    # The docstring must be the first statement to be recognised as one, so
    # the directory is created after it.
    os.makedirs(persist_directory, exist_ok=True)
    print("üîÆ Creating embeddings and storing in ChromaDB...")

    embedding_model = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

    # Create ChromaDB vector store, using cosine distance for the index.
    print("--- Creating vector store ---")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_directory,
        collection_metadata={"hnsw:space": "cosine"}
    )
    print("--- Finished creating vector store ---")

    print(f"‚úÖ Vector store created and saved to {persist_directory}")
    return vectorstore

In [None]:
def run_complete_ingestion_pipeline(dir_path: str, persist_directory: str = "dbv2/chroma_db"):
    """
    Run the complete RAG ingestion pipeline.

    Steps: partition the PDFs in `dir_path`, chunk by title, summarise the
    chunks, and store them in a vector store.

    Args:
        dir_path: Directory containing the PDFs to ingest.
        persist_directory: Where the vector store is persisted. Defaults to
            the location this pipeline has always used.

    Returns:
        The vector store returned by create_vector_store.
    """
    print("üöÄ Starting RAG Ingestion Pipeline")
    print("=" * 50)

    # Step 1: Partition
    elements = partition_documents(dir_path)

    # Step 2: Chunk
    chunks = create_chunks_by_title(elements)

    # Step 3: AI Summarisation
    summarised_chunks = summarise_chunks(chunks)

    # Step 4: Vector Store
    db = create_vector_store(summarised_chunks, persist_directory=persist_directory)

    print("üéâ Pipeline completed successfully!")
    return db

In [None]:
# Ingest every PDF under ./pdfs (relative to the current working directory)
# and keep the resulting vector store for the retrieval cell.
db = run_complete_ingestion_pipeline("./pdfs")

In [None]:
# Example question, answered from the three chunks the retriever returns
# (search_kwargs k=3).
query = "what is the main result of the paper of ganguly from which soundarajan's paper on smooth numbers in short intervals is inspired?"
retriever = db.as_retriever(search_kwargs={"k": 3})
chunks = retriever.invoke(query)

def generate_final_answer(chunks, query):
    """
    Answer a question from retrieved chunks, using their original content.

    For every chunk, the raw text and table HTML stored under the
    "original_content" metadata key are placed in the prompt, and stored
    base64 images are attached as image parts after the text. Returns the
    model's response content, or a fixed apology string if anything fails.
    """
    try:
        # Vision-capable model, since images may be attached.
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            temperature=0.0,
            max_tokens=None,
            timeout=120000,
            max_retries=3,
        )

        sections = [f"""Based on the following documents, please answer this question: {query}

CONTENT TO ANALYZE:
"""]
        image_parts = []

        for doc_number, chunk in enumerate(chunks, start=1):
            sections.append(f"--- Document {doc_number} ---\n")

            if "original_content" in chunk.metadata:
                # Decode the stored payload once and use it for text, tables
                # and images alike.
                payload = json.loads(chunk.metadata["original_content"])

                raw_text = payload.get("raw_text", "")
                if raw_text:
                    sections.append(f"TEXT:\n{raw_text}\n\n")

                tables_html = payload.get("tables_html", [])
                if tables_html:
                    sections.append("TABLES:\n")
                    for table_number, table in enumerate(tables_html, start=1):
                        sections.append(f"Table {table_number}:\n{table}\n\n")

                for image_base64 in payload.get("images_base64", []):
                    image_parts.append({
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
                    })

            sections.append("\n")

        sections.append("""
Please provide a clear, comprehensive answer using the text, tables, and images above. If the documents don't contain sufficient information to answer the question, say "I don't have enough information to answer that question based on the provided documents."

ANSWER:""")

        # Text part first, then every image in chunk order.
        message_content = [{"type": "text", "text": "".join(sections)}]
        message_content.extend(image_parts)

        response = llm.invoke([HumanMessage(content=message_content)])
        return response.content

    except Exception as e:
        print(f"‚ùå Answer generation failed: {e}")
        return "Sorry, I encountered an error while generating the answer."

# Usage: answer the example query from the retrieved chunks and show the result.
final_answer = generate_final_answer(chunks, query)
print(final_answer)