Install the required libraries

In [1]:
!pip install -q llama-index llama-index-llms-gemini pymupdf
!pip install -q llama-index-embeddings-huggingface
!pip install llama-index-retrievers-bm25

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m113.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Import necessary libraries and set up the environment

In [20]:
import os
from getpass import getpass
from typing import List

import fitz
from google.colab import files
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.schema import NodeWithScore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.retrievers.bm25 import BM25Retriever

Set up the Google API key for Gemini

In [22]:
# Never paste a real key into the notebook. Reuse a key that is already
# exported in the environment; otherwise prompt for one without echoing it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Google API key: ")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

Load a PDF and convert it to LlamaIndex format

In [5]:
def load_pdf(pdf_path: str) -> List[Document]:
    """Load a PDF and convert it to LlamaIndex Document format using PyMuPDF.

    Each page that yields text becomes one ``Document``; pages whose text is
    empty or whitespace-only are skipped.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        One ``Document`` per non-empty page, in page order. Each carries
        ``file_name``, 1-based ``page_number`` and ``total_pages`` metadata
        (``total_pages`` counts every page, including skipped ones).
    """
    file_name = os.path.basename(pdf_path)
    documents = []

    # The context manager closes the PDF even if text extraction or Document
    # construction raises part-way through.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()
            if not text.strip():
                continue
            documents.append(
                Document(
                    text=text,
                    metadata={
                        "file_name": file_name,
                        "page_number": page_number,
                        "total_pages": total_pages
                    }
                )
            )

    print(f"Processed {pdf_path}:")
    print(f"Extracted {len(documents)} pages with content")
    return documents

Initialize Gemini and Embedding Model

In [21]:
# Gemini chat model; later cells pass it to the fusion retriever and the query engine
llm = Gemini(model="models/gemini-1.5-flash")

# Local sentence-embedding model shared by the index and the semantic splitter
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Register both models as the LlamaIndex-wide defaults
Settings.llm = llm
Settings.embed_model = embed_model

# Semantic splitter driven by the embedding model above
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,  # how sensitive the splitter is to a change in meaning
    embed_model=embed_model,
)

  llm = Gemini(model="models/gemini-1.5-flash")


Process the PDF and create the vector index (the BM25 keyword retriever is built later, in the RAG pipeline)

In [7]:
def process_and_index_pdf(pdf_path):
    """Load a PDF, split it into semantic chunks and build a vector index.

    Args:
        pdf_path: Path to the PDF file to index.

    Returns:
        A ``VectorStoreIndex`` built from the nodes that the module-level
        ``splitter`` produces for the pages returned by ``load_pdf``.
    """
    documents = load_pdf(pdf_path)
    nodes = splitter.get_nodes_from_documents(documents)
    vector_index = VectorStoreIndex(nodes)
    # Report the nodes actually indexed, not the number of pages that were loaded
    print(f"Indexed {len(nodes)} document chunks")
    return vector_index

Build RAG Pipeline

In [14]:
def build_rag_pipeline(index):
    """Build a query engine that combines vector and BM25 retrieval over ``index``.

    The pipeline is assembled in this order:

    1. A vector retriever from ``index`` and a BM25 retriever over the same
       nodes are merged by ``HybridRetriever``.
    2. ``QueryFusionRetriever`` wraps the hybrid retriever
       (``num_queries=3``, ``mode="reciprocal_rerank"``).
    3. When the index holds more than one node, a cross-encoder
       ``SentenceTransformerRerank`` is attached as a post-processor.

    Args:
        index: Index whose ``docstore`` holds the nodes to search.

    Returns:
        A ``RetrieverQueryEngine`` that uses the module-level ``llm``.
    """
    nodes = list(index.docstore.docs.values())  # Every node stored when the PDF was indexed
    num_nodes = len(nodes)
    # Ask for at most 2 results, and never fewer than 1
    safe_top_k = min(2, max(1, num_nodes))

    vector_retriever = index.as_retriever(similarity_top_k=safe_top_k)  # Embedding-based retrieval
    bm25_retriever = BM25Retriever.from_defaults(  # Keyword-based (BM25) retrieval
        nodes=nodes,
        similarity_top_k=safe_top_k
    )

    def _score_or_zero(hit):
        """Return the hit's score, treating a missing or ``None`` score as 0.0."""
        score = getattr(hit, 'score', None)
        return 0.0 if score is None else score

    class HybridRetriever(BaseRetriever):
        """Merge the results of a vector retriever and a keyword retriever."""

        def __init__(self, vector_retriever, keyword_retriever, top_k=2):
            self.vector_retriever = vector_retriever
            self.keyword_retriever = keyword_retriever
            self.top_k = top_k
            super().__init__()

        def _retrieve(self, query_bundle, **kwargs):
            """Return up to ``top_k`` de-duplicated hits, highest score first."""
            vector_nodes = self.vector_retriever.retrieve(query_bundle)
            keyword_nodes = self.keyword_retriever.retrieve(query_bundle)
            all_nodes = list(vector_nodes) + list(keyword_nodes)
            # De-duplicate by node id; for a node found by both retrievers the
            # keyword hit (added last) is the one that is kept.
            unique_nodes = {node.node_id: node for node in all_nodes}
            # NOTE(review): vector and BM25 scores are presumably on different
            # scales, so this ordering may favour one retriever - confirm.
            sorted_nodes = sorted(
                unique_nodes.values(),
                # A plain attribute check is not enough: a score that is present
                # but None cannot be compared with floats.
                key=_score_or_zero,
                reverse=True
            )
            return sorted_nodes[:self.top_k]

    hybrid_retriever = HybridRetriever(
        vector_retriever=vector_retriever,
        keyword_retriever=bm25_retriever,
        top_k=safe_top_k
    )

    if num_nodes > 1:
        # Re-rank the retrieved chunks against the query with a cross-encoder
        reranker = SentenceTransformerRerank(
            model="cross-encoder/ms-marco-MiniLM-L-12-v2",
            top_n=min(2, num_nodes)
        )
        node_postprocessors = [reranker]
    else:
        # With a single node there is nothing to re-rank
        node_postprocessors = []

    fusion_retriever = QueryFusionRetriever(
        retrievers=[hybrid_retriever],
        llm=llm,
        similarity_top_k=2,
        num_queries=3,  # Generate 3 queries per original query
        mode="reciprocal_rerank"
    )

    # Combine the fusion retriever with the optional reranker
    query_engine = RetrieverQueryEngine.from_args(
        retriever=fusion_retriever,
        llm=llm,
        node_postprocessors=node_postprocessors
    )
    return query_engine

Upload PDF Document

In [15]:
print("Please select a PDF file to upload.")
uploaded = files.upload()
# A cancelled upload leaves `uploaded` empty; fail with a clear message instead
# of an IndexError from indexing an empty list.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a PDF.")
# Only the first uploaded file is used
pdf_path = next(iter(uploaded))

Please select a PDF file to upload.


Saving LenderFeesWorksheetNew.pdf to LenderFeesWorksheetNew (2).pdf


Run the Query

In [17]:
index = process_and_index_pdf(pdf_path)
rag_engine = build_rag_pipeline(index)
print("Type 'Exit' to stop.")
while True:
    # Tolerate surrounding whitespace and any capitalisation of the exit word
    user_input = input().strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # A blank line is not a question; wait for the next one instead of
        # sending an empty query to the engine.
        continue
    response = rag_engine.query(user_input)
    print('\nFinal Response:\n ---------------------- \n')
    print(response)

DEBUG:bm25s:Building index from IDs objects


Processed LenderFeesWorksheetNew (2).pdf:
Extracted 1 pages with content
Indexed 1 document chunks
Type 'Exit' to stop.
What is the total estimated monthly payment?

Final Response:
 ---------------------- 

The total estimated monthly payment is $1,869.37.

Exit
