Install necessary packages

In [None]:
!pip install llama-index llama-index-llms-openai pymupdf llama-index-embeddings-huggingface langchain langchain_openai
!pip install llama-index-retrievers-bm25
!pip install llama-index-readers-file pymupdf
!pip install --upgrade sympy
!pip install llama_index.llms.gemini

Load document

In [None]:
import fitz  # PyMuPDF

# Open the worksheet with PyMuPDF. The handle stays open because the chunking
# cell further down iterates over `doc` again to read each page.
doc = fitz.open("LenderFeesWorksheetNew.pdf")

# Join the text of every page, with a newline between consecutive pages.
text = "\n".join(page.get_text() for page in doc)

# Quick sanity check: whitespace-separated word count of the extracted text.
print(f"Extracted {len(text.split())} words from the PDF.")

Process Query (query expansion and rewriting) using OpenAI API

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from google.colab import userdata

# Query expansion / rewriting: a prompt piped into an OpenAI chat model.

# Prompt asking the model to reformulate a user query for better retrieval.
# The only placeholder is {original_query}.
query_rewrite_template = """You are an AI assistant tasked with reformulating user queries to improve retrieval in a RAG system.
Given the original query, rewrite it to be more specific, detailed, and likely to retrieve relevant information.

Original query: {original_query}

Rewritten query:"""

query_rewrite_prompt = PromptTemplate(
    template=query_rewrite_template,
    input_variables=["original_query"],
)

# Chat model used for rewriting; the API key is read from Colab's secret store.
re_write_llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
    max_tokens=4000,
    api_key=userdata.get('OPENAI_API_KEY'),
)

# Chain: fill the prompt, then send it to the chat model.
query_rewriter = query_rewrite_prompt | re_write_llm

# define rewrite query function
def rewrite_query(original_query):
    """
    Reformulate a user query through the `query_rewriter` chain.

    Args:
    original_query (str): The query as the user typed it

    Returns:
    str: The `content` of the chain's response, i.e. the rewritten query
    """
    llm_message = query_rewriter.invoke(original_query)
    rewritten_text = llm_message.content
    return rewritten_text

Semantic chunking (TODO: confirm that the implementation below is complete)

---



In [None]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.schema import Document
import os
from google.colab import userdata

# `doc` is the PyMuPDF document opened in the "Load document" cell; iterating
# over it yields one page at a time.
documents = doc

# Initialize OpenAI embedding model
embed_model = OpenAIEmbedding(api_key=userdata.get('OPENAI_API_KEY'))

# Semantic splitter: groups text into chunks using the embedding model.
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model
)

# Sentence splitter used for the initial pass over each page.
sentence_splitter = SentenceSplitter(chunk_size=512)

# Wrap the text of each page in a llama-index Document.
# The loop must not assign to `doc`: doing so would replace the PyMuPDF handle
# with a Document and make this cell fail (or silently misbehave) when re-run.
documents_converted = []
for page in documents:
    page_text = page.get_text()
    if not page_text.strip():
        # A page with no extractable text has nothing to chunk or embed.
        continue
    documents_converted.append(Document(text=page_text))

# Two-stage chunking: sentence split first, then semantic split of the result.
nodes = []
for document in documents_converted:
    split_sentences = sentence_splitter.get_nodes_from_documents([document])
    semantic_nodes = semantic_splitter.get_nodes_from_documents(split_sentences)
    nodes.extend(semantic_nodes)

# `nodes` now holds the semantically split chunks for all pages.

Create hybrid retriever (best for this case)

In [None]:
from llama_index.core.retrievers import BaseRetriever
from typing import List, Optional, Tuple
from llama_index.core.schema import NodeWithScore, QueryBundle
import numpy as np
from llama_index.core import VectorStoreIndex
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.storage.docstore import SimpleDocumentStore

class HybridRetriever(BaseRetriever):
    """Retriever that fuses the results of a vector retriever and a BM25 retriever.

    Both retrievers are queried concurrently. Each result list is min-max
    normalised to [0, 1], weighted (``vector_weight`` for the vector hits,
    ``1 - vector_weight`` for the BM25 hits) and merged by ``node_id``: a node
    returned by both retrievers receives the sum of its two weighted scores.

    Args:
        vector_retriever: Object exposing ``retrieve(query_bundle)``; its hits
            are weighted by ``vector_weight``.
        bm25_retriever: Object exposing ``retrieve(query_bundle)``; its hits
            are weighted by ``1 - vector_weight``.
        vector_weight: Weight applied to the normalised vector scores.
        top_k_per_retriever: Maximum number of hits kept from each retriever
            before fusion.
        top_n: Maximum number of fused hits returned.
        dedup_threshold: Stored on the instance but not used by any method of
            this class; kept so existing constructor calls keep working.
    """

    def __init__(
        self,
        vector_retriever,
        bm25_retriever,
        vector_weight: float = 0.5,
        top_k_per_retriever: int = 10,
        top_n: int = 10,
        dedup_threshold: float = 0.9,
    ):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        self.vector_weight = vector_weight
        self.top_k_per_retriever = top_k_per_retriever
        self.top_n = top_n
        self.dedup_threshold = dedup_threshold
        super().__init__()

    def _normalize_scores(self, nodes: List[NodeWithScore]) -> List[NodeWithScore]:
        """Min-max normalise the scores of ``nodes`` in place and return the list.

        A missing score (``None``) is treated as 0.0. When every score is the
        same (including the single-hit case) each node gets 1.0 rather than 0.0,
        so that a retriever's only hit is not wiped out before fusion.
        """
        if not nodes:
            return []
        scores = [0.0 if node.score is None else node.score for node in nodes]
        min_score, max_score = min(scores), max(scores)
        span = max_score - min_score
        for node, score in zip(nodes, scores):
            node.score = (score - min_score) / span if span > 0 else 1.0
        return nodes

    def _deduplicate_nodes(self, nodes: List[NodeWithScore]) -> List[NodeWithScore]:
        """Return ``nodes`` with later repeats of an already-seen ``node_id`` dropped.

        Order is preserved, so the first occurrence of each node wins.
        """
        deduped = []
        seen_ids = set()
        for node in nodes:
            if node.node.node_id not in seen_ids:
                seen_ids.add(node.node.node_id)
                deduped.append(node)
        return deduped

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Query both retrievers, fuse their scores and return the best ``top_n`` hits.

        Returns:
            New ``NodeWithScore`` objects carrying the fused score, sorted from
            highest to lowest, with at most one entry per ``node_id``.

        Raises:
            Any exception raised by either underlying retriever.
        """
        from concurrent.futures import ThreadPoolExecutor

        # Run the two retrievals concurrently. Unlike bare threads, `.result()`
        # re-raises a worker's exception instead of leaving an empty result list.
        with ThreadPoolExecutor(max_workers=2) as pool:
            vector_future = pool.submit(self.vector_retriever.retrieve, query_bundle)
            bm25_future = pool.submit(self.bm25_retriever.retrieve, query_bundle)
            vector_results = vector_future.result()[:self.top_k_per_retriever]
            bm25_results = bm25_future.result()[:self.top_k_per_retriever]

        # Put both score lists on the same [0, 1] scale before weighting.
        vector_nodes = self._normalize_scores(vector_results)
        bm25_nodes = self._normalize_scores(bm25_results)

        # Fuse by node_id: weighted scores of a node found by both retrievers add up.
        fused = {}
        weighted_lists = (
            (self.vector_weight, vector_nodes),
            (1 - self.vector_weight, bm25_nodes),
        )
        for weight, hits in weighted_lists:
            seen_in_list = set()
            for hit in hits:
                node_id = hit.node.node_id
                if node_id in seen_in_list:
                    # Count a node at most once per retriever.
                    continue
                seen_in_list.add(node_id)
                weighted_score = weight * hit.score
                if node_id in fused:
                    fused[node_id].score += weighted_score
                else:
                    fused[node_id] = NodeWithScore(node=hit.node, score=weighted_score)

        # Highest fused score first, then apply the final top_n limit.
        ranked = sorted(fused.values(), key=lambda hit: hit.score, reverse=True)
        return ranked[:self.top_n]

# Build both retrievers from the semantic chunks in `nodes`, not from the
# whole-page documents. This is what puts the chunking step to use, and it
# gives the two retrievers a shared set of node objects, so HybridRetriever's
# node_id-based merging can recognise a chunk returned by both.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

# Create a vector index for embedding-based retrieval
vector_index = VectorStoreIndex(nodes=nodes, embed_model=embed_model)
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

# Create a BM25 keyword-based retriever
bm25_retriever = BM25Retriever.from_defaults(docstore=docstore, similarity_top_k=3)

# Combine both retrievers into a Hybrid Retriever
hybrid_retriever = HybridRetriever(
    vector_retriever=vector_retriever,
    bm25_retriever=bm25_retriever,
    vector_weight=0.5,  # equal weight for vector and BM25 scores
)

Use hybrid retrieval - need to test implementation after semantic chunking is complete.

In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core.query_engine import RetrieverQueryEngine
from google.colab import userdata

# Gemini generates the answer; retrieval goes through `hybrid_retriever`.
# The API key is read from Colab's secret store.
llm = Gemini(
    api_key=userdata.get('GOOGLE_API_KEY'),
    model="models/gemini-1.5-flash",
)
query_engine = RetrieverQueryEngine.from_args(retriever=hybrid_retriever, llm=llm)

# Smoke test: one question through the full retrieve-then-answer pipeline.
query = "What is the total estimated monthly payment?"
response = query_engine.query(query)

print("Initial Response:", response)

Then rerank the retrieved nodes and show the top result.

In [None]:
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.schema import QueryBundle

# Initialize reranker
reranker = LLMRerank(
    llm=llm,
    top_n=3,
)


def print_top_reranked(label, source_nodes, query_str):
    """Rerank `source_nodes` for `query_str` and print the text of the best one.

    Args:
        label (str): Text printed in front of the result.
        source_nodes: Retrieved nodes to rerank (e.g. `response.source_nodes`).
        query_str (str): Query the nodes are reranked against.

    Returns:
        The list returned by `reranker.postprocess_nodes`.
    """
    reranked = reranker.postprocess_nodes(
        source_nodes,
        query_bundle=QueryBundle(query_str=query_str)
    )
    if reranked:
        print(label, reranked[0].node.text)
    else:
        # Indexing [0] on an empty result would raise IndexError.
        print(label, "(no nodes returned by the reranker)")
    return reranked


# Rerank the nodes retrieved for the original query
retrieved_nodes = response.source_nodes
reranked_nodes = print_top_reranked("Top-ranked result:", retrieved_nodes, query)

# Test with rewritten query
rewritten_query = rewrite_query(query)
print("Rewritten Query:", rewritten_query)
response_rewritten = query_engine.query(rewritten_query)
print("Response with Rewritten Query:", response_rewritten)

# Rerank the results from the rewritten query
reranked_nodes_rewritten = print_top_reranked(
    "Top-ranked result with Rewritten Query:",
    response_rewritten.source_nodes,
    rewritten_query,
)

In [None]:
# Second test question, run through the same retrieve -> rerank -> rewrite steps.
query = "How much does the borrower pay for lender's title insurance?"
response = query_engine.query(query)

# Shown instead of a node's text when the reranker returns an empty list;
# indexing [0] on an empty list would raise IndexError.
no_result_text = "(no nodes returned by the reranker)"

# Rerank the nodes retrieved for the original query
retrieved_nodes = response.source_nodes
reranked_nodes = reranker.postprocess_nodes(
    retrieved_nodes,
    query_bundle=QueryBundle(query_str=query)
)

print(
    "Top-ranked result:",
    reranked_nodes[0].node.text if reranked_nodes else no_result_text,
)

# Test with rewritten query
rewritten_query = rewrite_query(query)
print("Rewritten Query:", rewritten_query)
response_rewritten = query_engine.query(rewritten_query)
print("Response with Rewritten Query:", response_rewritten)

# Rerank the results from the rewritten query
reranked_nodes_rewritten = reranker.postprocess_nodes(
    response_rewritten.source_nodes,
    query_bundle=QueryBundle(query_str=rewritten_query)
)
print(
    "Top-ranked result with Rewritten Query:",
    reranked_nodes_rewritten[0].node.text if reranked_nodes_rewritten else no_result_text,
)
