<a href="https://colab.research.google.com/github/RaghavG189/Python-Projects/blob/main/Route_Queries_Within_a_Large_Contract_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install llama-index
!pip install llama-index-readers-file
!pip install llama-index transformers sentence-transformers
!pip install llama-index-embeddings-huggingface
!pip install PyPDF2



In [None]:
from PyPDF2 import PdfReader
from google.colab import files
# Ask the user to upload a PDF; the keys of the returned mapping are used as file names.
uploaded = files.upload()
if not uploaded:
    # Cancelling the upload dialog would otherwise surface as a bare IndexError below.
    raise ValueError("No file was uploaded; please select a PDF to continue.")

# Only the first uploaded file is processed.
reader = PdfReader(next(iter(uploaded)))

# Extract the text of every page and keep its zero-based position in the file.
pages = [page.extract_text() for page in reader.pages]
doc_pages = [{"page_num": i, "page_text": p} for i, p in enumerate(pages)]
doc_pages

Saving Test Blob File.pdf to Test Blob File (5).pdf


[{'page_num': 0,
  'page_text': 'Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.\nFee Details and Summary\nApplicants: Application No:\nDate Prepared:\nLoan Program:Prepared By:\nTHIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees W orksheet" is provided for informational purposes ONLY, to assist\nyou in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage \npayment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.\nTotal Loan Amount:  Interest Rate: Term/Due In:\nFee Paid To Paid By (Fee Split**) Amount PFC / F / POC\nTOTAL ESTIMATED FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED MONTHLY PAYMENT:\nTotal Estimated Funds Total Monthly PaymentPurchase Price (+)\nAlterations (+)\nLand (+)\nRefi (incl. debts to be paid off) (+)\nEst. Prepaid Items/Reserves (+)\nEst. Closing Costs (+)Loan Amount (-) Principal & Interest\nOther Fin

In [None]:
def gemini(prompt):
  """Send ``prompt`` to the Gemini model and return the text of its reply.

  The API key is read from the ``GOOGLE_API_KEY`` environment variable
  (falling back to ``GEMINI_API_KEY``) instead of being hardcoded in the
  notebook, so the key never ends up in a shared or committed file.

  Args:
    prompt: The prompt string passed to ``generate_content``.

  Returns:
    The ``text`` attribute of the model response.

  Raises:
    RuntimeError: If neither environment variable holds an API key.
  """
  import os

  api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
  if not api_key:
    # Fail before touching the network so the cause is obvious to the reader.
    raise RuntimeError(
        "No Gemini API key found. Set the GOOGLE_API_KEY environment variable "
        "before calling gemini()."
    )

  # Imported inside the function, as in the original cell.
  import google.generativeai as genai

  genai.configure(api_key=api_key)

  model = genai.GenerativeModel("models/gemini-2.0-flash")
  response = model.generate_content(prompt)

  return response.text

In [None]:
def is_same_document(prev_text, curr_text, doc_type=None):
    """Ask the LLM whether two consecutive pages belong to the same document.

    Args:
        prev_text: Text of the previous page.
        curr_text: Text of the current page.
        doc_type: Type already assigned to the previous page; shown to the
            model as a hint, or 'unknown' when not given.

    Returns:
        True when the model's reply starts with "yes" (case-insensitive,
        ignoring leading quotes or markdown emphasis), otherwise False.
    """
    prompt = f"""
    You are checking whether two pages belong to the same document.
    Previous page type: {doc_type or 'unknown'}

    Previous Page:
    {prev_text}

    Current Page:
    {curr_text}

    Answer ONLY 'Yes' or 'No'. Do NOT explain.
    """
    response = gemini(prompt)  # Swap in LLM call
    # The prompt itself quotes the expected answers, so the model may echo the
    # quotes ('Yes') or add emphasis (**Yes**); drop that decoration first so
    # such replies are not misread as "No".
    verdict = response.strip().lower().lstrip("'\"`*_ ")
    return verdict.startswith("yes")

In [None]:
def classify_document_type(text):
    """Classify the first page of a document into one of the known types.

    The model's reply is normalised to the title-cased form of one of the
    labels offered in the prompt (e.g. "Lender Fee Sheet", "Id", "Payslip").
    Replies that wrap the label in quotes/markdown or embed it in a sentence
    are still recognised; anything that names no known label becomes "Other"
    so that every stored type is one a query can later be routed to.

    Args:
        text: Page text shown to the model.

    Returns:
        The title-cased document type.
    """
    import re

    prompt = f"""
    This is the start of a new document. Based on the content, classify it.

    Page Content:
    {text}

    Choose from: Resume, Contract, Lender Fee Sheet, ID, PaySlip, Other.
    Just respond with the type.
    """
    # Same labels as listed in the prompt; "Other" stays last so that it only
    # wins when no specific type is mentioned.
    known_types = ("Resume", "Contract", "Lender Fee Sheet", "ID", "PaySlip", "Other")

    response = gemini(prompt).strip().lower().replace(".", "")
    cleaned = response.strip("\"'`*_ \n\t")

    # Preferred case: the reply is exactly one of the labels.
    for label in known_types:
        if cleaned == label.lower():
            return label.title()  # Capitalize the first letter of each word

    # Otherwise accept a label mentioned as a whole word inside a longer reply.
    for label in known_types:
        if re.search(rf"\b{re.escape(label.lower())}\b", cleaned):
            return label.title()

    return "Other"

In [None]:
# Walk the pages in order, asking the LLM whether each page continues the
# previous one, and record per-page routing metadata.
metadata_store = []
current_doc_type = None
page_num = 0      # position of the page inside its logical document
same = False      # the first page always opens a new document

for i, page in enumerate(doc_pages):
    curr_text = page['page_text']

    if i > 0:
        # Only pages after the first have a predecessor to compare against.
        prev_text = doc_pages[i - 1]['page_text']
        same = is_same_document(prev_text, curr_text, current_doc_type)

    if same:
        page_num += 1
    else:
        # A new document starts here: restart the counter and classify it.
        page_num = 0
        current_doc_type = classify_document_type(curr_text)

    record = {
        "page": i,
        "text": curr_text,
        "is_new_doc": not same,
        "doc_type": current_doc_type,
        "page_in_doc": page_num,
    }
    metadata_store.append(record)

metadata_store

[{'page': 0,
  'text': 'Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.\nFee Details and Summary\nApplicants: Application No:\nDate Prepared:\nLoan Program:Prepared By:\nTHIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees W orksheet" is provided for informational purposes ONLY, to assist\nyou in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage \npayment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.\nTotal Loan Amount:  Interest Rate: Term/Due In:\nFee Paid To Paid By (Fee Split**) Amount PFC / F / POC\nTOTAL ESTIMATED FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED MONTHLY PAYMENT:\nTotal Estimated Funds Total Monthly PaymentPurchase Price (+)\nAlterations (+)\nLand (+)\nRefi (incl. debts to be paid off) (+)\nEst. Prepaid Items/Reserves (+)\nEst. Closing Costs (+)Loan Amount (-) Principal & Interest\nOther Financing (P

In [None]:
# Merge consecutive pages into logical documents, using the `is_new_doc` flag
# recorded for each page in `metadata_store`.
logical_docs = []
# No document is open until the first page is seen. (A non-empty placeholder
# dict here would be truthy and get appended as a bogus empty first document.)
current_doc = None

for page in metadata_store:
    if page["is_new_doc"] or current_doc is None:
        # Close the document collected so far before opening the next one.
        if current_doc is not None:
            logical_docs.append(current_doc)
        current_doc = {
            "text": page["text"],
            "doc_type": page["doc_type"],
            "page_start": page["page"],
            "page_end": page["page"],
            "source_file": "blob_file_sample.pdf"
        }
    else:
        # Continuation page: extend the text and move the end marker forward.
        current_doc["text"] += "\n\n" + page["text"]
        current_doc["page_end"] = page["page"]

# Flush the last open document, if any pages were processed at all.
if current_doc is not None:
    logical_docs.append(current_doc)

In [None]:
from llama_index.core.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 100
SOURCE_FILE_LABEL = "Sample Blob File"  # replace with your document name

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split every logical document into chunks; each chunk carries the type and
# page range of the document it came from plus its position within that document.
chunked_documents = [
    Document(
        text=chunk,
        metadata={
            "doc_type": doc["doc_type"],
            "chunk_index": chunk_idx,
            "page_start": doc["page_start"],
            "page_end": doc["page_end"],
            "source_file": SOURCE_FILE_LABEL,
        },
    )
    for doc in logical_docs
    for chunk_idx, chunk in enumerate(splitter.split_text(doc["text"]))
]

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex

# Embed the chunks with a local sentence-transformers model and build an
# in-memory vector index over them.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)
index = VectorStoreIndex.from_documents(
    chunked_documents,
    embed_model=embed_model,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def predict_doc_type_for_query(query):
    """Ask the LLM which document type most likely answers ``query``.

    The reply is normalised before it is returned: surrounding whitespace,
    quotes, markdown emphasis and a trailing period are removed, and when the
    reply names one of the labels offered in the prompt that label is returned
    exactly as spelled there. This keeps a decorated reply such as
    "Lender Fee Sheet." from silently matching nothing in a metadata filter.

    Args:
        query: The user's question.

    Returns:
        One of the prompt's labels when recognised, otherwise the cleaned reply.
    """
    import re

    prompt = f"""
    You are an intelligent assistant that routes user queries to the most relevant document.

    User query: "{query}"

    Which document type is most likely to contain the answer?
    Choose ONLY ONE from: Resume, Contract, Lender Fee Sheet, ID, PaySlip, Other.
    """

    # Same labels as listed in the prompt; "Other" stays last so that it only
    # wins when no specific type is mentioned.
    known_types = ("Resume", "Contract", "Lender Fee Sheet", "ID", "PaySlip", "Other")

    response = gemini(prompt)
    cleaned = response.strip().strip("\"'`*_. \n\t")
    lowered = cleaned.lower()

    # Preferred case: the reply is exactly one of the labels (any letter case).
    for label in known_types:
        if lowered == label.lower():
            return label

    # Otherwise accept a label mentioned as a whole word inside a longer reply.
    for label in known_types:
        if re.search(rf"\b{re.escape(label.lower())}\b", lowered):
            return label

    return cleaned

In [None]:
from llama_index.core.vector_stores import MetadataFilters, FilterOperator, MetadataFilter

# Question to answer from the uploaded file.
user_query = "What is the total loan amount and the purchase price in the fees worksheet?"

# Route the question to a document type; title-casing makes the prediction
# line up with the doc_type values stored in the chunk metadata.
predicted_doc_type = predict_doc_type_for_query(user_query).title()
print(predicted_doc_type)

# Restrict retrieval to chunks whose doc_type equals the predicted type.
doc_type_filter = MetadataFilter(
    key="doc_type",
    value=predicted_doc_type,
    operator=FilterOperator.EQ,
)
retriever = index.as_retriever(filters=MetadataFilters(filters=[doc_type_filter]))
results = retriever.retrieve(user_query)

# Show where each hit came from, followed by a short preview of its text.
for res in results:
    meta = res.metadata
    print(f"[{meta['doc_type']} - p{meta['page_start']}–{meta['page_end']}]")
    print(res.text[:200])
    print("-" * 50)

Lender Fee Sheet
[Lender Fee Sheet - p0–0]
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan. Fee Details and Summary Applicants: Application No: Date Prepared: Loan Program:Prepared By: 
--------------------------------------------------


In [None]:
# Turn the retrieved chunk into a direct answer to the user's question.
if not results:
    print("Could not find an answer as no relevant documents were retrieved.")
else:
    # Only the first retrieved result is used as context for the model.
    top_chunk_text = results[0].text

    # Prompt asking the model to pull the specific value out of the chunk.
    prompt = f"""
    Based on the following text, please answer the user's question.

    Text:
    {top_chunk_text}

    User Question: {user_query}

    Provide only the specific value or amount as the answer.
    """

    answer = gemini(prompt)

    print(f"The answer to your question '{user_query}' is: {answer}")

The answer to your question 'What is the total loan amount and the purchase price in the fees worksheet?' is: Total Loan Amount: 380,000.00
Purchase Price: 475,000.00

