<a href="https://colab.research.google.com/github/RaghavG189/Python-Projects/blob/main/Route_Queries_Across_Multiple_PDF_Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install llama-index
!pip install llama-index-readers-file

In [None]:
from google.colab import files
from llama_index.readers.file import PDFReader
import time

# Ask the user to upload PDFs; the keys of `uploaded` are used below as the
# file paths handed to the reader.
uploaded = files.upload()
loader = PDFReader()

# all_documents[k] holds the list of page documents for dfiles[k].
# The two lists must stay index-aligned because a later cell zips them,
# which is why each file's pages are appended as one nested list.
all_documents = []
dfiles = list(uploaded.keys())

for dfile in dfiles:
  documents = loader.load_data(dfile)
  all_documents.append(documents)
  print(f"Loaded {len(documents)} pages from {dfile}")

# all_documents is nested per file, so len(all_documents) would count files;
# sum the per-file lengths to report the real number of loaded pages.
total_pages = sum(len(file_pages) for file_pages in all_documents)
print(f"\nTotal loaded documents: {total_pages}")

In [None]:
import uuid

# Flat list with one metadata record per PDF page, across all uploaded files.
pdf_metadata_store = []

for dfile, documents in zip(dfiles, all_documents):
  # Fields shared by every page of this file; the UUID is generated once per
  # file so all of its pages carry the same file_id.
  shared_fields = {
      "file_id": str(uuid.uuid4()),
      "user_id": "xyz",
      "doc_type": "unknown",  # placeholder until the page is classified
      "year": "2024",
      "filename": f"{dfile}",
  }
  # Page numbers are 1-based.
  pdf_metadata_store.extend(
      {**shared_fields, "page_number": page_number, "text": doc.text}
      for page_number, doc in enumerate(documents, start=1)
  )

In [None]:
def gemini_model(prompt, model_name="models/gemini-2.0-flash", api_key=None):
    """Send ``prompt`` to a Gemini model and return the reply text.

    Args:
        prompt: Prompt text passed to ``generate_content``.
        model_name: Gemini model identifier; defaults to the model this
            notebook was written against.
        api_key: API key to use. When omitted, the key is read from the
            ``GOOGLE_API_KEY`` environment variable so that no secret is
            stored in the notebook.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        RuntimeError: If no API key was passed and ``GOOGLE_API_KEY`` is unset
            or empty.
    """
    import os

    # Resolve the key first so a missing key fails fast with a clear message
    # instead of surfacing later as an opaque authentication error.
    resolved_key = api_key or os.environ.get("GOOGLE_API_KEY")
    if not resolved_key:
        raise RuntimeError(
            "No Gemini API key found: pass api_key=... or set the "
            "GOOGLE_API_KEY environment variable."
        )

    import google.generativeai as genai

    genai.configure(api_key=resolved_key)

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)

    return response.text

In [None]:
def classify_query_llm(query, metadata_store):
    """Ask the LLM which document type is most likely to answer ``query``.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        metadata_store: Iterable of metadata dicts; each must provide the
            ``"filename"`` and ``"doc_type"`` keys.

    Returns:
        One of ``"pay_stub"``, ``"loan_form"``, ``"resume"``, ``"contract"``,
        ``"w2"`` or ``"unknown"``, in lower case. Any reply outside that set
        is mapped to ``"unknown"``.
    """
    allowed_labels = ("pay_stub", "loan_form", "resume", "contract", "w2", "unknown")

    # The store holds one record per page, so the same file appears many
    # times. List each (filename, doc_type) pair once, in first-seen order,
    # to keep the prompt short and free of duplicates.
    unique_pairs = {}
    for doc in metadata_store:
        unique_pairs.setdefault((doc["filename"], doc["doc_type"]), None)

    doc_list = "\n".join(
        f"{position}. {filename} — doc_type: {doc_type}"
        for position, (filename, doc_type) in enumerate(unique_pairs, start=1)
    )

    prompt = f"""
  You are an intelligent assistant that routes user queries to the most relevant document.

  Available documents:
  {doc_list}

  User query: "{query}"

  Which document(s) are most likely to contain the answer?
  Respond with one of the following types:
  ["pay_stub", "loan_form", "resume", "contract", "w2", "unknown"]
  Return just the word.
  """

    response = gemini_model(prompt)

    # Stored doc_type labels are lower-cased, so normalise the reply the same
    # way (and drop stray quotes/punctuation) before it is compared to them.
    label = response.strip().strip("\"'`.").lower()
    return label if label in allowed_labels else "unknown"

In [None]:
# Example routing run: ask the LLM which document type should answer the query.
# NOTE(review): this cell sits before the per-page classification loop, so
# every doc_type in pdf_metadata_store is still "unknown" when the prompt is
# built — confirm whether it is meant to run after classification.
query = "What are the fees mentioned in the documents?"
predicted_doc_type = classify_query_llm(
    query=query,
    metadata_store=pdf_metadata_store,
)
print(predicted_doc_type)

In [None]:
def classify_doc_type_llm(text, max_chars=5000):
    """Classify a document's text into one of the known doc_type labels.

    Args:
        text: Document (page) text to classify. ``None`` or blank text is
            treated as unclassifiable.
        max_chars: Only the first ``max_chars`` characters are sent to the
            model, to avoid token overflow.

    Returns:
        One of ``"pay_stub"``, ``"loan_form"``, ``"resume"``, ``"contract"``,
        ``"w2"`` or ``"unknown"``. ``"unknown"`` is returned when there is no
        text to classify, when the LLM call fails, or when the reply is not
        one of the allowed labels.
    """
    allowed_labels = ("pay_stub", "loan_form", "resume", "contract", "w2", "unknown")

    # Truncate text to avoid token overflow
    truncated_text = (text or "")[:max_chars]

    # Nothing to classify (e.g. a blank or image-only page): skip the API call.
    if not truncated_text.strip():
        return "unknown"

    prompt = f"""
You are classifying a document into one of the following types:
["pay_stub", "loan_form", "resume", "contract", "w2", "unknown"]

Document content:
\"\"\"
{truncated_text}
\"\"\"

What is the best doc_type label for this document? Respond with only one of the labels above.
"""
    try:
        response = gemini_model(prompt)
        # Drop surrounding whitespace, quotes and trailing punctuation so a
        # reply like '"Contract".' still maps onto a known label.
        label = response.strip().strip("\"'`.").lower()
    except Exception as e:
        # Best-effort: one failed page must not abort a whole classification run.
        print("LLM failed:", e)
        return "unknown"

    # Never store a label outside the allowed set; retrieval matches on it.
    return label if label in allowed_labels else "unknown"

In [None]:
# Label every page record in place with the LLM's doc_type prediction.
for position, doc in enumerate(pdf_metadata_store, start=1):
    print(f"Classifying doc {position}...")
    predicted_label = classify_doc_type_llm(doc["text"])
    doc["doc_type"] = predicted_label
    print(predicted_label)
    # Optional: add time.sleep(1) here to avoid rate limiting.

In [None]:
def retrieve_files_by_doc_type(doc_type, metadata_store):
    """Return the metadata records whose ``doc_type`` matches ``doc_type``.

    The comparison ignores letter case and surrounding whitespace, so a label
    such as ``" Contract\\n"`` still matches records stored as ``"contract"``.

    Args:
        doc_type: Label to look for.
        metadata_store: Iterable of metadata dicts, each with a ``"doc_type"``
            key.

    Returns:
        A new list of the matching records, in their original order.
    """
    wanted = str(doc_type).strip().lower()
    return [
        doc
        for doc in metadata_store
        if str(doc["doc_type"]).strip().lower() == wanted
    ]

# Look up every page record whose doc_type equals the type predicted for the query.
matched_files = retrieve_files_by_doc_type(
    doc_type=predicted_doc_type,
    metadata_store=pdf_metadata_store,
)

# Print each matching record on its own line under a header.
print("Matched Documents:")
for matched_record in matched_files:
  print(matched_record)

In [None]:
import json

# Bundle the query, the predicted type and the matching page records.
result = {
    "query": query,
    "predicted_document_type": predicted_doc_type,
    "matched_documents": matched_files
}
# Print the serialized form: leaving json.dumps(...) as the cell's last
# expression displays the string's repr, with newlines shown as "\n", which
# defeats indent=2.
result_json = json.dumps(result, indent=2)
print(result_json)