In [None]:
# Install the notebook's Python dependencies (quiet mode, upgrading anything already present).
!pip install -q -U pymupdf llama-index-core llama-index-llms-llama-cpp llama-index-embeddings-huggingface llama-index-retrievers-bm25 langchain langchain-openai sentence-transformers google-generativeai
# System OpenGL library.
# NOTE(review): presumably required by one of the packages above on Colab - confirm it is still needed.
!apt-get install -y libgl1-mesa-glx
# LlamaIndex integration that provides `llama_index.llms.google_genai.GoogleGenAI`, imported in the next cell.
!pip install llama-index-llms-google-genai

In [None]:
import os
import re
import fitz
import torch
import numpy as np
from google.colab import files
from getpass import getpass
from collections import Counter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.schema import QueryBundle, NodeWithScore
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from llama_index.core import VectorStoreIndex, Document
from llama_index.llms.google_genai import GoogleGenAI

# Verify CUDA availability: report whether PyTorch can see a CUDA device and,
# if so, print the name of device 0. Purely informational - nothing below
# branches on the result.
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# 2. Document Upload & Setup

# Upload PDF documents. The result is keyed by filename, and every uploaded file
# is used: previously exactly four files were required (fewer raised IndexError,
# extra files were silently ignored).
uploaded = files.upload()
if not uploaded:
    raise ValueError("No files were uploaded; upload at least one PDF to continue.")

# Placeholder label -> filename. The labels are provisional; only the filenames
# are used below.
doc_paths = {
    f"Unknown {position}": filename
    for position, filename in enumerate(uploaded, start=1)
}

# Extract the text of every page of each PDF, keyed as "Doc-1", "Doc-2", ...
doc_texts = {}
for i, (doc_type, filename) in enumerate(doc_paths.items()):
    with fitz.open(filename) as doc:
        text = "\n".join(page.get_text() for page in doc)
    doc_texts[f"Doc-{i+1}"] = text
    print(f"Extracted {len(text.split())} words from {filename}")

In [None]:
# 3. Document Classification Components

# %%
# Set API keys. They are read with getpass so the values are not echoed into
# the notebook output, and stored in the process environment for later cells.
# OPENAI_API_KEY is read back when the query-rewriting ChatOpenAI client is built.
os.environ['OPENAI_API_KEY'] = getpass("Enter OpenAI API key: ")
# NOTE(review): presumably picked up from the environment by the GoogleGenAI client - confirm.
os.environ['GOOGLE_API_KEY'] = getpass("Enter Google API key: ")

def prepare_document_for_classification(text):
    """Summarise a document's text into the features used to build the classification prompt.

    Line-based cues (headers, tables) are collected from the raw text, because
    they depend on line breaks. Everything else is computed on a copy in which
    every whitespace run is collapsed to a single space.

    Args:
        text (str): Raw document text. ``None`` or ``""`` is treated as an empty document.

    Returns:
        dict: With keys
            ``first_part`` / ``middle_part`` / ``last_part``: up to 500-character
                excerpts from the start, centre and end of the normalised text;
            ``total_length``: length of the normalised text in characters;
            ``potential_headers``: up to 10 all-caps lines (5-50 characters), newline-joined;
            ``table_count``: number of runs of consecutive pipe-delimited rows;
            ``dates``, ``amounts``, ``names``: lists of regex matches;
            ``keywords``: the domain keywords that occur in the text;
            ``top_words``: mapping of the 10 most frequent lower-cased words to their counts.
    """
    raw_text = text or ""

    # Structural cues: scan line by line BEFORE whitespace is collapsed.
    # (Running these patterns on the normalised text could never match, since
    # normalisation removes every newline they rely on.)
    headers = []
    table_count = 0
    inside_table = False
    for line in raw_text.splitlines():
        stripped = line.strip()

        # A header is a line made only of capitals and spaces, 5-50 characters long.
        if len(headers) < 10 and re.fullmatch(r'[A-Z][A-Z\s]{3,48}[A-Z]', stripped):
            headers.append(stripped)

        # A table row looks like "| a | b |"; a run of consecutive rows is one table.
        is_table_row = re.fullmatch(r'\|.*\|.*\|', stripped) is not None
        if is_table_row and not inside_table:
            table_count += 1
        inside_table = is_table_row

    # Normalize text
    text = re.sub(r'\s+', ' ', raw_text).strip()
    doc_length = len(text)

    # Extract first, middle, and last portions (slicing clamps at the text bounds).
    first_part = text[:500]
    middle_start = max(0, doc_length // 2 - 250)
    middle_part = text[middle_start:middle_start + 500]
    last_part = text[max(0, doc_length - 500):]

    dates = re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text)
    amounts = re.findall(r'\$\s*\d+(?:,\d{3})*(?:\.\d{2})?', text)
    names = re.findall(r'\b([A-Z][a-z]+ [A-Z][a-z]+)\b', text)

    mortgage_keywords = ['mortgage', 'loan', 'interest', 'principal', 'amortization', 'lender', 'borrower', 'payment', 'contract', 'agreement', 'payslip', 'salary', 'income']
    lowered = text.lower()  # lower-case once instead of once per keyword
    keywords_found = [word for word in mortgage_keywords if word in lowered]

    top_words = dict(Counter(lowered.split()).most_common(10))

    return {
        "first_part": first_part,
        "middle_part": middle_part,
        "last_part": last_part,
        "total_length": doc_length,
        "potential_headers": "\n".join(headers),
        "table_count": table_count,
        "dates": dates,
        "amounts": amounts,
        "names": names,
        "keywords": keywords_found,
        "top_words": top_words
    }

In [None]:
def _summarize_matches(values, limit=20):
    """Return a comma-separated preview of at most `limit` distinct values, or 'None' if empty."""
    distinct = list(dict.fromkeys(values))  # de-duplicate, keep first-seen order
    if not distinct:
        return 'None'
    preview = ', '.join(distinct[:limit])
    remaining = len(distinct) - limit
    return f"{preview} (+{remaining} more)" if remaining > 0 else preview


def _best_keyword_category(tokens, keyword_map):
    """Return the category whose keywords overlap `tokens` the most, or None when nothing overlaps."""
    token_set = set(tokens)
    best_category, best_hits = None, 0
    for category, keywords in keyword_map.items():
        hits = sum(1 for kw in keywords if kw in token_set)
        if hits > best_hits:  # strict '>' keeps the earliest category on ties
            best_category, best_hits = category, hits
    return best_category


def classify_document(text, llm):
    """Classify a document into one of a fixed set of mortgage-related categories.

    Builds a prompt from excerpts and extracted features of `text`, asks `llm`
    for a category, and maps the reply onto a known category:

    1. exact match with a category name;
    2. a category name appearing anywhere in the reply (case-insensitive);
    3. the category sharing the most keywords with the reply;
    4. the category sharing the most keywords with the document itself;
    5. otherwise ``"Unknown"``.

    Args:
        text (str): Raw document text.
        llm: Object exposing ``complete(prompt)`` whose result has a ``.text`` attribute.

    Returns:
        str: One of the category names listed in ``categories``.
    """
    doc_info = prepare_document_for_classification(text)

    categories = [
        "Bank Statement", "Pay Slip", "Appraisal Report", "Loan Agreement",
        "Mortgage Contract", "Credit Report", "Employment Verification",
        "Tax Return", "Insurance Policy", "Title Report", "Unknown"
    ]

    # Dates/amounts/names can run to hundreds of matches in a long document;
    # only a bounded, de-duplicated preview goes into the prompt so it stays
    # within the model's context window.
    prompt = f"""You are a document classification expert. Classify this document into one of these categories:
    {', '.join(categories[:-1])} or {categories[-1]}

    Here's information extracted from the document:

    DOCUMENT START EXCERPT:
    {doc_info['first_part']}
    DOCUMENT START EXCERPT END

    DOCUMENT MIDDLE EXCERPT:
    {doc_info['middle_part']}
    DOCUMENT MIDDLE EXCERPT END

    DOCUMENT END EXCERPT:
    {doc_info['last_part']}
    DOCUMENT END EXCERPT END

    Total document length: {doc_info['total_length']} characters

    Additional Information:
    - Potential Headers: {doc_info['potential_headers']}
    - Number of Tables: {doc_info['table_count']}
    - Dates Found: {_summarize_matches(doc_info['dates'])}
    - Amounts Found: {_summarize_matches(doc_info['amounts'])}
    - Names Found: {_summarize_matches(doc_info['names'])}
    - Keywords: {', '.join(doc_info['keywords']) if doc_info['keywords'] else 'None'}
    - Top 10 Words: {', '.join([f'{k}: {v}' for k, v in doc_info['top_words'].items()])}

    IMPORTANT INSTRUCTION: Your response must be EXACTLY ONE of these options:
    {', '.join(categories)}

    Do not include any explanation, reasoning, or additional text. Respond with ONLY the category name.
    """

    response = llm.complete(prompt)
    raw_response = response.text.strip()

    # 1. The model followed instructions exactly.
    if raw_response in categories:
        return raw_response

    # 2. The category name is embedded in a longer reply.
    lowered_response = raw_response.lower()
    for category in categories:
        if category.lower() in lowered_response:
            return category

    keyword_map = {
        "Bank Statement": ['bank', 'statement', 'account', 'transaction'],
        "Pay Slip": ['pay', 'slip', 'salary', 'income', 'wage'],
        "Appraisal Report": ['appraisal', 'property', 'valuation', 'assessment'],
        "Loan Agreement": ['loan', 'agreement', 'borrower', 'lender'],
        "Mortgage Contract": ['mortgage', 'contract', 'deed', 'lien'],
        "Credit Report": ['credit', 'report', 'score', 'history'],
        "Employment Verification": ['employment', 'verification', 'job', 'employer'],
        "Tax Return": ['tax', 'return', 'irs', 'income'],
        "Insurance Policy": ['insurance', 'policy', 'coverage', 'premium'],
        "Title Report": ['title', 'report', 'ownership', 'lien']
    }

    # 3./4. Keyword fallback. The model's own words are consulted first and the
    # document's keywords only if the reply matches nothing. Categories are
    # ranked by overlap, so the result does not depend on dictionary order.
    # Tokenising with a regex strips punctuation stuck to words ("bank," -> "bank").
    response_tokens = re.findall(r'[a-z]+', lowered_response)
    return (
        _best_keyword_category(response_tokens, keyword_map)
        or _best_keyword_category(doc_info['keywords'], keyword_map)
        or "Unknown"
    )

In [None]:
# 4. Query Processing Components

# Query rewriter setup: a deterministic (temperature 0) GPT-4o chat client,
# authenticated with the OPENAI_API_KEY stored in the environment.
re_write_llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4o",
    max_tokens=4000,
    api_key=os.environ['OPENAI_API_KEY']
)

# Prompt text sent to the rewriter; `{original_query}` is the only placeholder.
query_rewrite_template = """You are an AI assistant tasked with reformulating user queries to improve retrieval in a RAG system.
Given the original query, rewrite it to be more specific, detailed, and likely to retrieve relevant information.

Original query: {original_query}

Rewritten query:"""

query_rewrite_prompt = PromptTemplate(
    input_variables=["original_query"],
    template=query_rewrite_template
)

# Chain: fill the prompt, then send it to the chat model. Used by `rewrite_query`.
query_rewriter = query_rewrite_prompt | re_write_llm

def rewrite_query(original_query):
    """
    Produce a retrieval-friendly reformulation of a user query.

    The query is passed through the module-level ``query_rewriter`` chain and
    the text content of the model's reply is returned.

    Args:
    original_query (str): The query exactly as the user entered it

    Returns:
    str: The reformulated query text
    """
    return query_rewriter.invoke(original_query).content

In [None]:
def _match_query_category(raw_response, categories, keyword_map):
    """Map a free-form LLM reply onto one of `categories`.

    Tries, in order: exact match, a category name contained in the reply
    (case-insensitive), then the category sharing the most keywords with the
    reply. Returns "Unknown" when nothing matches.
    """
    if raw_response in categories:
        return raw_response

    lowered = raw_response.lower()
    for category in categories:
        if category.lower() in lowered:
            return category

    # Regex tokenisation drops punctuation attached to words ("tax," -> "tax").
    tokens = set(re.findall(r'[a-z]+', lowered))
    best_category, best_hits = "Unknown", 0
    for category, keywords in keyword_map.items():
        hits = sum(1 for kw in keywords if kw in tokens)
        if hits > best_hits:  # strict '>' keeps the earliest category on ties
            best_category, best_hits = category, hits
    return best_category


def route_query(query, llm, classified_docs):
    """Select the documents a query should be answered from.

    Asks `llm` which document category the query relates to, resolves the reply
    to a known category, and returns the documents classified under it. If no
    document has that category, every document is returned as a fallback.

    Args:
        query (str): The (possibly rewritten) user query.
        llm: Object exposing ``complete(prompt)`` whose result has a ``.text`` attribute.
        classified_docs (dict): Maps a document id to a dict with at least the
            keys ``"doc_type"`` (category name) and ``"document"``.

    Returns:
        list: The ``"document"`` values of the matching entries, or of all
        entries when none match.
    """
    # Expanded list of categories for mortgage-related documents
    categories = [
        "Bank Statement", "Pay Slip", "Appraisal Report", "Loan Agreement",
        "Mortgage Contract", "Credit Report", "Employment Verification",
        "Tax Return", "Insurance Policy", "Title Report", "Unknown"
    ]

    # Check which document type the query is related to
    prompt = f"""
    Classify the following question into one of these categories:
    {', '.join(categories[:-1])} or {categories[-1]}

    IMPORTANT INSTRUCTION: Your response must be EXACTLY ONE of these options:
    {', '.join(categories)}

    Do not include any explanation, reasoning, or additional text. Respond with ONLY the category name.

    Query: {query}
    """

    raw_response = llm.complete(prompt).text.strip()

    # Keys must be real category names: the previous "Contract" key never
    # matched any classified document's "Mortgage Contract" label.
    keyword_map = {
        "Bank Statement": ['bank', 'statement', 'account', 'transaction'],
        "Pay Slip": ['pay', 'slip', 'salary', 'income', 'wage'],
        "Appraisal Report": ['appraisal', 'property', 'valuation', 'assessment'],
        "Loan Agreement": ['loan', 'agreement', 'borrower', 'lender'],
        "Mortgage Contract": ['agreement', 'contract', 'mortgage', 'deed', 'lien'],
        "Credit Report": ['credit', 'report', 'score', 'history'],
        "Employment Verification": ['employment', 'verification', 'job', 'employer'],
        "Tax Return": ['tax', 'return', 'irs', 'income'],
        "Insurance Policy": ['insurance', 'policy', 'coverage', 'premium'],
        "Title Report": ['title', 'report', 'ownership', 'lien']
    }

    # Resolve the reply in strict priority order. The keyword fallback only
    # runs when no category name was found, so it can no longer overwrite a
    # correct match (e.g. "Title Report" being turned into "Credit Report").
    doc_type = _match_query_category(raw_response, categories, keyword_map)

    # Filter documents based on the determined document type
    relevant_docs = [
        doc_data["document"] for doc_data in classified_docs.values()
        if doc_data["doc_type"] == doc_type
    ]

    # Debug print to check if any documents were found
    print(f"Number of relevant documents found for query '{query}': {len(relevant_docs)}")

    # Fallback mechanism: If no relevant documents are found, use all documents
    if not relevant_docs:
        print("No relevant documents found. Using all documents as fallback.")
        relevant_docs = [doc_data["document"] for doc_data in classified_docs.values()]

    return relevant_docs

In [None]:
# 5. Retrieval & Ranking Setup

# Download model for LlamaCPP
!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O mistral-7b-instruct.gguf

# Initialize models
llm = LlamaCPP(
    model_path="mistral-7b-instruct.gguf",
    temperature=0.0,
    max_new_tokens=30,
    context_window=4096,
    model_kwargs={"n_gpu_layers": 1, "verbose": False}
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

def semantic_chunk(docs, embed_model):
    """Split LlamaIndex documents into semantically coherent nodes.

    Each document is first cut into chunks of at most 512 tokens by a
    ``SentenceSplitter``; those chunks are then re-split by a
    ``SemanticSplitterNodeParser`` driven by `embed_model`.

    Args:
        docs: Sequence of LlamaIndex ``Document`` objects. May be empty.
        embed_model: Embedding model passed to the semantic splitter.

    Returns:
        tuple: ``(nodes, top_k_bm25)`` where ``nodes`` is the list of resulting
        nodes and ``top_k_bm25`` is ``min(3, len(nodes))`` - a BM25 top-k that
        never exceeds the number of available nodes.
    """
    # Nothing to split: return early instead of failing on docs[0] below.
    if not docs:
        print("Number of nodes after splitting: 0")
        return [], 0

    # Create a sentence splitter for initial text splitting
    sentence_splitter = SentenceSplitter(chunk_size=512)

    # Create a semantic splitter for more meaningful chunks
    semantic_splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=95,
        embed_model=embed_model
    )

    nodes = []
    for doc in docs:
        # Split document into sentence-bounded chunks first
        split_sentences = sentence_splitter.get_nodes_from_documents([doc])

        # Then apply semantic splitting to those chunks
        nodes.extend(semantic_splitter.get_nodes_from_documents(split_sentences))

    # Report every distinct document type that was chunked (the input can mix
    # types), tolerating documents without a "doc_type" entry.
    doc_types = list(dict.fromkeys(
        str(doc.metadata.get('doc_type', 'Unknown')) for doc in docs
    ))
    print(f"Number of {', '.join(doc_types)} nodes after splitting: {len(nodes)}")
    top_k_bm25 = min(3, len(nodes))
    return nodes, top_k_bm25

In [None]:
from typing import List

class HybridRetriever(BaseRetriever):
    """Retriever that fuses vector-search and BM25 results into one ranked list.

    Each retriever's scores are min-max normalised to [0, 1], weighted
    (``vector_weight`` for the vector retriever, ``1 - vector_weight`` for
    BM25) and summed per node id. A node returned by both retrievers is
    therefore ranked by its combined score, and every returned node carries
    that combined score.

    Args:
        vector_retriever: Retriever exposing ``retrieve(query_bundle)``.
        bm25_retriever: Retriever exposing ``retrieve(query_bundle)``.
        vector_weight: Weight of the vector scores; BM25 gets ``1 - vector_weight``.
        top_k_per_retriever: Maximum results taken from each retriever.
        top_n: Maximum number of fused results returned.
        dedup_threshold: Stored for interface compatibility; not used by the
            current implementation (de-duplication is by exact node id).
    """

    def __init__(
        self,
        vector_retriever,
        bm25_retriever,
        vector_weight: float = 0.5,
        top_k_per_retriever: int = 10,
        top_n: int = 10,
        dedup_threshold: float = 0.9,
    ):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        self.vector_weight = vector_weight
        self.top_k_per_retriever = top_k_per_retriever
        self.top_n = top_n
        self.dedup_threshold = dedup_threshold
        super().__init__()

    def _normalize_scores(self, nodes: List[NodeWithScore]) -> List[NodeWithScore]:
        """Return copies of `nodes` with scores min-max scaled to [0, 1].

        A missing score (``None``) is treated as 0.0. When every score is equal
        (including the single-result case) each node gets 1.0, so a lone hit
        still contributes to the fused ranking. The input nodes are not mutated.
        """
        if not nodes:
            return []
        raw_scores = [0.0 if item.score is None else item.score for item in nodes]
        low, high = min(raw_scores), max(raw_scores)
        spread = high - low
        normalized = []
        for item, raw in zip(nodes, raw_scores):
            value = 1.0 if spread == 0 else (raw - low) / spread
            normalized.append(NodeWithScore(node=item.node, score=value))
        return normalized

    def _deduplicate_nodes(self, nodes: List[NodeWithScore]) -> List[NodeWithScore]:
        """Drop repeated node ids, keeping the first occurrence and the input order."""
        deduped = []
        seen_ids = set()
        for node in nodes:
            if node.node.node_id not in seen_ids:
                seen_ids.add(node.node.node_id)
                deduped.append(node)
        return deduped

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run both retrievers concurrently and return the top fused results."""
        # Local import: the notebook's import cell does not provide it.
        from concurrent.futures import ThreadPoolExecutor

        # Futures (unlike bare threads) re-raise a retriever's exception in
        # .result(), so a failing retriever is reported instead of silently
        # producing an empty result list.
        with ThreadPoolExecutor(max_workers=2) as pool:
            vector_future = pool.submit(self.vector_retriever.retrieve, query_bundle)
            bm25_future = pool.submit(self.bm25_retriever.retrieve, query_bundle)
            vector_results = vector_future.result()[:self.top_k_per_retriever]
            bm25_results = bm25_future.result()[:self.top_k_per_retriever]

        # Normalise each result list separately so the two score scales are comparable.
        vector_nodes = self._normalize_scores(self._deduplicate_nodes(vector_results))
        bm25_nodes = self._normalize_scores(self._deduplicate_nodes(bm25_results))

        # Weighted sum per node id: a node found by both retrievers accumulates
        # both contributions.
        fused = {}
        weighted_sources = (
            (self.vector_weight, vector_nodes),
            (1 - self.vector_weight, bm25_nodes),
        )
        for weight, source_nodes in weighted_sources:
            for item in source_nodes:
                node_id = item.node.node_id
                contribution = weight * item.score
                if node_id in fused:
                    fused[node_id].score += contribution
                else:
                    fused[node_id] = NodeWithScore(node=item.node, score=contribution)

        # Highest combined score first, then apply the final top_n limit.
        ranked = sorted(fused.values(), key=lambda item: item.score, reverse=True)
        return ranked[:self.top_n]

In [None]:
# 6. Document Classification Execution

# Classify documents: run `classify_document` on each extracted text using
# whichever model `llm` is bound to when this cell runs, and collect the
# results in `classified_docs`, keyed by document id ("Doc-1", ...).
print("CLASSIFYING DOCUMENTS...")
classified_docs = {}
for doc_id, text in doc_texts.items():
    doc_type = classify_document(text, llm)
    # Each entry keeps the raw text, the predicted type, and a LlamaIndex
    # Document carrying that type in its metadata; `route_query` reads
    # "doc_type" and "document" from these entries.
    classified_docs[doc_id] = {
        "text": text,
        "doc_type": doc_type,
        "document": Document(text=text, metadata={"doc_type": doc_type})
    }
    print(f"Classified {doc_id} as: {doc_type}")

In [None]:
# 7. Query Execution

# Process query: read it from the user and have the rewriter expand it.
print("\nPROCESSING QUERY...")
original_query = input("Enter your query: ")
rewritten_query = rewrite_query(original_query)
print(f"Original: {original_query}\nRewritten: {rewritten_query}")

# Route query: pick the documents whose type matches the query.
print("\nQUERY ROUTING...")
query_route_results = route_query(rewritten_query, llm, classified_docs)

# Prepare retrieval system
nodes, top_k_bm25 = semantic_chunk(query_route_results, embed_model)
if not nodes:
    raise ValueError("No text chunks were produced from the routed documents; nothing to retrieve from.")

docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

# Build the vector index from the SAME semantic chunks that BM25 searches.
# Indexing the raw documents instead re-chunks them with different node ids,
# so a chunk found by both retrievers could never be recognised as one node.
vector_index = VectorStoreIndex(nodes, embed_model=embed_model)
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
# Use the top-k computed by semantic_chunk so BM25 never asks for more results
# than there are nodes.
bm25_retriever = BM25Retriever.from_defaults(docstore=docstore, similarity_top_k=top_k_bm25)

hybrid_retriever = HybridRetriever(
    vector_retriever=vector_retriever,
    bm25_retriever=bm25_retriever
)

# Final query execution
# The answer model gets its own name: rebinding `llm` here would make any
# re-run of the classification/routing steps silently use a different model.
answer_llm = GoogleGenAI(model="gemini-1.5-flash")
query_engine = RetrieverQueryEngine.from_args(hybrid_retriever, llm=answer_llm)
response = query_engine.query(rewritten_query)

print("\nFINAL RESPONSE:")
print(response)