In [19]:
from llama_index.core import VectorStoreIndex, Document
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.core.node_parser import SentenceSplitter  # For chunking
from PyPDF2 import PdfReader
from pathlib import Path
import os
from dotenv import load_dotenv
import time  # For rate limiting

# Load environment variables
load_dotenv()

True

In [20]:
# 1. Process PDFs with chunking
def get_pdf_docs(pdf_folder="financial_pdfs", chunk_size=512):
    """Load every PDF in ``pdf_folder`` and return its text as chunked Documents.

    Args:
        pdf_folder: Directory searched (non-recursively) for ``*.pdf`` files.
        chunk_size: Chunk size handed to ``SentenceSplitter``.

    Returns:
        A list of ``Document`` objects, one per chunk. Each chunk carries the
        metadata produced by the splitter for its parent document, which is
        created with ``source`` (the PDF file name) and ``type`` ("10-K").
        PDFs that yield no extractable text contribute no chunks.
    """
    docs = []
    splitter = SentenceSplitter(chunk_size=chunk_size)

    # glob() order depends on the file system; sort so chunk order is reproducible.
    for pdf_file in sorted(Path(pdf_folder).glob("*.pdf")):
        reader = PdfReader(pdf_file)
        # Coerce a missing page text to "" so a text-less page (e.g. a scanned
        # image) cannot make join() raise TypeError.
        full_text = "\n".join((page.extract_text() or "") for page in reader.pages)

        # Nothing to index for a PDF without any extractable text.
        if not full_text.strip():
            continue

        # Create a base document
        base_doc = Document(text=full_text, metadata={"source": pdf_file.name, "type": "10-K"})

        # Split the text into manageable chunks and convert the nodes back to
        # Document objects; copy the metadata so chunks do not share one dict.
        docs.extend(
            Document(text=node.text, metadata=dict(node.metadata))
            for node in splitter.get_nodes_from_documents([base_doc])
        )

    return docs

In [21]:
# 2. Initialize the Groq LLM client (request pacing is done in the interactive loop, not here)
def initialize_groq(model="mixtral-8x7b-32768"):
    """Create a Groq LLM client using the ``GROQ_API_KEY`` environment variable.

    Args:
        model: Groq model identifier. Defaults to the model this notebook was
            written against.
            NOTE(review): confirm this id is still served by Groq before relying on it.

    Returns:
        A ``Groq`` LLM instance configured with the model and API key.

    Raises:
        ValueError: If ``GROQ_API_KEY`` is unset or blank.
    """
    # Treat a whitespace-only value like a missing key so the failure happens
    # here with a clear message rather than later at the API.
    groq_api_key = (os.getenv("GROQ_API_KEY") or "").strip()
    if not groq_api_key:
        raise ValueError("GROQ_API_KEY not found in .env file")
    return Groq(model=model, api_key=groq_api_key)


In [29]:
def generate_answer(llm, query, contexts, max_context_chars=4000, max_tokens=500, temperature=0.3):
    """Ask the LLM to answer ``query`` using the retrieved ``contexts``.

    Args:
        llm: Object exposing ``complete(prompt=..., max_tokens=..., temperature=...)``
            whose result has a ``text`` attribute.
        query: The user's question, inserted verbatim into the prompt.
        contexts: Iterable of context strings. ``None`` and empty entries are
            skipped; ``None`` for the whole argument means "no context".
        max_context_chars: Character cap applied to the newline-joined context
            (``None`` disables the cap).
        max_tokens: Forwarded to ``llm.complete``.
        temperature: Forwarded to ``llm.complete``.

    Returns:
        The stripped response text, or ``"Error generating answer: <message>"``
        if the LLM call (or reading its result) raises.
    """
    # Drop missing/empty chunks so join() cannot raise on a None entry.
    usable_contexts = [chunk for chunk in (contexts or []) if chunk]
    combined_context = "\n".join(usable_contexts)[:max_context_chars]
    prompt = f"""You are a financial analyst. Answer the following question based on the provided context.
    
    Question: {query}
    
    Relevant Context: {combined_context}
    
    Instructions:
    - If the question involves comparisons, rankings, or multiple data points, format the answer as a complete table with all requested data.
    - If the question requires step-by-step explanations or lists, use numbered or bullet points.
    - If data is missing, explicitly state "Data not available in the provided context."
    - Do not use placeholders like X, Y, or Z. If data is missing, leave the cell blank and add a note.
    - Keep the answer concise, accurate, and well-formatted.
    
    Answer:"""

    try:
        response = llm.complete(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.text.strip()
    except Exception as e:
        # Deliberate best-effort: the interactive caller prints whatever is
        # returned, so report the failure as text instead of raising.
        return f"Error generating answer: {str(e)}"

In [30]:
# 4. Interactive hybrid search
def main():
    """Run an interactive question-answering loop over the chunked PDFs.

    Builds a dense (embedding) retriever and a BM25 retriever over the chunks
    returned by ``get_pdf_docs()``, then repeatedly reads a question from
    stdin, retrieves context from both retrievers, and prints the answer from
    ``generate_answer``. The loop ends on 'exit'/'quit', EOF, or Ctrl-C.
    """
    # Load documents with chunking
    financial_docs = get_pdf_docs()
    if not financial_docs:
        # Neither retriever has anything to work with on an empty corpus.
        print("No PDF text found to index; add PDFs to the input folder and retry.")
        return

    # Never ask a retriever for more hits than there are chunks.
    top_k = min(3, len(financial_docs))

    # Setup retrievers
    embed_model = HuggingFaceEmbedding("BAAI/bge-small-en-v1.5")
    vector_index = VectorStoreIndex.from_documents(financial_docs, embed_model=embed_model)

    vector_retriever = vector_index.as_retriever(similarity_top_k=top_k)
    bm25_retriever = BM25Retriever.from_defaults(nodes=financial_docs, similarity_top_k=top_k)

    # Initialize Groq
    llm = initialize_groq()

    # Interactive loop
    while True:
        try:
            query = input("\nEnter your financial question (type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session quietly.
            break
        if not query:
            # Don't spend an API call on an empty question.
            continue
        if query.lower() in ('exit', 'quit'):
            break

        # Retrieve results
        vector_results = vector_retriever.retrieve(query)
        bm25_results = bm25_retriever.retrieve(query)

        # Combine and deduplicate.
        # - Alternate between the two rankings: generate_answer truncates the
        #   joined context, so listing every vector hit first could cut all
        #   BM25 hits.
        # - De-duplicate on chunk text: the two retrievers are built through
        #   different paths, so their node ids are not guaranteed to match.
        contexts = []
        seen_texts = set()
        for rank in range(max(len(vector_results), len(bm25_results))):
            for results in (vector_results, bm25_results):
                if rank >= len(results):
                    continue
                text = results[rank].node.text
                if text in seen_texts:
                    continue
                seen_texts.add(text)
                contexts.append(text)

        # Generate and display answer
        answer = generate_answer(llm, query, contexts)
        print("\nAI Answer:")
        print(answer)
        time.sleep(1)  # Rate limit protection

# Start the interactive session only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


AI Answer:
The key findings about household income in 2023 are as follows:

1. A sizeable share of adults reported that their family's monthly income increased in 2023 compared to the previous year. However, an even greater share of adults indicated that their spending increased from the prior year.
2. The proportion of adults who spent less than their income in the month before the survey remained lower than the level it had been before the pandemic, suggesting that fewer adults have margin in their family budgets.
3. Nineteen percent of adults had a family income below $25,000, and 37 percent had a family income of $100,000 or more.
4. Although labor earnings were the most common source of income, many people had other sources of income. Two-thirds of adults received labor income, and 55 percent of all adults received non-labor income in 2023.

Here is a table summarizing the financial well-being of households by income brackets in 2023:

| Income Bracket   | Financial Well-being (d