<a href="https://colab.research.google.com/github/Poojav21/FinQAgent-Finance-Query-Agent-/blob/main/Financial_RAG_system_with_agent_capabilities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r requirements.txt

Collecting PyPDF2>=1.26.0 (from -r requirements.txt (line 11))
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting chromadb>=0.3.21 (from -r requirements.txt (line 13))
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting unstructured>=0.6.7 (from -r requirements.txt (line 14))
  Downloading unstructured-0.18.15-py3-none-any.whl.metadata (24 kB)
Collecting pdf2image>=1.16.0 (from -r requirements.txt (line 15))
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pdfplumber>=0.5.28 (from -r requirements.txt (line 17))
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF>=1.18.0 (from -r requirements.txt (line 18))
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu (from -r r

In [None]:
import os
import json
import re
from typing import List, Dict, Any, Tuple
import numpy as np
from pathlib import Path
import logging
import google.generativeai as genai
import faiss
import pickle
import PyPDF2
from sentence_transformers import SentenceTransformer
import torch

# Configure logging: show INFO and above on the root logger, and create the
# module-level logger that the classes defined in later cells write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
class Document:
    """One chunk of filing text, tagged with the company and year it belongs to."""

    def __init__(self, content: str, company: str, year: str, chunk_id: str = None):
        # A missing (or otherwise falsy) id falls back to "<company>_<year>".
        if not chunk_id:
            chunk_id = f"{company}_{year}"

        self.content, self.company, self.year = content, company, year
        self.chunk_id = chunk_id

class PDFProcessor:
    """Turn a directory tree of PDFs into overlapping text chunks.

    The expected layout is ``<data_dir>/<COMPANY>/<YEAR>.pdf``: the directory
    name becomes ``Document.company`` and the file stem becomes ``Document.year``.
    """

    def __init__(self, data_dir: str = "data"):
        self.data_dir = Path(data_dir)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Return the cleaned text of every page in ``file_path``.

        Pages for which PyPDF2 yields no text are skipped. Runs of whitespace
        are collapsed to a single space and control characters are removed.

        Returns:
            The stripped text, or ``""`` if the file could not be read or
            parsed (the error is logged, not raised).
        """
        try:
            page_texts = []
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)

            text = "\n".join(page_texts)

            # Collapse all whitespace (including newlines/tabs) to single spaces.
            text = re.sub(r'\s+', ' ', text)
            # Strip C0/C1 control characters only. The upper bound is \x9f, not
            # \xff: U+00A0-U+00FF are printable Latin-1 characters (e.g. the
            # pound/yen signs and accented letters) that must survive cleaning.
            text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

            return text.strip()

        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return ""

    def chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
        """Split ``text`` into overlapping chunks, preferring sentence boundaries.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk.
            overlap: Number of trailing characters of one chunk that are
                repeated at the start of the next.

        Returns:
            ``[]`` for empty text, ``[text]`` when the text fits in one chunk,
            otherwise the list of stripped chunks longer than 100 characters.

        Raises:
            ValueError: If the text needs splitting and ``chunk_size`` is not
                positive or ``overlap`` is not in ``[0, chunk_size)``; such
                values would prevent the window from ever advancing.
        """
        if not text:
            return []
        if len(text) <= chunk_size:
            return [text]

        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size

            # Try to break at a sentence boundary in the last 200 characters of
            # the window. The search never starts before ``start + overlap``,
            # so the next ``start`` (end - overlap) always moves forward.
            if end < len(text):
                window_start = max(end - 200, start + overlap)
                sentence_end = text.rfind('.', window_start, end)
                if sentence_end > start:
                    end = sentence_end + 1

            chunk = text[start:end].strip()
            if len(chunk) > 100:  # Only keep substantial chunks
                chunks.append(chunk)

            if end >= len(text):
                break

            start = end - overlap

        return chunks

    def process_all_pdfs(self) -> List[Document]:
        """Extract and chunk every ``<data_dir>/<COMPANY>/<YEAR>.pdf``.

        Companies and files are visited in sorted order so chunk ids and the
        order of the returned list are the same on every run.

        Returns:
            One ``Document`` per chunk; an empty list if ``data_dir`` does not
            exist (logged) or no PDF produced any text.
        """
        documents = []

        if not self.data_dir.is_dir():
            logger.error(f"Data directory not found: {self.data_dir}")
            return documents

        # Expected structure: data/COMPANY/YEAR.pdf
        for company_dir in sorted(self.data_dir.iterdir()):
            if not company_dir.is_dir():
                continue

            company = company_dir.name
            logger.info(f"Processing company: {company}")

            for pdf_file in sorted(company_dir.glob("*.pdf")):
                year = pdf_file.stem  # Filename without extension

                logger.info(f"Processing {company} {year}...")

                text = self.extract_text_from_pdf(str(pdf_file))
                if not text:
                    continue

                chunks = self.chunk_text(text)
                logger.info(f"Created {len(chunks)} chunks for {company} {year}")

                for i, chunk in enumerate(chunks):
                    documents.append(
                        Document(
                            content=chunk,
                            company=company,
                            year=year,
                            chunk_id=f"{company}_{year}_chunk_{i}"
                        )
                    )

        logger.info(f"Total documents created: {len(documents)}")
        return documents


In [None]:
class TransformerEmbedder:
    """Wrapper around a SentenceTransformer model that returns float32 vectors."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Load the named SentenceTransformer model and record its output size.
        The default, all-MiniLM-L6-v2, is fast, good quality, 384 dimensions.
        """
        logger.info(f"Loading transformer model: {model_name}")
        self.model_name = model_name

        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()

        logger.info(f"Model loaded. Embedding dimension: {self.dimension}")

    def get_embedding(self, text: str) -> np.ndarray:
        """Embed one text; on any failure log it and return a zero vector."""
        try:
            # Keep at most the first 8000 characters (models have token limits);
            # slicing a shorter string leaves it untouched.
            clipped = text[:8000]
            vector = self.model.encode(clipped, convert_to_numpy=True)
            return vector.astype(np.float32)
        except Exception as e:
            logger.error(f"Error getting embedding: {e}")
            return np.zeros(self.dimension, dtype=np.float32)

    def get_batch_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed several texts in one model call; zero vectors on failure."""
        try:
            # Same 8000-character cap as get_embedding, applied per text.
            clipped_texts = [item[:8000] for item in texts]
            vectors = self.model.encode(
                clipped_texts,
                convert_to_numpy=True,
                show_progress_bar=True
            )
            return vectors.astype(np.float32)
        except Exception as e:
            logger.error(f"Error getting batch embeddings: {e}")
            # One zero row per requested text.
            return np.zeros((len(texts), self.dimension), dtype=np.float32)

In [None]:
class FAISSVectorStore:
    """FAISS-based vector store with transformer embeddings.

    Vectors are L2-normalised before being added to an inner-product index, so
    search scores are cosine similarities. ``self.documents[i]`` is the document
    whose vector sits at row ``i`` of the index; the two must stay in step.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.embedder = TransformerEmbedder(model_name)
        self.dimension = self.embedder.dimension
        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product for cosine similarity
        self.documents: List[Document] = []

    def add_documents(self, documents: List[Document]):
        """Embed ``documents`` in batches of 32 and append them to the index."""
        if not documents:
            return

        logger.info(f"Creating embeddings for {len(documents)} documents...")

        # Extract texts for batch processing
        texts = [doc.content for doc in documents]

        batch_size = 32
        total_batches = (len(texts) + batch_size - 1) // batch_size
        all_embeddings = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            logger.info(f"Processing batch {i//batch_size + 1}/{total_batches}")

            batch_embeddings = self.embedder.get_batch_embeddings(batch_texts)

            # Normalize for cosine similarity
            norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
            norms[norms == 0] = 1  # Avoid division by zero
            batch_embeddings = batch_embeddings / norms

            all_embeddings.append(batch_embeddings)

        # Combine all embeddings
        embeddings_array = np.vstack(all_embeddings)

        # Index rows and the documents list are extended together so that
        # row i of the index always corresponds to self.documents[i].
        self.index.add(embeddings_array)
        self.documents.extend(documents)

        logger.info(f"Added {len(documents)} documents to vector store")

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, score)`` pairs most similar to ``query``.

        Returns an empty list when the store is empty or ``k`` is not positive.
        """
        if len(self.documents) == 0 or k <= 0:
            return []

        # Get query embedding
        query_embedding = self.embedder.get_embedding(query)

        # Normalize
        norm = np.linalg.norm(query_embedding)
        if norm > 0:
            query_embedding = query_embedding / norm

        # Never ask for more neighbours than there are documents.
        scores, indices = self.index.search(
            query_embedding.reshape(1, -1),
            min(k, len(self.documents))
        )

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # -1 marks an empty result slot; also reject anything outside the
            # documents list so a mismatched index cannot raise here.
            if 0 <= idx < len(self.documents):
                results.append((self.documents[idx], float(score)))

        return results

    def save(self, path: str):
        """Write the index to ``<path>.faiss`` and the documents to ``<path>.pkl``."""
        # Save FAISS index
        faiss.write_index(self.index, f"{path}.faiss")

        # Save documents and model info
        store_data = {
            'documents': self.documents,
            'model_name': self.embedder.model_name,
            'dimension': self.dimension
        }

        with open(f"{path}.pkl", 'wb') as f:
            pickle.dump(store_data, f)

        logger.info(f"Saved vector store to {path}")

    def load(self, path: str):
        """Load a store previously written by ``save``.

        Everything is read and validated into local variables first and only
        then assigned to ``self``; a failed load therefore leaves the store
        exactly as it was, with index and documents still in step.

        Returns:
            ``True`` on success, ``False`` if anything went wrong (logged).
        """
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only load
            # files this notebook wrote itself, never untrusted ones.
            with open(f"{path}.pkl", 'rb') as f:
                store_data = pickle.load(f)

            documents = store_data['documents']
            stored_model = store_data.get('model_name', 'all-MiniLM-L6-v2')

            index = faiss.read_index(f"{path}.faiss")
            if index.ntotal != len(documents):
                raise ValueError(
                    f"index holds {index.ntotal} vectors but {len(documents)} documents were stored"
                )

            # Queries must be embedded with the model that built the index.
            embedder = self.embedder
            if stored_model != embedder.model_name:
                logger.info(f"Reinitializing embedder with stored model: {stored_model}")
                embedder = TransformerEmbedder(stored_model)

            if index.d != embedder.dimension:
                raise ValueError(
                    f"index dimension {index.d} does not match embedder dimension {embedder.dimension}"
                )

        except Exception as e:
            logger.error(f"Error loading vector store: {e}")
            return False

        # Commit only after every step above succeeded.
        self.documents = documents
        self.embedder = embedder
        self.dimension = embedder.dimension
        self.index = index

        logger.info(f"Loaded vector store with {len(self.documents)} documents")
        return True

In [None]:
class SimpleRAG:
    """Retrieval-augmented QA over 10-K chunks.

    Retrieval uses the local FAISS store with transformer embeddings; Gemini is
    used for answer generation only.
    """

    def __init__(self, data_dir: str = "data", embedding_model: str = "all-MiniLM-L6-v2",
                 api_key: str = None):
        """
        Args:
            data_dir: Root folder holding ``COMPANY/YEAR.pdf`` files; the vector
                store is saved there too.
            embedding_model: SentenceTransformer model name used for retrieval.
            api_key: Gemini API key. When omitted, the ``GOOGLE_API_KEY`` and
                then ``GEMINI_API_KEY`` environment variables are consulted.

        Raises:
            ValueError: If no API key is supplied or found in the environment.
        """
        # Secrets must never be committed in the notebook: take the key from
        # the caller or the environment, and fail fast (before loading the
        # embedding model) when it is missing.
        api_key = api_key or os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError(
                "No Gemini API key found. Pass api_key=... or set the "
                "GOOGLE_API_KEY environment variable."
            )

        self.data_dir = data_dir
        self.embedding_model = embedding_model
        self.pdf_processor = PDFProcessor(data_dir)
        self.vector_store = FAISSVectorStore(embedding_model)

        # Initialize Gemini for generation only
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.5-flash')  # Example model

    def setup(self, force_rebuild: bool = False):
        """Load the saved vector store, or build and save a new one from the PDFs.

        Args:
            force_rebuild: Skip loading and always re-process the PDFs.

        Raises:
            RuntimeError: If a rebuild is needed but no documents were produced.
        """
        vector_store_path = Path(self.data_dir) / "vector_store"

        # Try to load existing vector store
        if not force_rebuild and self.vector_store.load(str(vector_store_path)):
            logger.info("Loaded existing vector store")
            return

        # Process documents and build new vector store
        logger.info("Building new vector store...")
        documents = self.pdf_processor.process_all_pdfs()

        if not documents:
            raise RuntimeError(
                f"No documents found in {self.data_dir}. "
                "Make sure you have PDFs organized as: data/COMPANY/YEAR.pdf"
            )

        self.vector_store.add_documents(documents)
        self.vector_store.save(str(vector_store_path))

        logger.info("Setup complete!")

    def query(self, question: str) -> Dict[str, Any]:
        """Answer ``question`` from the three best-matching chunks.

        Returns:
            A dict with ``question``, ``answer`` and ``sources`` (company, year,
            similarity score and a 200-character preview per chunk used). If
            generation fails, ``answer`` is an apology string and the error is
            logged.
        """
        # Search for relevant documents
        results = self.vector_store.search(question, k=5)

        if not results:
            return {
                "question": question,
                "answer": "No relevant documents found.",
                "sources": []
            }

        # Prepare context
        context_parts = []
        sources = []

        for doc, score in results[:3]:  # Use top 3 results
            context_parts.append(f"Company: {doc.company}, Year: {doc.year}\n{doc.content}")
            sources.append({
                "company": doc.company,
                "year": doc.year,
                "score": score,
                "preview": doc.content[:200] + "..."
            })

        # Put a ruled separator BETWEEN excerpts. The whole separator has to be
        # the join string; without the parentheses `+` binds after `.join`, so
        # the rule would appear once before the first excerpt only.
        separator = "\n\n" + "=" * 50 + "\n\n"
        context = separator.join(context_parts)

        # Generate answer
        prompt = f"""
Based on the following financial documents, please answer the question accurately.

Question: {question}

Context from 10-K filings:
{context}

Instructions:
- Provide a specific, accurate answer based on the context
- Include specific numbers, percentages, or financial figures when available
- Mention the company and year for any data you reference
- If the information is not available in the context, say so clearly
- Be concise but comprehensive

Answer:
"""

        try:
            response = self.model.generate_content(prompt)
            answer = response.text.strip()
        except Exception as e:
            logger.error(f"Error generating answer: {e}")
            answer = "Sorry, I encountered an error while generating the answer."

        return {
            "question": question,
            "answer": answer,
            "sources": sources
        }

    def get_stats(self) -> Dict[str, Any]:
        """Summarise the loaded store, or return ``{"error": ...}`` if it is empty."""
        if not self.vector_store.documents:
            return {"error": "No documents loaded"}

        companies = {doc.company for doc in self.vector_store.documents}
        years = {doc.year for doc in self.vector_store.documents}

        return {
            "total_documents": len(self.vector_store.documents),
            "companies": sorted(companies),
            "years": sorted(years),
            "embedding_model": self.embedding_model,
            "embedding_dimension": self.vector_store.dimension
        }

In [None]:
class FinancialAgent:
    """Agent that decomposes a question into sub-queries, runs each through the
    RAG system and asks the model to synthesise one final answer."""

    def __init__(self, rag: SimpleRAG):
        self.rag = rag

    def detect_query_type(self, query: str) -> str:
        """Classify ``query`` with simple keyword rules.

        Returns one of ``"cross_company"``, ``"yoy_comparison"``,
        ``"segment_analysis"``, ``"ai_strategy"`` or ``"basic"``; the first
        matching rule wins.
        """
        q = query.lower()
        if "compare" in q or "across" in q:
            return "cross_company"
        if "from 2022 to 2023" in q or "yoy" in q:
            return "yoy_comparison"
        if "percentage" in q or "came from" in q:
            return "segment_analysis"
        # Match "ai" as a whole word only: a plain substring test also fires on
        # "explain", "main", "gain", "available", ...
        if re.search(r"\bai\b", q):
            return "ai_strategy"
        return "basic"

    def decompose_query(self, query: str) -> List[str]:
        """Return the retrieval sub-queries for ``query``.

        Query types without a dedicated template (including cross-company
        questions about anything other than operating/gross margin) are
        returned unchanged as a single sub-query.
        """
        qtype = self.detect_query_type(query)
        q = query.lower()

        if qtype == "yoy_comparison":
            # Example: "NVIDIA data center revenue growth 2022-2023"
            #   -> "nvidia data center revenue 2022" / "... 2023"
            company, metric = "company", "revenue"
            company_match = re.search(r"\b(nvidia|microsoft|google)\b", q)
            if company_match:
                company = company_match.group(1)
                # Everything between the company name and the first "revenue"
                # is the metric, so multi-word metrics are kept whole.
                metric_match = re.match(r"(?:'s)?\W+(.*?\brevenue)", q[company_match.end():])
                if metric_match:
                    metric = " ".join(metric_match.group(1).split())
            return [f"{company} {metric} 2022", f"{company} {metric} 2023"]

        if qtype == "cross_company":
            for margin in ("operating margin", "gross margin"):
                if margin in q:
                    return [
                        f"Microsoft {margin} 2023",
                        f"Google {margin} 2023",
                        f"NVIDIA {margin} 2023"
                    ]
            return [query]

        if qtype == "ai_strategy":
            return [
                "Microsoft AI investments 2024",
                "Google AI investments 2024",
                "NVIDIA AI investments 2024"
            ]

        # "basic" and "segment_analysis": a single retrieval on the question.
        return [query]

    def query(self, question: str) -> Dict[str, Any]:
        """Answer ``question`` by running its sub-queries and synthesising them.

        Returns:
            A dict with ``query``, ``answer``, ``reasoning``, ``sub_queries``
            and the concatenated ``sources`` of every sub-query. If synthesis
            fails, ``answer`` is ``"Error during synthesis."`` and the error is
            logged.
        """
        sub_queries = self.decompose_query(question)
        sub_results = [self.rag.query(sq) for sq in sub_queries]

        # Prepare synthesis context
        context = "\n\n".join([r["answer"] for r in sub_results])
        synthesis_prompt = f"""
You are analyzing financial 10-K data.

Main question: {question}

Sub-answers:
{context}

Now synthesize a final structured answer.
- Be concise and accurate.
- If numbers are available, compute comparisons (YoY %, max value, etc.).
- Return reasoning too.
"""
        try:
            response = self.rag.model.generate_content(synthesis_prompt)
            final_answer = response.text.strip()
        except Exception as e:
            logger.error(f"Error synthesizing answer: {e}")
            final_answer = "Error during synthesis."

        # Collect sources from all sub-results
        sources = []
        for r in sub_results:
            sources.extend(r["sources"])

        return {
            "query": question,
            "answer": final_answer,
            "reasoning": f"Synthesized from {len(sub_queries)} sub-queries.",
            "sub_queries": sub_queries,
            "sources": sources
        }


In [None]:
def main():
    """Console entry point.

    Asks for an embedding model, builds the RAG system and agent, prints the
    store statistics, then answers questions in a loop until the user types
    'quit', 'exit' or 'q'. Any exception is reported together with setup hints.
    """
    print("Simple Financial RAG System with Transformer Embeddings + Agent")
    print("=" * 65)

    try:
        # Model menu
        for menu_line in (
            "Available embedding models:",
            "1. all-MiniLM-L6-v2 (default) - Fast, lightweight, 384 dimensions",
            "2. all-mpnet-base-v2 - Better quality, 768 dimensions",
            "3. multi-qa-MiniLM-L6-cos-v1 - Optimized for Q&A",
        ):
            print(menu_line)

        selection = input("\nChoose model (1-3) or press Enter for default: ").strip()

        # Anything other than "2" or "3" selects the default model.
        embedding_model = {
            "2": "all-mpnet-base-v2",
            "3": "multi-qa-MiniLM-L6-cos-v1",
        }.get(selection, "all-MiniLM-L6-v2")

        print(f"Using embedding model: {embedding_model}")

        rag = SimpleRAG(embedding_model=embedding_model)
        agent = FinancialAgent(rag)

        # Loads the saved vector store, or processes the PDFs if there is none.
        print("\nSetting up RAG system...")
        rag.setup()

        stats = rag.get_stats()
        print("\nSystem ready!")
        print(f"Documents: {stats['total_documents']}")
        print(f"Companies: {', '.join(stats['companies'])}")
        print(f"Years: {', '.join(stats['years'])}")
        print(f"Embedding model: {stats['embedding_model']}")
        print(f"Embedding dimension: {stats['embedding_dimension']}")
        print("=" * 65)

        # Only the exact answer "2" enables the agent; everything else is plain RAG.
        mode = input("\nSelect mode: [1] Simple RAG, [2] Agent Orchestration: ").strip()
        answer_question = agent.query if mode == "2" else rag.query

        print("\nInteractive Mode (type 'quit' to exit):")
        while True:
            question = input("\nYour question: ").strip()
            if question.lower() in ('quit', 'exit', 'q'):
                break
            if question == "":
                continue

            print("Processing query...")
            result = answer_question(question)

            # Agent results carry "reasoning" and "sub_queries"; plain RAG results do not.
            print(f"\nAnswer: {result['answer']}")
            if "reasoning" in result:
                print(f"Reasoning: {result['reasoning']}")
            if result.get("sub_queries"):
                print(f"Sub-queries: {result['sub_queries']}")

            sources = result['sources']
            if sources:
                print(f"\nSources ({len(sources)}):")
                for source in sources:
                    print(f"- {source['company']} {source['year']} (score: {source.get('score',0):.3f})")

    except Exception as e:
        print(f"Error: {e}")
        for hint in (
            "\nMake sure:",
            "1. Install required packages: pip install sentence-transformers faiss-cpu google-generativeai PyPDF2",
            "2. Your data directory is structured as:",
            "   data/",
            "     ├─ GOOGL/2022.pdf",
            "     ├─ MSFT/2023.pdf",
            "     └─ NVDA/2024.pdf",
        ):
            print(hint)


In [None]:
# Initialize RAG with the default embedding model. setup() loads the saved
# vector store from the data directory if there is one; otherwise it processes
# the PDFs, builds the store and saves it.
rag = SimpleRAG(embedding_model="all-MiniLM-L6-v2")  # or "all-mpnet-base-v2"
rag.setup()

# Initialize Agent: wraps the RAG system to decompose questions into sub-queries
agent = FinancialAgent(rag)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Demo: a year-over-year question answered through the agent. The full result
# dict is printed first, followed by each of its fields.
result = agent.query("How did NVIDIA data center revenue grow from 2022 to 2023?")
print("result", result)
print("Answer:", result["answer"])
print("Reasoning:", result["reasoning"])
print("Sub-queries:", result["sub_queries"])
print("Sources:", result["sources"])


result {'query': 'How did NVIDIA data center revenue grow from 2022 to 2023?', 'answer': "NVIDIA's Data Center revenue grew significantly from fiscal year 2022 to fiscal year 2023.\n\n**Reasoning and Calculation:**\n\n*   **Fiscal Year 2022 Data Center Revenue:** $11,046 million (or $11.046 billion)\n*   **Fiscal Year 2023 Data Center Revenue:** $15,010 million (or $15.01 billion)\n\nTo calculate the growth:\n\n1.  **Absolute Increase:** $15,010 million - $11,046 million = $3,964 million\n2.  **Percentage Increase (YoY):** ($3,964 million / $11,046 million) * 100 = **35.88%** (approximately 35.9%)\n\n**Final Answer:**\n\nNVIDIA's Data Center revenue grew from **$11,046 million** in fiscal year 2022 to **$15,010 million** in fiscal year 2023. This represents an increase of **$3,964 million**, or approximately **35.9%** year-over-year.\n\n*(Note: While the sub-answers mentioned a 41% increase, calculations based on the provided absolute revenue figures for FY2022 and FY2023 result in a g

In [1]:
!pip install -r requirements.txt

Collecting PyPDF2>=1.26.0 (from -r requirements.txt (line 11))
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting chromadb>=0.3.21 (from -r requirements.txt (line 13))
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting unstructured>=0.6.7 (from -r requirements.txt (line 14))
  Downloading unstructured-0.18.15-py3-none-any.whl.metadata (24 kB)
Collecting pdf2image>=1.16.0 (from -r requirements.txt (line 15))
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pdfplumber>=0.5.28 (from -r requirements.txt (line 17))
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF>=1.18.01 (from -r requirements.txt (line 18))
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu (from -r 

In [8]:
"""
Financial RAG System with Agent Capabilities
Focused implementation for 10-K document analysis with JSON output format
"""

import os
import json
import re
from typing import List, Dict, Any, Tuple
import numpy as np
from pathlib import Path
import logging
import google.generativeai as genai
import faiss
import pickle
import PyPDF2
from sentence_transformers import SentenceTransformer

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangChainDocument
from langchain.agents import AgentExecutor, create_react_agent
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain import hub
from langchain.tools import BaseTool

# Show INFO and above, and create the module-level logger used by the classes below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class GeminiLLM(LLM):
    """Simple Gemini LLM wrapper for LangChain"""
    # The Gemini model handle; created in __init__ once the API key is configured.
    model: Any = None

    def __init__(self, api_key: str, **kwargs: Any):
        """Configure the Gemini client with ``api_key`` and create the model."""
        super().__init__(**kwargs)
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.0-flash')

    @property
    def _llm_type(self) -> str:
        return "gemini"

    def _call(self, prompt: str, stop: List[str] = None, **kwargs: Any) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text sent to the model.
            stop: Optional stop sequences. The output is cut at the first
                occurrence of any of them; callers such as a ReAct agent rely
                on this to keep the model from writing past its own turn.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            The stripped completion, or a fixed error message if generation
            fails (the error is logged).
        """
        try:
            response = self.model.generate_content(prompt)
            text = response.text
        except Exception as e:
            logger.error(f"Error generating content with Gemini: {e}")
            return "An error occurred while generating the response."

        if stop:
            # Earliest position at which any non-empty stop sequence occurs.
            positions = [text.find(token) for token in stop if token]
            cut = min((pos for pos in positions if pos != -1), default=len(text))
            text = text[:cut]

        return text.strip()


class PDFProcessor:
    """Extract text from ``<data_dir>/<COMPANY>/<YEAR>.pdf`` files and split it
    into chunks that carry company, year, page and source metadata."""

    def __init__(self, data_dir: str = "data"):
        self.data_dir = Path(data_dir)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", " "],
        )

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Return the text of ``file_path`` with a ``[PAGE n]`` marker before each page.

        Pages without extractable text are skipped. Whitespace runs are
        collapsed to single spaces. Returns ``""`` if the file cannot be read
        or parsed (the error is logged).
        """
        text = ""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text:
                        text += f"[PAGE {page_num + 1}] {page_text}\n"
        except Exception as e:
            logger.error(f"Error processing PDF {file_path}: {e}")
            return ""

        # Clean text
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def process_all_pdfs(self) -> List[LangChainDocument]:
        """Chunk every PDF under the data directory.

        Returns:
            One document per chunk, or an empty list if the data directory is
            missing (logged) or no PDF produced text.
        """
        documents = []

        if not self.data_dir.exists():
            logger.error(f"Data directory not found: {self.data_dir}")
            return documents

        for company_dir in self.data_dir.iterdir():
            if not company_dir.is_dir():
                continue

            company = company_dir.name
            logger.info(f"Processing {company}...")

            for pdf_file in company_dir.glob("*.pdf"):
                year = pdf_file.stem

                text = self.extract_text_from_pdf(str(pdf_file))
                if not text:
                    continue

                chunks = self.text_splitter.split_text(text)

                # A page is usually longer than one chunk, so most chunks hold
                # no [PAGE n] marker of their own. Such a chunk continues the
                # page of the most recent marker seen in this PDF, rather than
                # being labelled page 1.
                current_page = 1
                for i, chunk in enumerate(chunks):
                    page_markers = re.findall(r'\[PAGE (\d+)\]', chunk)
                    if page_markers:
                        page_num = int(page_markers[0])
                        current_page = int(page_markers[-1])
                    else:
                        page_num = current_page

                    doc = LangChainDocument(
                        page_content=chunk,
                        metadata={
                            "company": company,
                            "year": year,
                            "chunk_id": f"{company}_{year}_chunk_{i}",
                            "page": page_num,
                            "source": str(pdf_file)
                        }
                    )
                    documents.append(doc)

        logger.info(f"Created {len(documents)} document chunks")
        return documents

class VectorStore:
    """Simple FAISS vector store.

    Embeddings are L2-normalised and kept in an inner-product index, so search
    scores are cosine similarities. ``documents[i]`` belongs to index row ``i``.
    """

    def __init__(self):
        self.embeddings = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.documents = []

    def add_documents(self, documents: List[LangChainDocument]):
        """Embed ``documents`` and build a fresh index from them.

        Any previously held index and documents are replaced, not extended.
        """
        if not documents:
            logger.warning("No documents to add to vector store.")
            return

        texts = [doc.page_content for doc in documents]

        # Create embeddings
        embeddings = self.embeddings.encode(texts)
        embeddings = embeddings.astype(np.float32)

        # Normalize for cosine similarity
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1  # Avoid division by zero (would put NaNs in the index)
        embeddings = embeddings / norms

        # Create FAISS index
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        self.index.add(embeddings)

        self.documents = documents
        logger.info(f"Added {len(documents)} documents to vector store")

    def search(self, query: str, k: int = 5) -> List[Tuple[LangChainDocument, float]]:
        """Return up to ``k`` ``(document, score)`` pairs most similar to ``query``.

        Returns an empty list if the store has not been built or loaded, holds
        no documents, or ``k`` is not positive.
        """
        if self.index is None:
            logger.error("Vector store not initialized. Run setup() first.")
            return []

        # Never request more neighbours than there are documents.
        k = min(k, len(self.documents))
        if k <= 0:
            return []

        query_embedding = self.embeddings.encode([query]).astype(np.float32)
        norm = np.linalg.norm(query_embedding)
        if norm > 0:
            query_embedding = query_embedding / norm

        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing results with -1. The lower bound matters:
            # a bare `idx < len(...)` accepts -1, and documents[-1] would
            # silently return the LAST document as a hit.
            if 0 <= idx < len(self.documents):
                results.append((self.documents[idx], float(score)))

        return results

    def save(self, path: str):
        """Write ``index.faiss`` and ``documents.pkl`` into directory ``path``.

        Errors are logged rather than raised.
        """
        os.makedirs(path, exist_ok=True)
        try:
            faiss.write_index(self.index, f"{path}/index.faiss")
            with open(f"{path}/documents.pkl", 'wb') as f:
                pickle.dump(self.documents, f)
        except Exception as e:
            logger.error(f"Error saving vector store: {e}")

    def load(self, path: str) -> bool:
        """Load the index and documents from directory ``path``.

        Returns:
            ``True`` on success; ``False`` if the files are missing or cannot
            be read (logged). On failure the store is left unchanged.
        """
        try:
            index = faiss.read_index(f"{path}/index.faiss")
            # SECURITY: pickle.load can execute arbitrary code. Only load
            # files this notebook wrote itself, never untrusted ones.
            with open(f"{path}/documents.pkl", 'rb') as f:
                documents = pickle.load(f)
        except FileNotFoundError:
            logger.warning("Vector store files not found. A new one will be built.")
            return False
        except Exception as e:
            logger.error(f"Error loading vector store: {e}")
            return False

        # Assign both together, and only after both reads succeeded, so the
        # index and the documents list can never come from different stores.
        self.index = index
        self.documents = documents
        return True

class QueryDecomposer:
    """Agent for decomposing complex queries into sub-queries.

    Classification and decomposition are purely keyword/regex driven; the
    ``llm`` handed to the constructor is stored but not called by the
    methods of this class.
    """

    # Companies the decomposer fans out over / recognises as a query subject.
    COMPANIES = ["Microsoft", "Google", "NVIDIA"]
    # Fallbacks used when the query itself names no year(s).
    DEFAULT_YEAR = "2023"
    DEFAULT_YEAR_RANGE = ("2022", "2023")

    _CROSS_COMPANY_CUES = (
        "compare", "across", "all three", "three companies",
        "which company", "which companies",
    )
    _YOY_CUES = ("grow from", "change from")
    _SEGMENT_CUES = ("percentage", "came from", "segment")
    # "from 2022 to 2023" style ranges, for any pair of four-digit years.
    _YEAR_RANGE_RE = re.compile(r"from\s+((?:19|20)\d{2})\s+to\s+((?:19|20)\d{2})")
    _YEAR_RE = re.compile(r"\b((?:19|20)\d{2})\b")

    def __init__(self, llm: "GeminiLLM"):
        self.llm = llm

    def detect_query_type(self, query: str) -> str:
        """Classify ``query`` to pick a decomposition strategy.

        Returns:
            One of ``"cross_company"``, ``"yoy_comparison"``,
            ``"segment_analysis"`` or ``"simple"``. Checks run in that order,
            so a query matching several categories gets the first one.
        """
        query_lower = query.lower()

        if any(cue in query_lower for cue in self._CROSS_COMPANY_CUES):
            return "cross_company"
        if self._YEAR_RANGE_RE.search(query_lower) or any(
            cue in query_lower for cue in self._YOY_CUES
        ):
            return "yoy_comparison"
        if any(cue in query_lower for cue in self._SEGMENT_CUES):
            return "segment_analysis"
        return "simple"

    def decompose_query(self, query: str) -> List[str]:
        """Decompose ``query`` into sub-queries based on its detected type.

        Returns:
            A list of search strings. Queries that cannot be decomposed are
            returned unchanged as a single-element list.
        """
        query_type = self.detect_query_type(query)

        if query_type == "cross_company":
            return self._cross_company_queries(query)
        if query_type == "yoy_comparison":
            return self._yoy_queries(query)
        # "simple" and "segment_analysis" are answered with the query as-is.
        return [query]

    def _cross_company_queries(self, query: str) -> List[str]:
        """Build one sub-query per company (and per metric) for a comparison."""
        query_lower = query.lower()
        year_match = self._YEAR_RE.search(query_lower)
        year = year_match.group(1) if year_match else self.DEFAULT_YEAR

        if "operating margin" in query_lower:
            metrics = ["operating margin"]
        elif "gross margin" in query_lower:
            metrics = ["gross margin"]
        elif "r&d" in query_lower:
            # R&D must be tested before plain "revenue": a question such as
            # "R&D as a percentage of revenue" needs both figures, and would
            # otherwise lose the R&D part entirely.
            metrics = ["R&D spending"]
            if "revenue" in query_lower:
                metrics.append("total revenue")
        elif "revenue" in query_lower:
            metrics = ["total revenue"]
        else:
            # No known metric: reuse whatever follows the word "company".
            return [
                f"{company} " + query.split("company")[-1].strip()
                for company in self.COMPANIES
            ]

        return [
            f"{company} {metric} {year}"
            for metric in metrics
            for company in self.COMPANIES
        ]

    def _yoy_queries(self, query: str) -> List[str]:
        """Build a (start year, end year) pair of sub-queries for one company."""
        query_lower = query.lower()

        company = next(
            (c for c in self.COMPANIES if c.lower() in query_lower), None
        )
        if company is None:
            # Do not guess a company: silently attributing the question to
            # one of them would produce a confident but wrong answer.
            return [query]

        range_match = self._YEAR_RANGE_RE.search(query_lower)
        start, end = range_match.groups() if range_match else self.DEFAULT_YEAR_RANGE

        if "data center" in query_lower:
            metric = "data center revenue"
        elif "cloud" in query_lower:
            metric = "cloud revenue"
        elif "revenue" in query_lower:
            metric = "total revenue"
        else:
            return [query]

        return [f"{company} {metric} {start}", f"{company} {metric} {end}"]

class FinancialRAGTool(BaseTool):
    """Tool that lets a LangChain agent search the financial vector store."""

    # Fields are declared as annotated class attributes so that BaseTool's
    # Pydantic model setup picks them up.
    name: str = "financial_search"
    description: str = "Search financial 10-K documents for specific company and year data"
    vector_store: Any

    def _run(self, query: str) -> str:
        """Look up ``query`` and format the top three hits as plain text.

        Each hit is rendered as a company/year header followed by the first
        300 characters of the chunk; hits are separated by a blank line.
        """
        store = self.vector_store
        if not store:
            return "Vector store not initialized."

        hits = store.search(query, k=3)
        if not hits:
            return "No relevant information found"

        # The similarity score is not part of the rendered text.
        snippets = [
            f"Company: {doc.metadata['company']}, Year: {doc.metadata['year']}\n{doc.page_content[:300]}..."
            for doc, _ in hits
        ]
        return "\n\n".join(snippets)

class FinancialRAGSystem:
    """Main RAG system with agent capabilities.

    Wires together PDF processing, the vector store, the Gemini LLM and the
    query decomposer. ``setup()`` must be called before ``query()`` so the
    vector store is populated.
    """

    # Environment variables consulted, in order, when no key is passed in.
    API_KEY_ENV_VARS = ("GEMINI_API_KEY", "GOOGLE_API_KEY")
    # Placeholder answer used whenever retrieval finds nothing.
    NO_INFO_ANSWER = "No relevant information found"

    def __init__(self, data_dir: str = "data", api_key: str = None):
        """Create the system's components.

        Args:
            data_dir: Directory handed to ``PDFProcessor``.
            api_key: Gemini API key. When omitted, the first non-empty value
                among ``API_KEY_ENV_VARS`` in the environment is used.

        Raises:
            ValueError: If no API key is supplied and none is found in the
                environment.
        """
        self.data_dir = data_dir
        self.pdf_processor = PDFProcessor(data_dir)
        self.vector_store = VectorStore()

        # Credentials must never be embedded in the notebook source: they end
        # up in version control and in shared copies. Any key that was
        # committed previously should be treated as compromised and rotated.
        if api_key is None:
            api_key = next(
                (os.environ[name] for name in self.API_KEY_ENV_VARS if os.environ.get(name)),
                None,
            )
        if not api_key:
            raise ValueError(
                "Gemini API key not provided. Pass api_key=... or set the "
                "GEMINI_API_KEY environment variable."
            )
        self.llm = GeminiLLM(api_key=api_key)

        self.decomposer = QueryDecomposer(self.llm)
        self.agent = None

    def setup(self, force_rebuild: bool = False):
        """Load or build the vector store, then try to create the agent.

        Args:
            force_rebuild: When True, skip loading the saved store and
                rebuild it from the PDFs.

        Agent creation is best-effort: on failure a warning is logged and
        ``self.agent`` stays ``None``. ``query()`` does not use the agent, so
        the system remains usable either way.
        """
        vector_store_path = "vector_store"

        if not force_rebuild and self.vector_store.load(vector_store_path):
            logger.info("Loaded existing vector store")
        else:
            logger.info("Building new vector store...")
            documents = self.pdf_processor.process_all_pdfs()
            self.vector_store.add_documents(documents)
            self.vector_store.save(vector_store_path)

        # Setup agent with tools
        # Instantiate the tool using Pydantic's keyword argument initialization
        financial_tool = FinancialRAGTool(vector_store=self.vector_store)
        tools = [financial_tool]

        try:
            prompt = hub.pull("hwchase17/react")
            agent = create_react_agent(self.llm, tools, prompt)
            self.agent = AgentExecutor(
                agent=agent,
                tools=tools,
                verbose=True,
                handle_parsing_errors=True,
                max_iterations=3
            )
        except Exception as e:
            logger.warning(f"Agent setup failed: {e}. Using fallback method.")

    def query(self, question: str) -> Dict[str, Any]:
        """Answer ``question`` and return a JSON-serialisable result.

        The question is decomposed into sub-queries; each sub-query is
        answered by the LLM from its single best-matching chunk, and multiple
        answers are synthesised into one final answer.

        Returns:
            A dict with the keys ``query``, ``answer``, ``reasoning``,
            ``sub_queries`` and ``sources``.
        """
        # Decompose query
        sub_queries = self.decomposer.decompose_query(question)

        # Execute sub-queries. `sub_results` gets exactly one entry per
        # sub-query so that it stays aligned with `sub_queries` below.
        sub_results = []
        sources = []

        for sub_query in sub_queries:
            # Search documents
            results = self.vector_store.search(sub_query, k=3)

            if results:
                # Get best result
                doc, score = results[0]

                # Generate answer for this sub-query
                context = f"Company: {doc.metadata['company']}, Year: {doc.metadata['year']}\n{doc.page_content}"

                prompt = f"""
                Based on this financial document excerpt, answer the specific question.

                Question: {sub_query}

                Document excerpt:
                {context}

                Provide a specific, numerical answer when possible. If the information isn't available, say so.
                """

                answer = self.llm._call(prompt)
                sub_results.append(answer)

                # Add to sources
                sources.append({
                    "company": doc.metadata["company"],
                    "year": doc.metadata["year"],
                    "excerpt": doc.page_content[:150] + "...",
                    "page": doc.metadata.get("page", 1),
                    "score": score
                })
            else:
                # Record the miss explicitly. Skipping it would shift every
                # later answer onto the wrong question in the zip() below.
                sub_results.append(self.NO_INFO_ANSWER)

        # Synthesize final answer
        if len(sub_queries) > 1 and sources:
            synthesis_prompt = f"""
            Original question: {question}

            Sub-questions and answers:
            {chr(10).join([f"Q: {sq} A: {sr}" for sq, sr in zip(sub_queries, sub_results)])}

            Provide a comprehensive answer to the original question by synthesizing these results.
            Include specific numbers and comparisons where possible.
            """

            final_answer = self.llm._call(synthesis_prompt)
            reasoning = f"Decomposed query into {len(sub_queries)} sub-questions and synthesized results"
        elif len(sub_queries) > 1:
            # Nothing was retrieved for any sub-query: asking the LLM to
            # synthesise from no evidence would only invite a made-up answer.
            final_answer = self.NO_INFO_ANSWER
            reasoning = f"Decomposed query into {len(sub_queries)} sub-questions but found no relevant documents"
        else:
            final_answer = sub_results[0] if sub_results else self.NO_INFO_ANSWER
            reasoning = "Direct query execution"

        # Return JSON format as specified
        return {
            "query": question,
            "answer": final_answer,
            "reasoning": reasoning,
            "sub_queries": sub_queries,
            "sources": sources
        }

    def get_stats(self) -> Dict[str, Any]:
        """Summarise the loaded corpus.

        Returns:
            A dict with ``total_documents`` (int) plus sorted ``companies``
            and ``years`` lists taken from the documents' metadata.
        """
        documents = self.vector_store.documents
        if not documents:
            return {
                "total_documents": 0,
                "companies": [],
                "years": []
            }

        companies = {doc.metadata["company"] for doc in documents}
        years = {doc.metadata["year"] for doc in documents}

        return {
            "total_documents": len(documents),
            "companies": sorted(companies),
            "years": sorted(years)
        }

def run_test_queries(rag=None):
    """Run the specified test queries and print each result as JSON.

    Args:
        rag: An already set-up system exposing ``query(question)``. When
            omitted, a new ``FinancialRAGSystem`` is created and set up, so
            calling the function without arguments behaves as before.
    """
    test_queries = [
        # Simple queries
        "What was NVIDIA's total revenue in fiscal year 2024?",
        "What percentage of Google's 2023 revenue came from advertising?",

        # Comparative queries (require agent decomposition)
        "How much did Microsoft's cloud revenue grow from 2022 to 2023?",
        "Which of the three companies had the highest gross margin in 2023?",

        # Complex multi-step queries
        "Compare the R&D spending as a percentage of revenue across all three companies in 2023",
    ]

    # Reuse the caller's system when given: building a second one repeats the
    # whole setup() (vector store load/build and agent creation) for nothing.
    if rag is None:
        rag = FinancialRAGSystem()
        rag.setup()

    print("Financial RAG System Test Results")
    print("=" * 50)

    for i, query in enumerate(test_queries, 1):
        print(f"\n{i}. {query}")
        print("-" * 30)

        result = rag.query(query)

        # Pretty print JSON
        print(json.dumps(result, indent=2))

def main():
    """Main interactive function.

    Sets the system up, prints corpus statistics, then either runs the
    canned test queries or enters a question/answer loop until the user
    types ``quit``/``exit``/``q`` or closes the input stream.
    """
    print("Financial RAG System with Agent Capabilities")
    print("=" * 50)

    rag = FinancialRAGSystem()

    print("Setting up system...")
    rag.setup()

    stats = rag.get_stats()
    print(f"System ready! Loaded {stats['total_documents']} documents")
    # str() each entry so that non-string metadata cannot break the join.
    print(f"Companies: {', '.join(map(str, stats['companies']))}")
    print(f"Years: {', '.join(map(str, stats['years']))}")
    print("\n" + "=" * 50)

    # Option to run test queries
    try:
        choice = input("Run test queries? (y/n): ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        # No interactive input available (or the user aborted): stop quietly.
        print()
        return
    if choice == 'y':
        run_test_queries(rag)
        return

    # Interactive mode
    print("Interactive Mode (type 'quit' to exit):")
    while True:
        try:
            question = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if question.lower() in ['quit', 'exit', 'q']:
            break
        if not question:
            continue

        print("Processing...")
        result = rag.query(question)

        # Pretty print result
        print("\nResult:")
        print(json.dumps(result, indent=2))

# Entry point: start the interactive session when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

Financial RAG System with Agent Capabilities
Setting up system...
System ready! Loaded 3580 documents
Companies: Google, Microsoft, Nvidia
Years: 2022, 2023, 2024

Run test queries? (y/n): y
Financial RAG System Test Results

1. What was NVIDIA's total revenue in fiscal year 2024?
------------------------------
{
  "query": "What was NVIDIA's total revenue in fiscal year 2024?",
  "answer": "The document provides NVIDIA's revenue for the years ended January 30, 2022, January 31, 2021, and January 26, 2020. It does not provide the revenue for fiscal year 2024.",
  "reasoning": "Direct query execution",
  "sub_queries": [
    "What was NVIDIA's total revenue in fiscal year 2024?"
  ],
  "sources": [
    {
      "company": "Nvidia",
      "year": "2022",
      "excerpt": ". 46 [PAGE 82] 9/20/25, 11:07 AM nvda-20220130 https://www.sec.gov/Archives/edgar/data/1045810/000104581022000036/nvda-20220130.htm 82/128Table of Con...",
      "page": 82,
      "score": 0.7411227822303772
    }
  ]
}
