In [10]:
"""
SEC 10-Q Filing RAG System
A complete pipeline for loading, processing, embedding, and querying SEC 10-Q filings
using Qdrant vector database and OpenAI GPT-4o.
"""

import os
import re
import uuid
import time
import requests
import gc
import pandas as pd
from transformers import pipeline as transformers_pipeline
from langchain_core.output_parsers import StrOutputParser
from typing import List, Dict, Tuple
from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString

# External libraries (install via pip)
from sentence_transformers import SentenceTransformer, CrossEncoder
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams, PointStruct
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer


from dotenv import load_dotenv
load_dotenv()

'''
from google.colab import userdata

# Load secrets from Colab and set them as environment variables
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['QDRANT_URL'] = userdata.get('QDRANT_URL')
os.environ['QDRANT_API_KEY'] = userdata.get('QDRANT_API_KEY')
'''

"\nfrom google.colab import userdata\n\n# Load secrets from Colab and set them as environment variables\nos.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\nos.environ['QDRANT_URL'] = userdata.get('QDRANT_URL')\nos.environ['QDRANT_API_KEY'] = userdata.get('QDRANT_API_KEY')\n"

In [11]:
# ===========================================================================
# PART 1: CONFIGURATION & SETUP
# ===========================================================================

class Config:
    """Configuration class for API keys and model settings"""

    # API Keys - SET THESE BEFORE RUNNING
    QDRANT_URL = os.getenv("QDRANT_URL")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

    # SEC EDGAR Configuration
    SEC_HEADERS = {'User-Agent': 'SEC10Q-RAG-System research@example.com'}
    CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

    # Model Configuration
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384 dimensions
    CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" # For re-ranking
    LLM_MODEL = "gpt-4o"

    # Collection Configuration
    COLLECTION_NAME = "sec_filings_10q_GOLDEN_BENCHMARK"
    VECTOR_SIZE = 384  # Dimension for all-MiniLM-L6-v2

    # Chunking Configuration
    CHUNK_SIZE = 800  # Characters per chunk
    CHUNK_OVERLAP = 200  # Overlap between chunks

    # Retrieve more candidates (K=20) for the re-ranker
    RETRIEVAL_TOP_K = 20
    # The final number of chunks (K=5) to send to the LLM
    FINAL_TOP_K = 5

    # Company Tickers
    TICKERS = ['NVDA', 'AAPL', 'MSFT', 'AMZN', 'META', 'GOOGL', 'TSLA', 'ORCL', 'JPM', 'AMD']

In [12]:
# ============================================================================
# PART 2: DOCUMENT LOADING
# ============================================================================

class SECDocumentLoader:
    """Handles fetching and parsing of SEC 10-Q filings"""

    @staticmethod
    def get_recent_10q_metadata(ticker: str, num_filings: int = 4) -> List[Dict[str, str]]:
        """
        Fetches the metadata (links, dates, etc.) for the latest N 10-Q filings.
        Does NOT fetch the actual HTML content.

        Args:
            ticker: The company ticker (e.g., 'AAPL')
            num_filings: The number of recent 10-Q filings to fetch

        Returns:
            List of metadata dictionaries
        """
        print(f"  → Fetching CIK for ticker: {ticker}...")

        # Get CIK mapping
        response = requests.get(Config.CIK_MAP_URL, headers=Config.SEC_HEADERS)
        response.raise_for_status()
        company_data = response.json()

        # Find CIK
        cik = None
        company_name = None
        for company in company_data.values():
            if company['ticker'] == ticker.upper():
                cik = str(company['cik_str']).zfill(10)
                company_name = company['title']
                break

        if not cik:
            raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")

        print(f"  → Found CIK: {cik} ({company_name})")

        # Fetch submission history
        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        time.sleep(0.1)  # Rate limiting
        response = requests.get(submissions_url, headers=Config.SEC_HEADERS)
        response.raise_for_status()
        submissions = response.json()

        # Find latest N 10-Q filings metadata
        filings_metadata = []
        for i, form in enumerate(submissions['filings']['recent']['form']):
            if form == '10-Q':
                accession_number = submissions['filings']['recent']['accessionNumber'][i]
                primary_document = submissions['filings']['recent']['primaryDocument'][i]
                filing_date = submissions['filings']['recent']['filingDate'][i]

                accession_number_clean = accession_number.replace('-', '')

                # Construct the filing URL
                filing_url = (
                    f"https://www.sec.gov/Archives/edgar/data/{cik}/"
                    f"{accession_number_clean}/{primary_document}"
                )

                metadata = {
                    'ticker': ticker.upper(),
                    'company_name': company_name,
                    'filing_date': filing_date,
                    'cik': cik,
                    'filing_url': filing_url
                }
                filings_metadata.append(metadata)

                if len(filings_metadata) >= num_filings:
                    break

        if not filings_metadata:
            raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")

        print(f"  → Found {len(filings_metadata)} recent 10-Q filing metadata entries.")
        return filings_metadata

    @staticmethod
    def get_filing_html(filing_url: str) -> str:
        """Fetches the HTML content for a single filing URL."""
        time.sleep(0.1)  # Rate limiting
        response = requests.get(filing_url, headers=Config.SEC_HEADERS)
        response.raise_for_status()
        return response.text

    @staticmethod
    def _normalize_header_text(text: str) -> str:
        """Normalizes header text to standard format"""
        text = text.strip().upper()

        # Match "PART I" or "PART II"
        part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
        if part_match:
            return re.sub(r'\s+', ' ', part_match.group(1))

        # Match "ITEM 1", "ITEM 1A", etc.
        item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
        if item_match:
            return re.sub(r'\s+', ' ', item_match.group(1))

        return None

    @staticmethod
    def _parse_html_table(table_tag) -> str:
        """
        Converts HTML table to Markdown format with robust column handling
        """
        try:
            # Skip nested tables
            if table_tag.find_parent('table'):
                return ""
            
            markdown_rows = []
            max_cols = 0
            
            # First pass: collect all rows and find max column count
            for tr in table_tag.find_all('tr', recursive=False):
                cells = []
                for cell in tr.find_all(['td', 'th'], recursive=False):
                    # Get text and clean it
                    text = " ".join(cell.get_text(strip=True).split())
                    # Handle colspan
                    colspan = int(cell.get('colspan', 1))
                    cells.extend([text] + [''] * (colspan - 1))
                
                if cells and any(c for c in cells):  # Skip empty rows
                    markdown_rows.append(cells)
                    max_cols = max(max_cols, len(cells))
            
            if not markdown_rows or max_cols == 0:
                return ""
            
            # Normalize all rows to have same column count
            for row in markdown_rows:
                while len(row) < max_cols:
                    row.append("")
                if len(row) > max_cols:
                    row[:] = row[:max_cols]
            
            # Build markdown output
            md_output = []
            
            # Use first row as header (or create generic header)
            header = markdown_rows[0] if markdown_rows else [f"Col{i}" for i in range(max_cols)]
            header = [h if h else f"Column_{i}" for i, h in enumerate(header)]
            
            md_output.append("| " + " | ".join(header) + " |")
            md_output.append("| " + " | ".join(['---'] * len(header)) + " |")
            
            # Add data rows
            for row in markdown_rows[1:]:
                # Ensure row length matches header
                while len(row) < len(header):
                    row.append("")
                row = row[:len(header)]
                md_output.append("| " + " | ".join(row) + " |")
            
            result = "\n" + "\n".join(md_output) + "\n"
            
            return result
            
        except Exception as e:
            print(f"  [HTML Table Parser Error] Failed to parse HTML table: {e}")
            return ""

    @classmethod
    def parse_10q(cls, html_content: str) -> Dict:
        """Parses HTML content into structured dictionary"""
        # Use 'lxml' for better memory efficiency
        soup = BeautifulSoup(html_content, 'lxml')

        potential_headers = soup.find_all(['p', 'b', 'strong', 'div'])

        doc_headers = []
        for header in potential_headers:
            text = header.get_text(strip=True)
            if len(text) > 100:
                continue

            normalized_key = cls._normalize_header_text(text)
            if normalized_key:
                if not header.find_parent('a'):
                    doc_headers.append({'tag': header, 'key': normalized_key})

        if not doc_headers:
            return {}

        parsed_data = defaultdict(lambda: defaultdict(str))
        current_part_key = None

        for i, header_info in enumerate(doc_headers):
            current_key = header_info['key']

            if 'PART' in current_key:
                current_part_key = current_key
                continue

            if 'ITEM' in current_key:
                if not current_part_key:
                    current_part_key = "PART I"

                start_node = header_info['tag']
                end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None

                content_parts = []
                element = start_node.next_element

                while element and element != end_node:
                    if isinstance(element, NavigableString):
                        if not element.find_parent('table'):
                            text = element.strip()
                            if text:
                                content_parts.append(text)
                    elif element.name == 'table':
                        if not element.find_parent('table'):
                            table_markdown = cls._parse_html_table(element)
                            if table_markdown:
                                content_parts.append(table_markdown)

                    element = element.next_element

                full_content = "\n".join(content_parts)
                clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()

                parsed_data[current_part_key][current_key] = clean_content

        return {part: dict(items) for part, items in parsed_data.items()}

In [13]:
# ===========================================================================
# PART 3: TEXT CHUNKING & EMBEDDING
# ===========================================================================
class DocumentProcessor:
    """
    Processes documents into chunks using LangChain's splitter
    and then embeds them in batches.
    """

    def __init__(self, embedding_model_name: str = Config.EMBEDDING_MODEL):
        """Initialize with embedding model and LangChain text splitter"""
        print(f"\n Loading embedding model: {embedding_model_name}")
        self.model = SentenceTransformer(embedding_model_name)
        print(f"   ✓ Model loaded (dimension: {self.model.get_sentence_embedding_dimension()})\n")

        # Initialize LangChain's splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            add_start_index=False,
        )
        print(f"   ✓ Initialized RecursiveCharacterTextSplitter (chunk: {Config.CHUNK_SIZE}, overlap: {Config.CHUNK_OVERLAP})")

    def generate_document_chunks(self, parsed_data: Dict, metadata: Dict,
                                 embed_batch_size: int = 1024):
        """
        Processes parsed 10-Q data using LangChain's splitter,
        then YIELDS chunk points one by one after batch-embedding.
        Tags table chunks with metadata
        """

        # Convert the parsed dict into a list of LangChain Document objects
        all_docs = []
        for part, items in parsed_data.items():
            for item, content in items.items():
                if not content:
                    continue

                # Check if the content is a markdown table
                # A simple heuristic: starts with | and has --- in the 2nd line
                is_table = False
                if content.strip().startswith("|"):
                    lines = content.strip().split('\n')
                    if len(lines) > 1 and "---" in lines[1]:
                        is_table = True
                
                # Create a metadata dict for *this specific document*
                # before it gets chunked.
                doc_metadata = {
                    'ticker': metadata['ticker'],
                    'company_name': metadata['company_name'],
                    'filing_date': metadata['filing_date'],
                    'filing_url': metadata['filing_url'],
                    'part': part,
                    'item': item,
                    'is_table': is_table
                }

                doc = Document(
                    page_content=content,
                    metadata=doc_metadata
                )
                all_docs.append(doc)

        if not all_docs:
            return

        # Split all documents at once using the LangChain splitter
        print(f"     → Splitting {len(all_docs)} high-level 'Items' into smaller chunks...")
        chunked_docs = self.text_splitter.split_documents(all_docs)
        print(f"     → Generated {len(chunked_docs)} chunks")

        text_batch = []
        metadata_batch = []

        # Consume the list one chunk at a time
        for chunk in chunked_docs:
            text_batch.append(chunk.page_content)
            # The splitter automatically copies metadata to each chunk
            metadata_batch.append(chunk.metadata) 

            # If batch is full, process it
            if len(text_batch) >= embed_batch_size:
                # 1. Embed the entire batch in one call
                embeddings = self.model.encode(text_batch, show_progress_bar=False)

                # 2. Yield each point from the processed batch
                for txt, emb, meta in zip(text_batch, embeddings, metadata_batch):
                    # The metadata (meta) already contains everything
                    # from the doc_metadata we built above
                    payload = {
                        'text': txt,
                        **meta # Unpack all metadata keys (ticker, item, part, is_table, etc.)
                    }
                    yield PointStruct(
                        id=str(uuid.uuid4()),
                        vector=emb.tolist(),
                        payload=payload
                    )

                # 3. Reset the batch
                text_batch = []
                metadata_batch = []

        if text_batch:
            # 1. Embed the final batch
            embeddings = self.model.encode(text_batch, show_progress_bar=False)

            # 2. Yield each point
            for txt, emb, meta in zip(text_batch, embeddings, metadata_batch):
                payload = {
                    'text': txt,
                    **meta
                }
                yield PointStruct(
                    id=str(uuid.uuid4()),
                    vector=emb.tolist(),
                    payload=payload
                )

In [14]:
# ===========================================================================
# PART 4: QDRANT VECTOR DATABASE
# ===========================================================================
class QdrantManager:
    """Manages Qdrant vector database operations"""

    def __init__(self):
        """Initialize Qdrant client"""
        print(f"\nConnecting to Qdrant Cloud...")
        self.client = QdrantClient(
            url=Config.QDRANT_URL,
            api_key=Config.QDRANT_API_KEY
        )
        print(f"   ✓ Connected to Qdrant")

    def create_collection(self, collection_name: str = Config.COLLECTION_NAME,
                         vector_size: int = Config.VECTOR_SIZE):
        """
        Create or recreate collection AND set up payload indexes
        --- MODIFIED: 'is_table' index removed ---
        """
        print(f"\n Setting up collection: {collection_name}")

        collections = self.client.get_collections().collections
        exists = any(col.name == collection_name for col in collections)

        if exists:
            print(f"   ⚠ Collection exists, recreating...")
            self.client.delete_collection(collection_name)

        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"   ✓ Collection created")

        print(f"   → Creating payload index for 'ticker'...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="ticker",
            field_schema=models.PayloadSchemaType.KEYWORD
        )
        print(f"   → Creating payload index for 'item'...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="item",
            field_schema=models.PayloadSchemaType.KEYWORD
        )
        
        print(f"   ✓ Payload indexes created.")

    def upsert_documents(self, points_generator,
                        collection_name: str = Config.COLLECTION_NAME,
                        batch_size: int = 2048) -> int:
        """Uploads document chunks from a generator in batches."""
        print(f" Uploading chunks to Qdrant in batches of {batch_size}...")
        batch = []
        count = 0
        for point in points_generator:
            batch.append(point)
            if len(batch) >= batch_size:
                self.client.upsert(
                    collection_name=collection_name,
                    points=batch,
                    wait=False
                )
                count += len(batch)
                print(f"     → Uploaded {count} chunks so far...")
                batch = []
        if batch:
            self.client.upsert(
                collection_name=collection_name,
                points=batch,
                wait=False
            )
            count += len(batch)
        print(f"  ✓ All chunks uploaded for this document. Total: {count}")
        return count

    def search(self, query_vector: List[float],
              collection_name: str = Config.COLLECTION_NAME,
              limit: int = Config.RETRIEVAL_TOP_K,
              filter_dict: Dict = None) -> List[Dict]:
        """
        Search for similar documents
        --- MODIFIED: Removed boolean filter logic ---
        """
        qdrant_filter = None
        if filter_dict:
            must_conditions = []
            for key, value in filter_dict.items():
                # Standard keyword match for strings (like ticker)
                must_conditions.append(
                    models.FieldCondition(
                        key=key,
                        match=models.MatchValue(value=value)
                    )
                )
            
            qdrant_filter = models.Filter(must=must_conditions)

        results = self.client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=qdrant_filter,
            with_payload=True
        )

        return [
            {'score': result.score, 'payload': result.payload}
            for result in results
        ]

In [15]:
# ===========================================================================
# PART 5: Advancd RAG Engine
# ===========================================================================

class ManualRAGEngine:
    """
    This is the manual RAG query engine, ENHANCED with
    a Cross-Encoder for re-ranking.
    --- MODIFIED: TaPas and table routing removed ---
    """

    def __init__(self, document_processor: 'DocumentProcessor', qdrant_manager: 'QdrantManager'):
        print("\n Initializing Manual RAG Query Engine (v2 with Re-ranking)...\n")

        # 1. Get the embedding model
        self.embedding_model = document_processor.model
        print("   ✓ Using existing embedding model from DocumentProcessor")

        # 2. Load the Cross-Encoder Model
        print(f"   → Loading Cross-Encoder: {Config.CROSS_ENCODER_MODEL}...")
        self.cross_encoder = CrossEncoder(Config.CROSS_ENCODER_MODEL)
        print("   ✓ Cross-Encoder model loaded.")

        # 3. Get the Qdrant client
        self.qdrant_manager = qdrant_manager
        print("   ✓ Using existing QdrantManager for search")

        # 4. Initialize the LLM
        self.llm = ChatOpenAI(
            model=Config.LLM_MODEL,
            api_key=Config.OPENAI_API_KEY,
            temperature=0
        )
        print("   ✓ Initialized ChatOpenAI LLM")

        # 5. Create a prompt template
        template = """You are a helpful financial analyst assistant. Your role is to answer questions about SEC 10-Q filings based ONLY on the provided context.
- Base your answer strictly on the provided context from SEC filings
- Cite specific sections (e.g., "According to Item 1A...") when referencing information
- If the answer is not in the context, clearly state that

Context:
<context>
{context}
</context>

Question: {input}

Answer:"""
        self.prompt = ChatPromptTemplate.from_template(template)
        
        print("   ✓ Manual RAG Engine (v2) ready.")


    def _format_context(self, search_results: List[Dict]) -> str:
        """
        Helper function to format retrieved text contexts.
        """
        context_str = ""
        for i, result in enumerate(search_results, 1):
            payload = result.get('payload', {})
            text = payload.get('text', 'No text found')
            item = payload.get('item', 'N/A')
            ticker = payload.get('ticker', 'N/A')
            context_str += f"Source {i} ({ticker} - {item}):\n\"{text}\"\n\n"
        return context_str.strip()


    def query(self, question: str, ticker_filter: str = None):
        print(f"\n Processing query with Manual Engine (v2): '{question}'")

        # 1. Manually embed the query
        print("   → Manually embedding query...")
        query_vector = self.embedding_model.encode(question).tolist()


        # 2. Filter Logic
        final_filter_dict = {}
        if ticker_filter:
            print(f"   → Applying ticker filter: {ticker_filter}")
            final_filter_dict["ticker"] = ticker_filter.upper()
        
        if not final_filter_dict:
            print("   → No filters applied. Searching all documents.")
        
        # 3. Manually search Qdrant (Candidate Retrieval)
        print(f"   → Searching Qdrant (retrieving Top {Config.RETRIEVAL_TOP_K} candidates)...\n")
        search_results = self.qdrant_manager.search(
            query_vector=query_vector,
            limit=Config.RETRIEVAL_TOP_K, # Retrieve 20
            filter_dict=final_filter_dict # Only contains ticker, if present
        )

        if not search_results:
            return {'answer': 'No relevant context was found in the documents to answer this question.', 'sources': []}
        
        print(f"   → Retrieved {len(search_results)} candidates.")

        # 4. Re-ranking
        # All retrieved chunks are now treated as text.
        print(f"   → Re-ranking all {len(search_results)} candidates...")
        
        passages = [result['payload']['text'] for result in search_results]
        query_passage_pairs = [(question, passage) for passage in passages]
        
        print(f"   → Re-ranking {len(passages)} text chunks with Cross-Encoder...")
        cross_encoder_scores = self.cross_encoder.predict(query_passage_pairs)
        
        final_ranked_results = []
        for score, result in zip(cross_encoder_scores, search_results):
            result['rerank_score'] = float(score)
            final_ranked_results.append(result)

        final_ranked_results.sort(key=lambda x: x['rerank_score'], reverse=True)
        
        # Get the final Top-K
        final_top_k_chunks = final_ranked_results[:Config.FINAL_TOP_K]

        # Store payloads for citation
        final_sources_with_scores = []
        for result in final_top_k_chunks:
            source_data = result['payload']
            source_data['retrieval_score'] = result['score']
            source_data['rerank_score'] = result['rerank_score']
            final_sources_with_scores.append(source_data)
            
        print(f"   → Re-ranked. Final {len(final_sources_with_scores)} sources selected.")
        
        # 5. Generation Step
        print("   → Formatting context for LLM...")
        
        # All chunks are text chunks
        formatted_context = self._format_context(final_top_k_chunks)

        if not formatted_context:
             return {'answer': 'Retrieved context, but failed to process it.', 'sources': []}

        final_prompt_message = self.prompt.format_messages(
            context=formatted_context,
            input=question
        )

        # 6. Manually invoke the LLM
        print("   → Sending prompt to LLM...")
        llm_response = self.llm.invoke(final_prompt_message)
        answer = llm_response.content

        # 7. Format sources (using all 5 chunks for citation)
        sources_output = []
        for i, source_data in enumerate(final_sources_with_scores, 1):
            sources_output.append({
                'ticker': source_data.get('ticker'),
                'company': source_data.get('company_name'),
                'item': source_data.get('item'),
                'part': source_data.get('part'),
                # 'is_table': source_data.get('is_table'), <-- REMOVED
                'filing_date': source_data.get('filing_date'),
                'rerank_score': source_data['rerank_score'],
                'retrieval_score': source_data['retrieval_score']
            })

        return {
            'answer': answer,
            'sources': sources_output
        }

In [16]:
# ===========================================================================
# PART 6: MAIN PIPELINE ORCHESTRATOR
# ===========================================================================

class SECFilingRAGPipeline:
    """Main pipeline orchestrator"""

    def __init__(self):
        """Initialize all components"""
        print("=" * 70)
        print("SEC 10-Q FILING RAG SYSTEM (v2 with Re-ranking)")
        print("=" * 70)

        # Ingestion components
        self.loader = SECDocumentLoader()
        self.processor = DocumentProcessor()
        self.qdrant_manager = QdrantManager()

        # Query component
        self.query_engine = None


    def load_and_index_filings(self, tickers: List[str] = Config.TICKERS, num_filings_per_ticker: int = 1):
        """
        MODIFIED: This function now only checks if the Golden Set exists.
        It does NOT perform indexing.
        """
        print(f"\\n{'=' * 70}")
        print(f"CHECKING DATABASE: {Config.COLLECTION_NAME}")
        print(f"{'=' * 70}")
        
        try:
            count = self.qdrant_manager.client.count(Config.COLLECTION_NAME, exact=True)
            if count.count > 0:
                print(f"✓ Collection '{Config.COLLECTION_NAME}' found with {count.count} docs. Proceeding to query.")
            else:
                print(f"✗ ERROR: Collection '{Config.COLLECTION_NAME}' is empty or not found.")
                print("Please run '0_build_database.ipynb' first.")
                raise Exception("Database not found.")
        except Exception as e:
            print(f"✗ ERROR: Could not connect to or find collection '{Config.COLLECTION_NAME}'.")
            print("Please run '0_build_database.ipynb' first.")
            raise e
        
        print(f"{'=' * 70}\\n")


    def query(self, question: str, ticker_filter: str = None):
        """
        Query the indexed filings using the new MANUAL engine
        """
        if self.query_engine is None:
            self.query_engine = ManualRAGEngine(
                document_processor=self.processor,
                qdrant_manager=self.qdrant_manager
            )

        result = self.query_engine.query(question, ticker_filter)

        # Print results
        print(f"\n{'=' * 70}")
        print(f"ANSWER")
        print(f"__{result['answer']}__\n") # Bolding answer

        print(f"{'=' * 70}")
        print(f"SOURCES ({len(result['sources'])} chunks)")
        print(f"{'=' * 70}")
        for i, source in enumerate(result['sources'], 1):
            print(f"\n{i}. {source['company']} ({source['ticker']}) - {source['item']}")
            print(f"   Filing Date: {source['filing_date']}")
            # --- MODIFICATION ---
            # print(f"   Is Table: {source['is_table']}") # <-- REMOVED
            # --- END MODIFICATION ---
            print(f"   Re-Rank Score: {source['rerank_score']:.4f} (Main score)")
            print(f"   Vector Score: {source['retrieval_score']:.4f} (Initial retrieval)")

        print(f"\n{'=' * 70}\n")

        return result

In [17]:
# ===========================================================================
# USAGE EXAMPLE (Running 6 questions and saving all results to one file)
# ===========================================================================
import os
import json

if __name__ == "__main__":

    # =======================================================================
    # 1. INITIALIZE PIPELINE
    # =======================================================================
    pipeline = SECFilingRAGPipeline()

    # =======================================================================
    # 2. LOAD AND INDEX FILINGS
    # =======================================================================
    # --- IMPORTANT ---
    # As we discussed, this model uses a new collection ('sec_filings_10q_v2').
    # You MUST run this line ONCE to create and fill that collection.
    # After the first successful run, you can comment it out.
    
    print("Starting the indexing process... This may take a long time.")
    pipeline.load_and_index_filings(num_filings_per_ticker=4)
    print("Indexing complete. Proceeding to queries.")
    # --- End of Indexing ---

SEC 10-Q FILING RAG SYSTEM (v2 with Re-ranking)

 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
   ✓ Model loaded (dimension: 384)

   ✓ Initialized RecursiveCharacterTextSplitter (chunk: 800, overlap: 200)

Connecting to Qdrant Cloud...
   ✓ Connected to Qdrant
Starting the indexing process... This may take a long time.
CHECKING DATABASE: sec_filings_10q_GOLDEN_BENCHMARK
✓ Collection 'sec_filings_10q_GOLDEN_BENCHMARK' found with 12807 docs. Proceeding to query.
Indexing complete. Proceeding to queries.


In [18]:
# =======================================================================
# DEFINE QUESTIONS AND SAVE ALL ANSWERS TO ONE FILE
# =======================================================================
import os
import json

# Define the output folder
DATA_FOLDER = "data"
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)
    print(f"Created data folder: {DATA_FOLDER}")

# --- Standardized Query Set ---
queries_to_run = [
    {"question": "What are the main risk factors mentioned by each companies?", "ticker_filter": None},
    {"question": "What risks did Apple disclose in their latest 10-Q?", "ticker_filter": None},
    {"question": "Compare the revenue trends of NVIDIA and AMD", "ticker_filter": None},
    {"question": "What was Tesla's R&D spending in the latest quarter?", "ticker_filter": None},
    {"question": "How has Microsoft's operating income changed over the last year?", "ticker_filter": None},
    {"question": "What was the gross profit margin for all companies?", "ticker_filter": None},
]

print(f"\\nRunning {len(queries_to_run)} queries with (Model 2) Post-Retrieval Re-ranking...")

all_results = []

for query in queries_to_run:
    print(f"\\n--- Running Query: {query['question']} ---")
    
    result = pipeline.query(
        question=query["question"],
        ticker_filter=query["ticker_filter"]
    )
    
    query_output = {
        "question": query["question"],
        "ticker_filter_manual": query["ticker_filter"],
        "response": result
    }
    all_results.append(query_output)

# --- Standardized Save Path ---
save_path = os.path.join(DATA_FOLDER, "results_model_2_post_retrieval.json")
print(f"\\n--- All queries complete. Saving all results to: {save_path} ---")

try:
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=4)
    print(f"✓ Successfully saved all answers.")
except Exception as e:
    print(f"✗ Failed to save results: {e}")

print("\\nPipeline run finished.")

\nRunning 6 queries with (Model 2) Post-Retrieval Re-ranking...
\n--- Running Query: What are the main risk factors mentioned by each companies? ---

 Initializing Manual RAG Query Engine (v2 with Re-ranking)...

   ✓ Using existing embedding model from DocumentProcessor
   → Loading Cross-Encoder: cross-encoder/ms-marco-MiniLM-L-6-v2...
   ✓ Cross-Encoder model loaded.
   ✓ Using existing QdrantManager for search
   ✓ Initialized ChatOpenAI LLM
   ✓ Manual RAG Engine (v2) ready.

 Processing query with Manual Engine (v2): 'What are the main risk factors mentioned by each companies?'
   → Manually embedding query...
   → No filters applied. Searching all documents.
   → Searching Qdrant (retrieving Top 20 candidates)...

   → Retrieved 20 candidates.
   → Re-ranking all 20 candidates...
   → Re-ranking 20 text chunks with Cross-Encoder...
   → Re-ranked. Final 5 sources selected.
   → Formatting context for LLM...
   → Sending prompt to LLM...

ANSWER
__According to the provided contex