In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab Notebooks/DSA4213
# !ls -la

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/DSA4213
total 290
-rw------- 1 root root 27866 Nov  9 12:02 01_data_loading_exploration.ipynb
-rw------- 1 root root 70110 Nov  9 12:02 02_ner.ipynb
-rw------- 1 root root  3528 Nov  9 12:02 baseline_no_RAG.ipynb
-rw------- 1 root root 49243 Nov  9 15:27 base_RAG.ipynb
-rw------- 1 root root   395 Nov  9 12:29 .env
drwx------ 2 root root  4096 Nov  9 14:58 __pycache__
-rw------- 1 root root  8708 Nov  9 14:32 query_processor.py
-rw------- 1 root root 32273 Nov  9 14:55 query_processors.ipynb
-rw------- 1 root root 96703 Nov  9 12:02 RAG_post_retrieval.ipynb
-rw------- 1 root root  1082 Nov  9 15:05 requirements.txt


In [29]:
# Use the %pip magic (not !pip) so packages install into the running kernel's environment.
%pip install -r requirements.txt

Collecting requests==2.32.5 (from -r requirements.txt (line 2))
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting beautifulsoup4==4.12.3 (from -r requirements.txt (line 3))
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting lxml==5.3.0 (from -r requirements.txt (line 4))
  Using cached lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting sentence_transformers==5.0.0 (from -r requirements.txt (line 7))
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting openai==2.6.0 (from -r requirements.txt (line 13))
  Using cached openai-2.6.0-py3-none-any.whl.metadata (29 kB)
Collecting langchain==1.0.5 (from -r requirements.txt (line 14))
  Using cached langchain-1.0.5-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain_qdrant==1.1.0 (from -r requirements.txt (line 17))
  Using cached langchain_qdrant-1.1.0-py3-none-any.whl.metadata (2.0 kB)
Collecting langchain_community==0.

In [None]:
"""
SEC 10-Q Filing RAG System
A complete pipeline for loading, processing, embedding, and querying SEC 10-Q filings
using Qdrant vector database and OpenAI GPT-4o.
"""

import gc
import os
import re
import time
import uuid
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup, NavigableString

# External libraries (install via pip)
from langchain_core.documents import Document # Will be needed for document aggregation

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from openai import OpenAI

from dotenv import load_dotenv
load_dotenv()

import importlib.util, types, pathlib
import sys

# Load query_processor.py from the current working directory by explicit path,
# so the notebook does not rely on the working directory being on sys.path.
module_path = pathlib.Path.cwd() / "query_processor.py"
if not module_path.is_file():
    raise FileNotFoundError(f"query_processor.py not found at {module_path}")

spec = importlib.util.spec_from_file_location("query_processor", str(module_path))
if spec is None or spec.loader is None:
    raise ImportError(f"Could not build an import spec for {module_path}")

qp_mod = importlib.util.module_from_spec(spec)

# Register the module BEFORE executing it. Without this, a later
# `from query_processor import ...` would not find this object and would
# either fail or load a second, independent copy of the module.
sys.modules[spec.name] = qp_mod
try:
    spec.loader.exec_module(qp_mod)
except BaseException:
    # Do not leave a half-initialised module registered if execution fails.
    sys.modules.pop(spec.name, None)
    raise

QueryProcessor = qp_mod.QueryProcessor
qp = QueryProcessor()
print("Loaded via spec_from_file_location ✓")

print("QueryProcessor OK")

# from google.colab import userdata

# # Load secrets from Colab and set them as environment variables
# os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
# os.environ['QDRANT_URL'] = userdata.get('QDRANT_URL')
# os.environ['QDRANT_API_KEY'] = userdata.get('QDRANT_API_KEY')




Loaded via spec_from_file_location ✓
QueryProcessor OK


In [31]:
# ============================================================================
# PART 1: CONFIGURATION & SETUP
# ============================================================================

class Config:
    """Single place for credentials, SEC endpoints, model names and tuning knobs."""

    # --- Credentials -------------------------------------------------------
    # Read from the environment when this class body executes; each value is
    # None when the corresponding variable is not set.
    QDRANT_URL = os.environ.get("QDRANT_URL")
    QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

    # --- SEC EDGAR ---------------------------------------------------------
    # Headers sent with every EDGAR request, and the ticker -> CIK mapping file.
    SEC_HEADERS = {
        'User-Agent': 'SEC10Q-RAG-System research@example.com',
    }
    CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

    # --- Models ------------------------------------------------------------
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # produces 384-dim vectors
    LLM_MODEL = "gpt-4o"

    # --- Vector collection -------------------------------------------------
    COLLECTION_NAME = "sec_filings_10q"
    VECTOR_SIZE = 384  # must equal the embedding model's output dimension

    # --- Chunking (both values are in characters) ----------------------------
    CHUNK_SIZE = 800
    CHUNK_OVERLAP = 200

    # --- Retrieval ---------------------------------------------------------
    TOP_K = 5  # chunks returned per query

    # --- Companies to ingest -------------------------------------------------
    TICKERS = [
        'NVDA', 'AAPL', 'MSFT', 'AMZN', 'META',
        'GOOGL', 'TSLA', 'ORCL', 'JPM', 'AMD',
    ]

In [32]:
# ============================================================================
# PART 2: DOCUMENT LOADING
# ============================================================================

class SECDocumentLoader:
    """Fetches SEC 10-Q filings from EDGAR and parses them into Part/Item sections.

    Every method is a static or class method; the class keeps no instance state.
    """

    # Seconds to wait on a single EDGAR request. `requests` applies no timeout
    # by default, so without this a stalled connection would block forever.
    REQUEST_TIMEOUT = 30

    @staticmethod
    def _fetch(url: str):
        """GET `url` with the configured SEC headers and a timeout.

        Raises:
            requests.HTTPError: if the response has an error status code.
        """
        response = requests.get(
            url,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response

    @staticmethod
    def get_recent_10q_metadata(ticker: str, num_filings: int = 4) -> List[Dict[str, str]]:
        """
        Fetches the metadata (links, dates, etc.) for the latest N 10-Q filings.
        Does NOT fetch the actual HTML content.

        Args:
            ticker: The company ticker (e.g., 'AAPL'); matched case-insensitively
            num_filings: The number of recent 10-Q filings to fetch

        Returns:
            List of metadata dictionaries with keys 'ticker', 'company_name',
            'filing_date', 'cik' and 'filing_url', in the order the filings
            appear in the submissions feed.

        Raises:
            ValueError: if the ticker is not in the CIK mapping, or the company
                has no 10-Q among its recent filings.
            requests.HTTPError: if either EDGAR request returns an error status.
        """
        symbol = ticker.upper()
        print(f"  → Fetching CIK for ticker: {ticker}...")

        # Get CIK mapping
        company_data = SECDocumentLoader._fetch(Config.CIK_MAP_URL).json()

        # Find CIK (zero-padded to 10 digits, as the submissions endpoint expects)
        cik = None
        company_name = None
        for company in company_data.values():
            if company['ticker'] == symbol:
                cik = str(company['cik_str']).zfill(10)
                company_name = company['title']
                break

        if not cik:
            raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")

        print(f"  → Found CIK: {cik} ({company_name})")

        # Fetch submission history
        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        time.sleep(0.1)  # Rate limiting
        submissions = SECDocumentLoader._fetch(submissions_url).json()

        # 'recent' holds parallel lists: index i of each list describes the same filing.
        recent = submissions['filings']['recent']

        # Find latest N 10-Q filings metadata
        filings_metadata = []
        for i, form in enumerate(recent['form']):
            if form != '10-Q':
                continue

            accession_number_clean = recent['accessionNumber'][i].replace('-', '')
            primary_document = recent['primaryDocument'][i]

            # Construct the filing URL
            filing_url = (
                f"https://www.sec.gov/Archives/edgar/data/{cik}/"
                f"{accession_number_clean}/{primary_document}"
            )

            filings_metadata.append({
                'ticker': symbol,
                'company_name': company_name,
                'filing_date': recent['filingDate'][i],
                'cik': cik,
                'filing_url': filing_url
            })

            if len(filings_metadata) >= num_filings:
                break

        if not filings_metadata:
            raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")

        print(f"  → Found {len(filings_metadata)} recent 10-Q filing metadata entries.")
        return filings_metadata

    @staticmethod
    def get_filing_html(filing_url: str) -> str:
        """Fetches the HTML content for a single filing URL.

        Raises:
            requests.HTTPError: if EDGAR returns an error status.
        """
        time.sleep(0.1)  # Rate limiting
        return SECDocumentLoader._fetch(filing_url).text

    @staticmethod
    def _normalize_header_text(text: str) -> Optional[str]:
        """Normalizes header text to a canonical section key.

        Returns "PART I" / "PART II" or "ITEM <digit>[letter]" (upper-cased,
        single spaces) when `text` starts with such a header, otherwise None.
        """
        text = text.strip().upper()

        # Match "PART I" or "PART II"
        part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
        if part_match:
            return re.sub(r'\s+', ' ', part_match.group(1))

        # Match "ITEM 1", "ITEM 1A", etc.
        item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
        if item_match:
            return re.sub(r'\s+', ' ', item_match.group(1))

        return None

    @staticmethod
    def _parse_html_table(table_tag) -> str:
        """Converts an HTML table to Markdown.

        The first non-empty row is used as the header; later rows are padded
        or truncated to the header's width. Returns "" for a table with no
        non-empty rows.
        """
        rows = []
        for tr in table_tag.find_all('tr'):
            # Collapse all internal whitespace in each cell to single spaces.
            cells = [" ".join(cell.get_text(strip=True).split())
                     for cell in tr.find_all(['td', 'th'])]
            if any(cells):
                rows.append(cells)

        if not rows:
            return ""

        header = rows[0]
        width = len(header)
        md_output = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(['---'] * width) + " |",
        ]

        for row in rows[1:]:
            # Build a new list instead of mutating the parsed row in place.
            fitted = (row + [""] * width)[:width]
            md_output.append("| " + " | ".join(fitted) + " |")

        return "\n" + "\n".join(md_output) + "\n"

    @classmethod
    def parse_10q(cls, html_content: str) -> Dict:
        """Parses filing HTML into ``{part_key: {item_key: text}}``.

        Header candidates are <p>/<b>/<strong>/<div> elements of at most 100
        characters whose text starts with a PART or ITEM header and that are
        not nested inside a link. Each ITEM's content is everything between
        its header element and the next detected header; top-level tables
        are rendered as Markdown. When the same item key occurs more than once
        within a part, the last occurrence wins. Returns {} when no header is
        found.
        """
        # Use 'lxml' for better memory efficiency
        soup = BeautifulSoup(html_content, 'lxml')

        potential_headers = soup.find_all(['p', 'b', 'strong', 'div'])

        doc_headers = []
        for header in potential_headers:
            text = header.get_text(strip=True)
            if len(text) > 100:
                continue

            normalized_key = cls._normalize_header_text(text)
            # Headers wrapped in <a> are skipped (e.g. table-of-contents links).
            if normalized_key and not header.find_parent('a'):
                doc_headers.append({'tag': header, 'key': normalized_key})

        if not doc_headers:
            return {}

        parsed_data = defaultdict(lambda: defaultdict(str))
        current_part_key = None

        for i, header_info in enumerate(doc_headers):
            current_key = header_info['key']

            if 'PART' in current_key:
                current_part_key = current_key
                continue

            if 'ITEM' in current_key:
                # An item seen before any PART header is filed under PART I.
                if not current_part_key:
                    current_part_key = "PART I"

                start_node = header_info['tag']
                end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None

                content_parts = []
                element = start_node.next_element

                # Compare by identity: BeautifulSoup's ==/!= compares markup
                # structurally, so an unrelated element with the same markup as
                # the next header would end the section early (and each deep
                # comparison is costly on large subtrees).
                while element is not None and element is not end_node:
                    if isinstance(element, NavigableString):
                        # Text inside tables is emitted via the Markdown rendering below.
                        if not element.find_parent('table'):
                            text = element.strip()
                            if text:
                                content_parts.append(text)
                    elif element.name == 'table':
                        # Only top-level tables; nested ones are covered by their parent.
                        if not element.find_parent('table'):
                            table_markdown = cls._parse_html_table(element)
                            if table_markdown:
                                content_parts.append(table_markdown)

                    element = element.next_element

                full_content = "\n".join(content_parts)
                clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()

                parsed_data[current_part_key][current_key] = clean_content

        return {part: dict(items) for part, items in parsed_data.items()}

In [33]:
# ============================================================================
# PART 3: TEXT CHUNKING & EMBEDDING
# ============================================================================
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

class DocumentProcessor:
    """
    Turns parsed 10-Q sections into embedded points: sections are split with
    LangChain's recursive character splitter and the resulting chunks are
    embedded batch by batch.
    """

    def __init__(self, embedding_model_name: str = Config.EMBEDDING_MODEL):
        """Load the sentence-embedding model and set up the text splitter."""
        print(f"\n Loading embedding model: {embedding_model_name}")
        self.model = SentenceTransformer(embedding_model_name)
        dimension = self.model.get_sentence_embedding_dimension()
        print(f"   ✓ Model loaded (dimension: {dimension})")

        # Splitter settings come from Config; lengths are measured with len().
        splitter_options = dict(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            add_start_index=False,  # keep chunk metadata minimal
        )
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
        print(f"   ✓ Initialized RecursiveCharacterTextSplitter (chunk: {Config.CHUNK_SIZE}, overlap: {Config.CHUNK_OVERLAP})")

    def _embed_batch(self, chunks):
        """Embed one batch of chunk Documents in a single call and yield a PointStruct per chunk."""
        texts = [chunk.page_content for chunk in chunks]
        vectors = self.model.encode(texts, show_progress_bar=False)

        for chunk, vector in zip(chunks, vectors):
            # Payload = chunk text plus the section metadata carried on the chunk.
            yield PointStruct(
                id=str(uuid.uuid4()),
                vector=vector.tolist(),
                payload={'text': chunk.page_content, **chunk.metadata}
            )

    def generate_document_chunks(self, parsed_data: Dict, metadata: Dict,
                                 embed_batch_size: int = 1024):
        """
        Generator: split the parsed 10-Q sections into chunks, embed them in
        batches of `embed_batch_size`, and yield one PointStruct per chunk.

        Args:
            parsed_data: mapping of part -> {item -> section text}
            metadata: filing metadata; 'ticker', 'company_name', 'filing_date'
                and 'filing_url' are copied onto every chunk
            embed_batch_size: number of chunks embedded per encode() call

        Yields nothing when every section is empty.
        """
        # One LangChain Document per non-empty Item, tagged with its filing
        # metadata so the splitter propagates it to every chunk.
        section_docs = [
            Document(
                page_content=content,
                metadata={
                    'ticker': metadata['ticker'],
                    'company_name': metadata['company_name'],
                    'filing_date': metadata['filing_date'],
                    'filing_url': metadata['filing_url'],
                    'part': part,
                    'item': item
                }
            )
            for part, items in parsed_data.items()
            for item, content in items.items()
            if content
        ]

        if not section_docs:
            return  # nothing to embed; ends the generator

        print(f"     → Splitting {len(section_docs)} high-level 'Items' into smaller chunks...")
        chunked_docs = self.text_splitter.split_documents(section_docs)
        print(f"     → Generated {len(chunked_docs)} chunks")

        pending = []
        for chunk in chunked_docs:
            pending.append(chunk)

            # Flush as soon as the batch is full.
            if len(pending) >= embed_batch_size:
                yield from self._embed_batch(pending)
                pending = []

        # Flush the final partial batch, if any.
        if pending:
            yield from self._embed_batch(pending)

In [34]:
# ============================================================================
# PART 4: QDRANT VECTOR DATABASE
# ============================================================================

from qdrant_client import models

class QdrantManager:
    """Manages Qdrant vector database operations: collection setup, upload and search."""

    def __init__(self):
        """Create the Qdrant client from Config.QDRANT_URL / Config.QDRANT_API_KEY."""
        print(f"\nConnecting to Qdrant Cloud...")
        self.client = QdrantClient(
            url=Config.QDRANT_URL,
            api_key=Config.QDRANT_API_KEY
        )
        print(f"   ✓ Connected to Qdrant")

    def create_collection(self, collection_name: str = Config.COLLECTION_NAME,
                         vector_size: int = Config.VECTOR_SIZE):
        """Create (or drop and recreate) the collection and its payload indexes.

        WARNING: an existing collection with the same name is deleted first,
        so all previously indexed points are lost.

        Args:
            collection_name: Name of the collection
            vector_size: Dimension of the stored vectors
        """
        print(f"\n Setting up collection: {collection_name}")

        # Check if collection exists
        collections = self.client.get_collections().collections
        exists = any(col.name == collection_name for col in collections)

        if exists:
            print(f"   ⚠ Collection exists, recreating...")
            self.client.delete_collection(collection_name)

        # Create collection (cosine distance)
        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"   ✓ Collection created")

        # Keyword indexes on the payload fields used for filtering.
        for field_name in ("ticker", "item"):
            print(f"   → Creating payload index for '{field_name}'...")
            self.client.create_payload_index(
                collection_name=collection_name,
                field_name=field_name,
                field_schema=models.PayloadSchemaType.KEYWORD
            )
        print(f"   ✓ Payload indexes created.")

    def _upsert_batch(self, batch, collection_name: str) -> int:
        """Send one batch of points to Qdrant and return its size.

        `wait=False` returns without waiting for the write to be applied.
        """
        self.client.upsert(
            collection_name=collection_name,
            points=batch,
            wait=False  # speed over immediate consistency
        )
        return len(batch)

    def upsert_documents(self, points_generator,
                        collection_name: str = Config.COLLECTION_NAME,
                        batch_size: int = 2048) -> int:
        """
        Uploads document chunks from a generator in batches.

        Args:
            points_generator: A generator that yields PointStructs
            collection_name: Name of the collection
            batch_size: Number of points to upload at once

        Returns:
            Total number of chunks uploaded
        """
        print(f" Uploading chunks to Qdrant in batches of {batch_size}...")

        batch = []
        count = 0

        for point in points_generator:
            batch.append(point)

            if len(batch) >= batch_size:
                count += self._upsert_batch(batch, collection_name)
                print(f"     → Uploaded {count} chunks so far...")
                batch = []  # Reset batch

        # Upload any remaining points
        if batch:
            count += self._upsert_batch(batch, collection_name)

        print(f"  ✓ All chunks uploaded for this document. Total: {count}")
        return count

    def search(self, query_vector: List[float],
              collection_name: str = Config.COLLECTION_NAME,
              limit: int = Config.TOP_K,
              filter_dict: Optional[Dict] = None) -> List[Dict]:
        """
        Search for similar documents

        Args:
            query_vector: Embedded query vector (a list of floats, or any
                array-like exposing `.tolist()`)
            collection_name: Name of the collection
            limit: Number of results to return
            filter_dict: Optional exact-match filter (e.g., {"ticker": "AAPL"});
                every key/value pair must match

        Returns:
            List of search results with scores and payloads
        """

        qdrant_filter = None
        if filter_dict:
            qdrant_filter = models.Filter(
                must=[
                    models.FieldCondition(
                        key=key,
                        match=models.MatchValue(value=value)
                    )
                    for key, value in filter_dict.items()
                ]
            )

        # Callers may pass an array object rather than a list; normalise it to
        # a plain list so it serialises the same way with any client version.
        if hasattr(query_vector, "tolist"):
            query_vector = query_vector.tolist()

        # `QdrantClient.search` is deprecated in favour of `query_points`.
        # Prefer the new API and fall back only for older clients without it.
        if hasattr(self.client, "query_points"):
            hits = self.client.query_points(
                collection_name=collection_name,
                query=query_vector,
                limit=limit,
                query_filter=qdrant_filter,
                with_payload=True
            ).points
        else:
            hits = self.client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=limit,
                query_filter=qdrant_filter,
                with_payload=True
            )

        return [
            {
                'score': hit.score,
                'payload': hit.payload
            }
            for hit in hits
        ]

In [38]:
# ============================================================================
# PART 5: RAG QUERY PIPELINE
# ============================================================================
import os
from openai import OpenAI
from qdrant_client import models
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


class ManualRAGEngine:
    """
    Manual RAG query engine (replaces the earlier LangChainRAGEngine).

    Runs the RAG steps explicitly with components built elsewhere in the
    notebook: embed the question, search Qdrant, format the retrieved chunks
    into a prompt, and ask the LLM.
    """

    def __init__(self, document_processor: 'DocumentProcessor', qdrant_manager: 'QdrantManager', query_processor=None):
        """
        Initialize the engine.

        Args:
            document_processor: supplies the already-loaded embedding model (`.model`)
            qdrant_manager: used for vector search
            query_processor: optional object with a `process(question)` method;
                when None, the raw question is embedded and no filter is inferred
        """
        print("\n Initializing Manual RAG Query Engine...")

        # 1. Reuse the embedding model that the document processor already loaded
        self.embedding_model = document_processor.model
        print("   ✓ Using existing embedding model from DocumentProcessor")

        # 2. Keep the Qdrant manager for search
        self.qdrant_manager = qdrant_manager
        print("   ✓ Using existing QdrantManager for search")

        # 3. Optional QueryProcessor
        self.query_processor = query_processor
        if self.query_processor is not None:
            print("   ✓ QueryProcessor attached (query normalization, filters, embeddings)")
        else:
            print("   No QueryProcessor provided (using legacy behavior)")

        # 4. Initialize the LLM
        self.llm = ChatOpenAI(
            model=Config.LLM_MODEL,
            api_key=Config.OPENAI_API_KEY
        )
        print("   ✓ Initialized ChatOpenAI LLM")

        # 5. Prompt template with {context} and {input} placeholders
        template = """You are a helpful financial analyst assistant. Your role is to answer questions about SEC 10-Q filings based ONLY on the provided context.
- Base your answer strictly on the provided context from SEC filings
- Cite specific sections (e.g., "According to Item 1A...") when referencing information
- If the answer is not in the context, clearly state that

Context:
<context>
{context}
</context>

Question: {input}

Answer:"""
        self.prompt = ChatPromptTemplate.from_template(template)
        print("   ✓ Manual RAG Engine ready.")


    def _format_context(self, search_results: List[Dict]) -> str:
        """Render retrieved chunks as numbered "Source N (TICKER - ITEM)" blocks.

        A result whose payload is missing or None is rendered with placeholder
        values instead of raising.
        """
        blocks = []
        for i, result in enumerate(search_results, 1):
            payload = result.get('payload') or {}  # payload may be None
            text = payload.get('text', 'No text found')
            item = payload.get('item', 'N/A')
            ticker = payload.get('ticker', 'N/A')
            blocks.append(f"Source {i} ({ticker} - {item}):\n\"{text}\"\n\n")
        return "".join(blocks).strip()


    def _company_to_ticker(self, company: str) -> Optional[str]:
        """Map a company name to a ticker via query_processor.COMPANY_TICKERS.

        Returns None (and prints a warning) when that module cannot be imported,
        so a query degrades to an unfiltered search rather than crashing.
        """
        try:
            from query_processor import COMPANY_TICKERS
        except ImportError:
            print("   ⚠ query_processor.COMPANY_TICKERS unavailable; skipping company filter")
            return None
        return COMPANY_TICKERS.get(company.lower())


    def _build_retrieval_inputs(self, question: str):
        """Return ``(query_vector, filter_dict)`` for `question`.

        With a QueryProcessor attached, its normalized text, embedding and
        detected ticker/company filters are used; otherwise the raw question
        is embedded and no filter is applied.
        """
        if self.query_processor is None:
            # Fallback: legacy behavior
            return self.embedding_model.encode(question), None

        qobj = self.query_processor.process(question)
        print(f"→ Query label: {qobj['label']} | Filters: {qobj['filters']}")

        # Prefer the processor's own embedding; embed the normalized text only if it gave none.
        query_vector = qobj["embedding"]
        if query_vector is None:
            query_vector = self.embedding_model.encode(qobj["normalized"])

        filters = qobj["filters"]
        filter_dict = None
        if filters.get("ticker"):
            # Only the first detected ticker is used.
            filter_dict = {"ticker": filters["ticker"][0]}
        elif filters.get("company"):
            # Company name -> ticker, only when no explicit ticker was detected.
            tkr = self._company_to_ticker(filters["company"][0])
            if tkr:
                filter_dict = {"ticker": tkr}

        return query_vector, filter_dict


    def query(self, question: str, ticker_filter: str = None):
        """
        Answer `question` from the indexed filings.

        Args:
            question: the user's natural-language question
            ticker_filter: optional ticker restricting retrieval; overrides any
                filter inferred by the QueryProcessor. Case and surrounding
                whitespace are ignored.

        Returns:
            dict with 'answer' (str) and 'sources' (one dict per retrieved
            chunk: ticker, company, item, part, filing_date, score). When
            nothing is retrieved, 'sources' is empty and 'answer' says so.
        """
        print(f"\n Processing query with Manual Engine: '{question}'")

        # 1. Query processing / embedding
        print("   → Manually embedding query...")
        query_vector, filter_dict = self._build_retrieval_inputs(question)

        # 2. Explicit user override always wins. Tickers are indexed upper-cased
        #    and matched exactly, so normalise the override the same way.
        normalized_ticker = ticker_filter.strip().upper() if ticker_filter else ""
        if normalized_ticker:
            print(f"→ Overriding with explicit ticker filter: {normalized_ticker}")
            filter_dict = {"ticker": normalized_ticker}

        # 3. Search Qdrant
        print("   → Manually searching Qdrant...")
        search_results = self.qdrant_manager.search(
            query_vector=query_vector,
            limit=Config.TOP_K,
            filter_dict=filter_dict
        )

        if not search_results:
            return {'answer': 'No relevant context was found in the documents to answer this question.', 'sources': []}

        # 4. Build the prompt (the original question, not the normalized text, is shown to the LLM)
        print("   → Formatting context and building prompt...")
        formatted_context = self._format_context(search_results)
        final_prompt_message = self.prompt.format_messages(
            context=formatted_context,
            input=question
        )

        # 5. Invoke the LLM
        print("   → Sending prompt to LLM...")
        llm_response = self.llm.invoke(final_prompt_message)
        answer = llm_response.content

        # 6. Describe the retrieved chunks for the caller
        sources = []
        for result in search_results:
            payload = result.get('payload') or {}  # payload may be None
            sources.append({
                'ticker': payload.get('ticker'),
                'company': payload.get('company_name'),
                'item': payload.get('item'),
                'part': payload.get('part'),
                'filing_date': payload.get('filing_date'),
                'score': result['score']
            })

        return {
            'answer': answer,
            'sources': sources
        }

In [36]:
# ============================================================================
# PART 6: MAIN PIPELINE ORCHESTRATOR
# ============================================================================

class SECFilingRAGPipeline:
    """Main pipeline orchestrator.

    Wires together the ingestion components (``SECDocumentLoader``,
    ``DocumentProcessor``, ``QdrantManager``) and the query components
    (``QueryProcessor`` plus a ``ManualRAGEngine`` that is created lazily on
    the first call to :meth:`query`).
    """

    def __init__(self):
        """Initialize all components and print the start-up banner."""
        rule = "=" * 70
        print(rule)
        print("SEC 10-Q FILING RAG SYSTEM")
        print(rule)

        # Ingestion components
        self.loader = SECDocumentLoader()
        self.processor = DocumentProcessor()
        self.qdrant_manager = QdrantManager()
        # Query pre-processing
        self.query_processor = QueryProcessor()
        # Query engine is built on demand by query()
        self.query_engine = None

    def _index_single_filing(self, ticker: str, filing_metadata: dict) -> int:
        """Download, parse, chunk and upload one filing.

        Args:
            ticker: Ticker symbol, used only in log messages.
            filing_metadata: Mapping that must contain ``'filing_date'`` and
                ``'filing_url'``; it is also passed to the chunk generator.

        Returns:
            Whatever ``qdrant_manager.upsert_documents`` returns (the caller
            treats it as a count of uploaded chunks), or ``0`` when the
            filing could not be parsed into structured data.

        Raises:
            Any exception raised by the loader, processor or Qdrant manager
            propagates; the caller isolates failures per filing.
        """
        filing_date = filing_metadata['filing_date']
        filing_url = filing_metadata['filing_url']

        # Download ONE filing's HTML
        print(f"  → Downloading filing from: {filing_date}...")
        html_content = self.loader.get_filing_html(filing_url)

        # Parse ONE filing
        print("  → Parsing 10-Q structure...")
        parsed_data = self.loader.parse_10q(html_content)

        if not parsed_data:
            print(f"  ⚠ Warning: No structured data parsed for {ticker} on {filing_date}")
            del html_content
            gc.collect()
            return 0

        # Turn the parsed filing into a chunk generator and stream it to Qdrant
        print("  → Creating chunks and embeddings generator...")
        chunks_generator = self.processor.generate_document_chunks(parsed_data, filing_metadata)
        num_uploaded = self.qdrant_manager.upsert_documents(chunks_generator)

        # Release the large per-filing objects before the next download
        print("  → Cleaning up memory...")
        del html_content
        del parsed_data
        del chunks_generator
        gc.collect()
        print("  ✓ Memory cleaned.")

        return num_uploaded

    def load_and_index_filings(self, tickers: List[str] = None, num_filings_per_ticker: int = 1):
        """Load and index 10-Q filings for multiple companies.

        Filings are processed one at a time (download -> parse -> chunk ->
        upload) so that only a single filing is held in memory. A failure in
        one filing does not abort the remaining filings of that ticker, and a
        failure to fetch a ticker's metadata does not abort other tickers.

        Args:
            tickers: Ticker symbols to process. When ``None`` (the default),
                ``Config.TICKERS`` is read at call time, so edits to the
                configuration are picked up without redefining this class.
            num_filings_per_ticker: Number of recent filings requested from
                the loader for each ticker.

        Returns:
            None. Progress and a final summary are printed.
        """
        if tickers is None:
            tickers = Config.TICKERS

        rule = "=" * 70
        print(f"\n{rule}")
        print("LOADING & INDEXING PHASE (Streaming)")
        print(rule)
        print(f"\nProcessing {len(tickers)} companies: {', '.join(tickers)}")
        print(f"(Fetching {num_filings_per_ticker} filings per company)\n")

        # Create collection (do this once at the start)
        self.qdrant_manager.create_collection()

        successful_tickers = []
        failed_tickers = []
        total_chunks_indexed = 0

        for idx, ticker in enumerate(tickers, 1):
            print(f"\n[{idx}/{len(tickers)}] Processing {ticker}")
            print("-" * 70)

            ticker_chunks_count = 0
            num_filings_processed = 0

            try:
                # Metadata (dates and links) for the N most recent filings
                filings_metadata_list = self.loader.get_recent_10q_metadata(
                    ticker, num_filings=num_filings_per_ticker
                )

                for filing_metadata in filings_metadata_list:
                    try:
                        num_uploaded = self._index_single_filing(ticker, filing_metadata)
                        if num_uploaded > 0:
                            ticker_chunks_count += num_uploaded
                            total_chunks_indexed += num_uploaded
                            num_filings_processed += 1
                    except Exception as e:
                        # Best effort: log and move on to the next filing
                        print(f"  ✗ Error processing filing {filing_metadata.get('filing_date', 'unknown')} for {ticker}: {str(e)}")
                        gc.collect()  # Force cleanup on error

                # A ticker counts as successful only if at least one chunk was uploaded
                if ticker_chunks_count > 0:
                    successful_tickers.append(ticker)
                    print(f"  ✓ Finished {ticker}. Total chunks: {ticker_chunks_count} across {num_filings_processed} filings")
                else:
                    failed_tickers.append(ticker)
                    print(f"  ⚠ No chunks created for {ticker}")

            except Exception as e:
                print(f"  ✗ Error processing {ticker} (failed to get metadata): {str(e)}")
                failed_tickers.append(ticker)

            # Rate limiting: brief pause between tickers, skipped after the last one
            if idx < len(tickers):
                time.sleep(0.2)

        # Summary
        print(f"\n{rule}")
        print("INDEXING COMPLETE")
        print(rule)
        print(f"✓ Successfully processed: {len(successful_tickers)} companies")
        print(f"  {', '.join(successful_tickers)}")
        if failed_tickers:
            print(f"✗ Failed: {len(failed_tickers)} companies")
            print(f"  {', '.join(failed_tickers)}")
        print(f"\n Total chunks indexed: {total_chunks_indexed}")
        print(f"{rule}\n")

    def query(self, question: str, ticker_filter: str = None):
        """Query the indexed filings and print the answer with its sources.

        The ``ManualRAGEngine`` is created on first use and reused afterwards.

        Args:
            question: Natural-language question to answer.
            ticker_filter: Optional ticker symbol, forwarded unchanged to the
                engine's ``query`` method.

        Returns:
            The engine's result dict; this method reads its ``'answer'`` and
            ``'sources'`` keys (each source needs ``'company'``, ``'ticker'``,
            ``'item'``, ``'filing_date'`` and a numeric ``'score'``).
        """
        if self.query_engine is None:
            # The engine receives the processor, the Qdrant manager and the
            # query processor created in __init__
            self.query_engine = ManualRAGEngine(
                document_processor=self.processor,
                qdrant_manager=self.qdrant_manager,
                query_processor=self.query_processor
            )

        result = self.query_engine.query(question, ticker_filter)

        rule = "=" * 70
        print(f"\n{rule}")
        print("ANSWER")
        print(rule)
        print(f"\n{result['answer']}\n")

        print(rule)
        print(f"SOURCES ({len(result['sources'])} chunks)")
        print(rule)
        for i, source in enumerate(result['sources'], 1):
            print(f"\n{i}. {source['company']} ({source['ticker']}) - {source['item']}")
            print(f"   Filing Date: {source['filing_date']}")
            print(f"   Relevance Score: {source['score']:.4f}")

        # Closing rule followed by a blank line (a real newline, not a literal "\n")
        print(f"\n{rule}\n")

        return result

In [39]:
# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":

    # ------------------------------------------------------------------
    # Build the pipeline
    # ------------------------------------------------------------------
    pipeline = SECFilingRAGPipeline()

    # ------------------------------------------------------------------
    # Ingest: request 4 recent 10-Q filings per ticker (default ticker
    # list) and index them
    # ------------------------------------------------------------------
    pipeline.load_and_index_filings(num_filings_per_ticker=4)

    # ------------------------------------------------------------------
    # Ask the demo questions
    # Each entry is (question, explicit ticker filter or None)
    # ------------------------------------------------------------------
    demo_queries = [
        # 1: general question
        ("What are the main risk factors mentioned by tech companies?", None),
        # 2: company-specific question with an explicit ticker filter
        ("What risks did Apple disclose in their latest 10-Q?", "AAPL"),
        # 3: comparative question
        ("Compare the revenue trends of NVIDIA and AMD", None),
        # 4: specific metric question
        ("What was Tesla's R&D spending in the latest quarter?", None),
    ]
    for demo_question, demo_ticker in demo_queries:
        pipeline.query(demo_question, ticker_filter=demo_ticker)

SEC 10-Q FILING RAG SYSTEM

 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
   ✓ Model loaded (dimension: 384)
   ✓ Initialized RecursiveCharacterTextSplitter (chunk: 800, overlap: 200)

Connecting to Qdrant Cloud...
   ✓ Connected to Qdrant

LOADING & INDEXING PHASE (Streaming)

Processing 10 companies: NVDA, AAPL, MSFT, AMZN, META, GOOGL, TSLA, ORCL, JPM, AMD
(Fetching 4 filings per company)


 Setting up collection: sec_filings_10q
   ⚠ Collection exists, recreating...
   ✓ Collection created
   → Creating payload index for 'ticker'...
   → Creating payload index for 'item'...
   ✓ Payload indexes created.

[1/10] Processing NVDA
----------------------------------------------------------------------
  → Fetching CIK for ticker: NVDA...
  → Found CIK: 0001045810 (NVIDIA CORP)
  → Found 4 recent 10-Q filing metadata entries.
  → Downloading filing from: 2025-08-27...
  → Parsing 10-Q structure...



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')


  → Creating chunks and embeddings generator...
 Uploading chunks to Qdrant in batches of 2048...
     → Splitting 9 high-level 'Items' into smaller chunks...
     → Generated 245 chunks
  ✓ All chunks uploaded for this document. Total: 245
  → Cleaning up memory...
  ✓ Memory cleaned.
  → Downloading filing from: 2025-05-28...
  → Parsing 10-Q structure...
  → Creating chunks and embeddings generator...
 Uploading chunks to Qdrant in batches of 2048...
     → Splitting 9 high-level 'Items' into smaller chunks...
     → Generated 245 chunks
  ✓ All chunks uploaded for this document. Total: 245
  → Cleaning up memory...
  ✓ Memory cleaned.
  → Downloading filing from: 2024-11-20...
  → Parsing 10-Q structure...
  → Creating chunks and embeddings generator...
 Uploading chunks to Qdrant in batches of 2048...
     → Splitting 9 high-level 'Items' into smaller chunks...
     → Generated 257 chunks
  ✓ All chunks uploaded for this document. Total: 257
  → Cleaning up memory...
  ✓ Memory cl

  results = self.client.search(



ANSWER

According to the provided context from AMD's SEC 10-Q filing, the main risk factors mentioned include:

1. **Economic and Strategic Risks**:
   - Intel Corporation's dominance of the microprocessor market and its aggressive business practices, which may limit AMD's ability to compete effectively (Source 1, Source 2, Source 5).
   - The markets in which AMD's products are sold being highly competitive and rapidly evolving (Source 3, Source 4).
   - The semiconductor industry's cyclical nature, having experienced severe downturns (Source 3, Source 4).

2. **General Risks**:
   - AMD's worldwide operations being subject to political, legal, and economic risks, as well as natural disasters (Source 5).
   - Potential future impairments of technology license purchases (Source 5).
   - Challenges in attracting and retaining qualified personnel, which may hinder AMD's business (Source 5).
   - Volatility of AMD's stock price (Source 5).

These risk factors highlight both industry-spec