In [1]:
import os
import re
import uuid
import time
import requests
import gc
import json
import unicodedata
from typing import List, Dict, Tuple, Any, Optional
from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString
from dataclasses import dataclass, field

# External libraries
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

import warnings
# Suppress every warning category for the rest of the session. This keeps the
# notebook output short but also hides deprecation notices from the libraries
# imported above.
warnings.filterwarnings("ignore")
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where QDRANT_URL / QDRANT_API_KEY (read by
# Config via os.getenv) come from - confirm against your deployment.
load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

In [None]:
# ===========================================================================
# 1. CONFIGURATION
# ===========================================================================
class Config:
    """Static settings shared by the loader, processor and Qdrant manager.

    All values are class attributes evaluated once, when this cell runs.
    Environment-backed values therefore reflect the environment at that moment.
    """

    # Qdrant connection details; each is None when its variable is unset.
    QDRANT_URL = os.getenv("QDRANT_URL")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

    # Headers sent with every SEC request. The User-Agent can be supplied via
    # the SEC_USER_AGENT environment variable (e.g. "MyApp me@example.com");
    # the original placeholder remains the fallback so existing setups behave
    # exactly as before.
    SEC_HEADERS = {
        'User-Agent': (os.getenv("SEC_USER_AGENT") or "").strip()
                      or 'YourAppName youremail@example.com'
    }
    # JSON document mapping tickers to CIK numbers and company titles.
    CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

    # Sentence-transformers model used for embeddings, and the vector size
    # the Qdrant collection is created with (must match the model's output).
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    VECTOR_SIZE = 384

    # Name of the Qdrant collection that receives the chunks.
    COLLECTION_NAME = "sec_filings_10q_GOLDEN_BENCHMARK"

    # Text splitting parameters, measured in characters.
    CHUNK_SIZE = 800
    CHUNK_OVERLAP = 200

    # Companies ingested by default.
    TICKERS = ['NVDA', 'AAPL', 'MSFT', 'AMZN', 'META', 'GOOGL', 'TSLA', 'ORCL', 'JPM', 'AMD']

In [None]:
# ===========================================================================
# 2. DOCUMENT LOADER
# ===========================================================================
class SECDocumentLoader:
    """
    Enhanced document loader with robust table parsing and validation
    """
    
    @staticmethod
    def get_recent_10q_metadata(ticker: str, num_filings: int = 4,
                                timeout: float = 30.0) -> List[Dict[str, str]]:
        """
        Fetches the metadata (links, dates, etc.) for the latest N 10-Q filings.
        Does NOT fetch the actual HTML content.

        Args:
            ticker: The company ticker (e.g., 'AAPL'); matched case-insensitively
            num_filings: The number of recent 10-Q filings to fetch
            timeout: Seconds to wait on each HTTP request before giving up.
                Without a timeout, requests can block indefinitely on a
                stalled connection.

        Returns:
            List of metadata dictionaries, in the order the filings appear in
            the submissions feed, each with the keys 'ticker', 'company_name',
            'filing_date', 'cik' and 'filing_url'.

        Raises:
            ValueError: The ticker is not in the CIK mapping, or no 10-Q
                filing is listed for it.
            requests.RequestException: An HTTP request failed, timed out, or
                returned an error status.
        """
        ticker_symbol = ticker.upper()
        print(f"  â†’ Fetching CIK for ticker: {ticker}...")

        # Get CIK mapping
        response = requests.get(Config.CIK_MAP_URL, headers=Config.SEC_HEADERS, timeout=timeout)
        response.raise_for_status()
        company_data = response.json()

        # Find CIK (zero-padded to 10 digits, as used by the submissions URL)
        cik = None
        company_name = None
        for company in company_data.values():
            if company['ticker'] == ticker_symbol:
                cik = str(company['cik_str']).zfill(10)
                company_name = company['title']
                break

        if not cik:
            raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")

        print(f"  â†’ Found CIK: {cik} ({company_name})")

        # Fetch submission history
        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        time.sleep(0.1)  # Rate limiting
        response = requests.get(submissions_url, headers=Config.SEC_HEADERS, timeout=timeout)
        response.raise_for_status()
        recent = response.json()['filings']['recent']

        # 'recent' holds parallel lists, one entry per filing; walk them
        # together and keep the first N entries whose form is exactly '10-Q'.
        filings_metadata = []
        for form, accession_number, primary_document, filing_date in zip(
            recent['form'],
            recent['accessionNumber'],
            recent['primaryDocument'],
            recent['filingDate'],
        ):
            if form != '10-Q':
                continue

            # The archive path uses the accession number without dashes.
            accession_number_clean = accession_number.replace('-', '')
            filing_url = (
                f"https://www.sec.gov/Archives/edgar/data/{cik}/"
                f"{accession_number_clean}/{primary_document}"
            )

            filings_metadata.append({
                'ticker': ticker_symbol,
                'company_name': company_name,
                'filing_date': filing_date,
                'cik': cik,
                'filing_url': filing_url
            })

            if len(filings_metadata) >= num_filings:
                break

        if not filings_metadata:
            raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")

        print(f"  â†’ Found {len(filings_metadata)} recent 10-Q filing metadata entries.")
        return filings_metadata

    @staticmethod
    def get_filing_html(filing_url: str, timeout: float = 30.0) -> str:
        """Fetches the HTML content for a single filing URL.

        Args:
            filing_url: Absolute URL of the filing's primary document.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the whole ingestion run.

        Returns:
            The response body decoded as text.

        Raises:
            requests.RequestException: The request failed, timed out, or
                returned an error status.
        """
        time.sleep(0.1)  # Rate limiting
        response = requests.get(filing_url, headers=Config.SEC_HEADERS, timeout=timeout)
        response.raise_for_status()
        return response.text
    
    @staticmethod
    def _normalize_header_text(text: str) -> str | None:
        """Normalizes header text to standard format"""
        text = text.strip().upper()
        part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
        if part_match:
            return re.sub(r'\s+', ' ', part_match.group(1))
        item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
        if item_match:
            return re.sub(r'\s+', ' ', item_match.group(1))
        return None

    @staticmethod
    def _validate_markdown_table(markdown_str: str) -> bool:
        """
        Tests if a markdown table can be successfully parsed into a DataFrame
        """
        try:
            lines = [line.strip() for line in markdown_str.strip().split('\n') if line.strip()]
            
            if len(lines) < 3:
                return False
            
            # Find separator
            separator_index = -1
            for i, line in enumerate(lines):
                if '|' in line and '---' in line:
                    cleaned = line.replace('|', '').replace('-', '').replace(' ', '')
                    if len(cleaned) < 5:
                        separator_index = i
                        break
            
            if separator_index == -1 or separator_index == 0:
                return False
            
            # Check header
            header_line = lines[separator_index - 1]
            header = [h.strip() for h in header_line.strip('|').split('|')]
            
            if not header or len(header) < 2:
                return False
            
            # Check at least one data row exists
            if separator_index + 1 >= len(lines):
                return False
            
            # Try to parse with pandas
            import pandas as pd
            df = pd.read_csv(pd.io.common.StringIO(
                '\n'.join([line.strip('|') for line in lines if '|' in line and '---' not in line])
            ), sep='|', skipinitialspace=True)
            
            return not df.empty and df.shape[1] >= 2
            
        except Exception:
            return False

    @staticmethod
    def _parse_html_table(table_tag: BeautifulSoup) -> str:
        """
        Converts HTML table to validated Markdown format
        --- ENHANCED VERSION with validation ---
        """
        try:
            # Skip nested tables
            if table_tag.find_parent('table'):
                return ""
            
            markdown_rows = []
            max_cols = 0
            
            # First pass: collect all rows and find max column count
            for tr in table_tag.find_all('tr', recursive=False):
                cells = []
                for cell in tr.find_all(['td', 'th'], recursive=False):
                    # Clean text
                    text = " ".join(cell.get_text(strip=True).split())
                    # Handle colspan
                    colspan = int(cell.get('colspan', 1))
                    cells.extend([text] + [''] * (colspan - 1))
                
                if cells and any(c for c in cells):  # Skip empty rows
                    markdown_rows.append(cells)
                    max_cols = max(max_cols, len(cells))
            
            if not markdown_rows or max_cols < 2:
                return ""
            
            # Normalize all rows to have same column count
            for row in markdown_rows:
                while len(row) < max_cols:
                    row.append("")
                if len(row) > max_cols:
                    row[:] = row[:max_cols]
            
            # Build markdown
            md_output = []
            
            # Header (use first row or create generic)
            header = markdown_rows[0] if markdown_rows else [f"Col{i}" for i in range(max_cols)]
            header = [h if h else f"Column_{i}" for i, h in enumerate(header)]
            
            md_output.append("| " + " | ".join(header) + " |")
            md_output.append("| " + " | ".join(['---'] * len(header)) + " |")
            
            # Data rows
            for row in markdown_rows[1:]:
                while len(row) < len(header):
                    row.append("")
                row = row[:len(header)]
                md_output.append("| " + " | ".join(row) + " |")
            
            result = "\n" + "\n".join(md_output) + "\n"
            
            # Validate the table before returning
            if not SECDocumentLoader._validate_markdown_table(result):
                return ""
            
            return result
            
        except Exception as e:
            print(f"  [Table Parser Error] {e}")
            return ""

    @classmethod
    def parse_10q(cls, html_content: str) -> dict:
        """
        Parses 10-Q HTML with enhanced table handling

        Header candidates are <p>, <b>, <strong> and <div> elements whose text
        is at most 100 characters, normalizes to a "PART x" / "ITEM n" key and
        is not inside a link. For each ITEM header, the document is walked
        element by element up to the next detected header: text outside tables
        is collected as-is and top-level tables are converted to Markdown.

        Args:
            html_content: Raw HTML of the filing.

        Returns:
            {part_key: {item_key: text}}. Items seen before any PART header are
            filed under "PART I". When the same part/item key is seen more than
            once, the last occurrence wins. Returns {} if no header is found.
        """
        print("  â†’ Parsing HTML content...")
        soup = BeautifulSoup(html_content, 'html.parser')

        doc_headers = []
        for header in soup.find_all(['p', 'b', 'strong', 'div']):
            text = header.get_text(strip=True)
            if len(text) > 100:
                continue

            normalized_key = cls._normalize_header_text(text)
            # Headers wrapped in <a> are skipped (e.g. linked index entries).
            if normalized_key and not header.find_parent('a'):
                doc_headers.append({'tag': header, 'key': normalized_key})

        if not doc_headers:
            print("  âš  Warning: Could not find any standard Part/Item headers.")
            return {}

        parsed_data = defaultdict(lambda: defaultdict(str))
        current_part_key = None

        for i, header_info in enumerate(doc_headers):
            current_key = header_info['key']

            if 'PART' in current_key:
                current_part_key = current_key
                continue

            if 'ITEM' in current_key:
                if not current_part_key:
                    current_part_key = "PART I"

                start_node = header_info['tag']
                end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None

                content_parts = []
                element = start_node.next_element

                # Identity checks are required here. Beautiful Soup's == / !=
                # compare markup structurally, so a look-alike element would
                # end the section early, and an empty NavigableString is falsy,
                # so a plain truthiness test would stop the walk on it.
                while element is not None and element is not end_node:
                    if isinstance(element, NavigableString):
                        # Text inside tables is emitted via the table itself.
                        if not element.find_parent('table'):
                            text = element.strip()
                            if text:
                                content_parts.append(text)
                    elif element.name == 'table':
                        if not element.find_parent('table'):
                            table_markdown = cls._parse_html_table(element)
                            if table_markdown:  # Only add if validation passed
                                content_parts.append(table_markdown)

                    element = element.next_element

                full_content = "\n".join(content_parts)
                # Collapse runs of 3+ newlines to a single blank line.
                clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()
                parsed_data[current_part_key][current_key] = clean_content

        print("  â†’ Parsing complete.")
        return {part: dict(items) for part, items in parsed_data.items()}

In [None]:
# ===========================================================================
# 3. DOCUMENT PROCESSOR
# ===========================================================================
class DocumentProcessor:
    """
    Splits parsed 10-Q sections into overlapping character chunks and embeds them.

    Markdown tables produced by the loader are part of each section's text and
    go through the same splitter as prose, so a table may be divided across
    several chunks.
    """

    def __init__(self, embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Load the embedding model and build the text splitter.

        Args:
            embedding_model_name: Name passed to SentenceTransformer.
        """
        print(f"\n Loading embedding model: {embedding_model_name}")
        self.model = SentenceTransformer(embedding_model_name)
        print(f"   âœ“ Model loaded (dimension: {self.model.get_sentence_embedding_dimension()})")

        # Chunk geometry comes from Config so the documented settings and the
        # splitter cannot drift apart.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            add_start_index=False
        )
        print(f"   âœ“ Initialized RecursiveCharacterTextSplitter")

    def generate_document_chunks(self, parsed_data: Dict, metadata: Dict,
                                 embed_batch_size: int = 512):
        """
        Process documents (treats all content as text)

        Args:
            parsed_data: {part: {item: text}} as returned by
                SECDocumentLoader.parse_10q. Items with empty text are skipped.
            metadata: Filing metadata; must contain 'ticker', 'company_name',
                'filing_date' and 'filing_url'.
            embed_batch_size: Number of chunks encoded per model call. Values
                below 1 are treated as 1.

        Yields:
            One PointStruct per chunk with a random UUID id, the embedding as
            a list, and a payload holding the chunk text plus the filing
            metadata, part and item.
        """
        all_docs = []
        for part, items in parsed_data.items():
            for item, content in items.items():
                if not content:
                    continue

                doc_metadata = {
                    'ticker': metadata['ticker'],
                    'company_name': metadata['company_name'],
                    'filing_date': metadata['filing_date'],
                    'filing_url': metadata['filing_url'],
                    'part': part,
                    'item': item,
                }
                all_docs.append(Document(page_content=content, metadata=doc_metadata))

        if not all_docs:
            return

        # Process all documents (tables will be chunked just like text)
        print(f"     â†’ Splitting {len(all_docs)} text documents into chunks...")
        chunked_docs = self.text_splitter.split_documents(all_docs)
        print(f"     â†’ Generated {len(chunked_docs)} total chunks")

        # Embed in slices so the final partial batch takes the same path as
        # the full ones.
        step = max(1, embed_batch_size)
        for start in range(0, len(chunked_docs), step):
            batch = chunked_docs[start:start + step]
            embeddings = self.model.encode(
                [chunk.page_content for chunk in batch],
                show_progress_bar=False
            )

            for chunk, emb in zip(batch, embeddings):
                yield PointStruct(
                    id=str(uuid.uuid4()),
                    vector=emb.tolist(),
                    payload={'text': chunk.page_content, **chunk.metadata}
                )

In [None]:
# ===========================================================================
# 4. QDRANT MANAGER
# ===========================================================================
class QdrantManager:
    """Thin wrapper around a QdrantClient for collection setup and batched uploads."""

    def __init__(self):
        """Open a client using Config.QDRANT_URL and Config.QDRANT_API_KEY."""
        print("\nConnecting to Qdrant Cloud...")
        self.client = QdrantClient(url=Config.QDRANT_URL, api_key=Config.QDRANT_API_KEY)
        print("   âœ“ Connected to Qdrant")

    def create_collection(self, collection_name: str = Config.COLLECTION_NAME, vector_size: int = Config.VECTOR_SIZE):
        """Create the collection from scratch and add keyword payload indexes.

        An existing collection with the same name is deleted first. If the
        existence check itself fails, the error is printed and creation is
        attempted anyway. Keyword indexes are created on 'ticker' and 'item'.
        """
        print(f"\n Setting up collection: {collection_name}")
        try:
            known_names = [col.name for col in self.client.get_collections().collections]
            if collection_name in known_names:
                print(f"   âš  Collection '{collection_name}' exists, recreating...")
                self.client.delete_collection(collection_name)
        except Exception as e:
            print(f"Could not check collections: {e}. Assuming it does not exist.")

        vector_params = models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
        self.client.create_collection(collection_name=collection_name, vectors_config=vector_params)
        print("   âœ“ Collection created")

        for indexed_key in ("ticker", "item"):
            print(f"   â†’ Creating payload index for '{indexed_key}' (Keyword)...")
            self.client.create_payload_index(
                collection_name=collection_name,
                field_name=indexed_key,
                field_schema=models.PayloadSchemaType.KEYWORD,
            )

        print("   âœ“ Payload indexes created.")

    def upsert_documents(self, points_generator, collection_name: str = Config.COLLECTION_NAME, batch_size: int = 512) -> int:
        """Drain an iterable of points into the collection in fixed-size batches.

        Each batch is upserted with wait=True. A progress line is printed
        after every full batch; the trailing partial batch is uploaded without
        one. Returns the total number of points uploaded.
        """
        print(f" Uploading chunks to Qdrant in batches of {batch_size}...")
        uploaded = 0
        pending = []

        for point in points_generator:
            pending.append(point)
            if len(pending) < batch_size:
                continue

            self.client.upsert(collection_name=collection_name, points=pending, wait=True)
            uploaded += len(pending)
            print(f"     â†’ Uploaded {uploaded} chunks...")
            pending = []

        # Whatever is left over forms the final, smaller batch.
        if pending:
            self.client.upsert(collection_name=collection_name, points=pending, wait=True)
            uploaded += len(pending)

        print(f"  âœ“ All chunks uploaded. Total: {uploaded}")
        return uploaded

In [None]:
# ===========================================================================
# 5. MAIN INGESTION PIPELINE
# ===========================================================================
class SECFilingRAGPipeline:
    """End-to-end ingestion: download 10-Q filings, parse, chunk, embed, upload."""

    def __init__(self):
        """Create the loader, the document processor and the Qdrant manager."""
        print("=" * 70)
        print("SEC 10-Q RAG SYSTEM (Golden Database Builder)")
        print("=" * 70)
        self.loader = SECDocumentLoader()
        self.processor = DocumentProcessor()
        self.qdrant_manager = QdrantManager()

    def load_and_index_filings(self, tickers: List[str] = Config.TICKERS,
                               num_filings_per_ticker: int = 4,
                               force_reindex: bool = False):
        """Index the most recent 10-Q filings of each ticker into Qdrant.

        The collection is inspected BEFORE it is (re)created. Previously the
        collection was dropped and recreated first, so the "already has docs"
        guard could never trigger and every run wiped the existing data.

        Args:
            tickers: Ticker symbols to process.
            num_filings_per_ticker: How many recent 10-Q filings to ingest per
                ticker.
            force_reindex: When True, skip the "already populated" check and
                rebuild the collection from scratch.

        Returns:
            None. Progress and a final summary are printed. A ticker counts as
            successful when at least one chunk was uploaded for it.
        """
        print(f"\n{'=' * 70}\nLOADING & INDEXING PHASE\n{'=' * 70}")
        print(f"\nProcessing {len(tickers)} companies: {', '.join(tickers)}")
        print(f"(Fetching {num_filings_per_ticker} filings per company)\n")

        if not force_reindex:
            try:
                existing = self.qdrant_manager.client.count(Config.COLLECTION_NAME, exact=True).count
            except Exception:
                # The count call fails when the collection does not exist yet.
                print("Collection is new. Proceeding with indexing.")
                existing = 0

            if existing > 0:
                print(f"Collection '{Config.COLLECTION_NAME}' already has {existing} docs. SKIPPING INDEXING.")
                print("To re-index, pass force_reindex=True or delete the collection in your Qdrant dashboard.")
                print(f"{'=' * 70}\n")
                return

        # Drops any existing collection of that name and creates a fresh one.
        self.qdrant_manager.create_collection()

        successful_tickers, failed_tickers = [], []
        total_chunks_indexed = 0

        for idx, ticker in enumerate(tickers, 1):
            print(f"\n[{idx}/{len(tickers)}] Processing {ticker}")
            print("-" * 70)
            ticker_chunks_count = 0

            try:
                # 1. Get METADATA (list of URLs, dates, etc.)
                filings_metadata_list = self.loader.get_recent_10q_metadata(
                    ticker, num_filings=num_filings_per_ticker
                )

                for filing_metadata in filings_metadata_list:
                    # One bad filing must not abort the remaining filings.
                    try:
                        # 2. Get the HTML for ONE filing
                        print(f"  â†’ Downloading filing from: {filing_metadata['filing_date']}...")
                        html_content = self.loader.get_filing_html(filing_metadata['filing_url'])

                        # 3. Parse ONE filing
                        parsed_data = self.loader.parse_10q(html_content)
                        if not parsed_data:
                            print(f"  âš  Warning: No data parsed for {ticker} on {filing_metadata['filing_date']}")
                            del html_content
                            gc.collect()
                            continue

                        # 4. Chunk and embed (lazy: work happens while uploading)
                        print(f"  â†’ Creating chunks and embeddings generator...")
                        chunks_generator = self.processor.generate_document_chunks(parsed_data, filing_metadata)

                        # 5. Upload
                        num_uploaded = self.qdrant_manager.upsert_documents(chunks_generator)
                        if num_uploaded > 0:
                            ticker_chunks_count += num_uploaded
                            total_chunks_indexed += num_uploaded

                        # Release the per-filing objects before the next download.
                        print(f"  â†’ Cleaning up memory...")
                        del html_content
                        del parsed_data
                        del chunks_generator
                        gc.collect()
                        print(f"  âœ“ Memory cleaned.")

                    except Exception as e:
                        print(f"  âœ— Error processing filing for {ticker} ({filing_metadata.get('filing_date', 'unknown')}): {e}")
                        gc.collect()

                if ticker_chunks_count > 0:
                    successful_tickers.append(ticker)
                    print(f"  âœ“ Finished {ticker}. Chunks: {ticker_chunks_count}")
                else:
                    failed_tickers.append(ticker)
                    print(f"  âš  No chunks created for {ticker}")

            except Exception as e:
                print(f"  âœ— Error processing {ticker} (metadata fetch failed): {e}")
                failed_tickers.append(ticker)

            # Short pause between companies.
            if idx < len(tickers):
                time.sleep(0.2)

        print(f"\n{'=' * 70}\nINDEXING COMPLETE\n{'=' * 70}")
        print(f"âœ“ Success: {len(successful_tickers)} companies ({', '.join(successful_tickers)})")
        if failed_tickers:
            print(f"âœ— Failed: {len(failed_tickers)} companies ({', '.join(failed_tickers)})")
        print(f"\n Total chunks indexed: {total_chunks_indexed}\n{'=' * 70}\n")

In [7]:
# ===========================================================================
# 6. RUN THE INGESTION
# ===========================================================================
if __name__ == "__main__":
    # Constructing the pipeline creates the loader, the DocumentProcessor
    # (which loads the embedding model) and the QdrantManager (which opens the
    # Qdrant client), so this line needs the model and Qdrant settings in place.
    pipeline = SECFilingRAGPipeline()
    # Ingest the four most recent 10-Q filings for each ticker in Config.TICKERS
    # (the method's default ticker list).
    pipeline.load_and_index_filings(num_filings_per_ticker=4)
    print("Database build complete.")

SEC 10-Q RAG SYSTEM (Golden Database Builder)

ðŸ“¦ Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
   âœ“ Model loaded (dimension: 384)
   âœ“ Initialized RecursiveCharacterTextSplitter

Connecting to Qdrant Cloud...
   âœ“ Connected to Qdrant

LOADING & INDEXING PHASE

Processing 10 companies: NVDA, AAPL, MSFT, AMZN, META, GOOGL, TSLA, ORCL, JPM, AMD
(Fetching 4 filings per company)


 Setting up collection: sec_filings_10q_GOLDEN_BENCHMARK
   âš  Collection 'sec_filings_10q_GOLDEN_BENCHMARK' exists, recreating...
   âœ“ Collection created
   â†’ Creating payload index for 'ticker' (Keyword)...
   â†’ Creating payload index for 'item' (Keyword)...
   âœ“ Payload indexes created.

[1/10] Processing NVDA
----------------------------------------------------------------------
  â†’ Fetching CIK for ticker: NVDA...
  â†’ Found CIK: 0001045810 (NVIDIA CORP)
  â†’ Found 4 recent 10-Q filing metadata entries.
  â†’ Downloading filing from: 2025-08-27...
  â†’ Parsing HTML con