In [1]:
import os
import re
import uuid
import time
import requests
import gc
import json
from typing import List, Dict, Tuple
from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString

# External libraries
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams, PointStruct
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Imports for NER
import spacy
from transformers import pipeline

# Suppress warnings (especially from transformers)
import warnings
warnings.filterwarnings("ignore")

load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

In [2]:
# ===========================================================================
# PART 1: CONFIGURATION & SETUP
# ===========================================================================

class Config:
    """Configuration class for API keys and model settings"""
    QDRANT_URL = os.getenv("QDRANT_URL")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

    SEC_HEADERS = {'User-Agent': 'EnhancedRAG-Project research@example.com'}
    CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

    # Model Configuration
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384 dimensions
    LLM_MODEL = "gpt-4o"
    NER_MODEL = "dslim/bert-base-NER"
    SPACY_MODEL = "en_core_web_lg"

    COLLECTION_NAME = "sec_filings_10q_GOLDEN_BENCHMARK" # New collection name
    VECTOR_SIZE = 384
    CHUNK_SIZE = 800
    CHUNK_OVERLAP = 200
    TOP_K = 5
    TICKERS = ['NVDA', 'AAPL', 'MSFT', 'AMZN', 'META', 'GOOGL', 'TSLA', 'ORCL', 'JPM', 'AMD']

In [3]:
# ===========================================================================
# PART 2: DOCUMENT LOADING
# ===========================================================================

class SECDocumentLoader:
    """Handles fetching and parsing of SEC 10-Q filings"""
    @staticmethod
    def get_recent_10q_metadata(ticker: str, num_filings: int = 4) -> List[Dict[str, str]]:
        print(f"  → Fetching CIK for ticker: {ticker}...")
        response = requests.get(Config.CIK_MAP_URL, headers=Config.SEC_HEADERS)
        response.raise_for_status()
        company_data = response.json()

        cik = None
        company_name = None
        for company in company_data.values():
            if company['ticker'] == ticker.upper():
                cik = str(company['cik_str']).zfill(10)
                company_name = company['title']
                break
        if not cik:
            raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")
        print(f"  → Found CIK: {cik} ({company_name})")

        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        time.sleep(0.1)
        response = requests.get(submissions_url, headers=Config.SEC_HEADERS)
        response.raise_for_status()
        submissions = response.json()

        filings_metadata = []
        for i, form in enumerate(submissions['filings']['recent']['form']):
            if form == '10-Q':
                accession_number = submissions['filings']['recent']['accessionNumber'][i]
                primary_document = submissions['filings']['recent']['primaryDocument'][i]
                filing_date = submissions['filings']['recent']['filingDate'][i]
                accession_number_clean = accession_number.replace('-', '')
                filing_url = (
                    f"https://www.sec.gov/Archives/edgar/data/{cik}/"
                    f"{accession_number_clean}/{primary_document}"
                )
                metadata = {
                    'ticker': ticker.upper(),
                    'company_name': company_name,
                    'filing_date': filing_date,
                    'cik': cik,
                    'filing_url': filing_url
                }
                filings_metadata.append(metadata)
                if len(filings_metadata) >= num_filings:
                    break
        if not filings_metadata:
            raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")
        print(f"  → Found {len(filings_metadata)} recent 10-Q filing metadata entries.")
        return filings_metadata

    @staticmethod
    def get_filing_html(filing_url: str) -> str:
        time.sleep(0.1)
        response = requests.get(filing_url, headers=Config.SEC_HEADERS)
        response.raise_for_status()
        return response.text

    @staticmethod
    def _normalize_header_text(text: str) -> str:
        text = text.strip().upper()
        part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
        if part_match:
            return re.sub(r'\s+', ' ', part_match.group(1))
        item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
        if item_match:
            return re.sub(r'\s+', ' ', item_match.group(1))
        return None

    @staticmethod
    def _parse_html_table(table_tag) -> str:
        markdown_rows = []
        for tr in table_tag.find_all('tr'):
            cells = [" ".join(cell.get_text(strip=True).split())
                    for cell in tr.find_all(['td', 'th'])]
            if any(cells):
                markdown_rows.append(cells)
        if not markdown_rows: return ""
        md_output = []
        header = markdown_rows[0]
        md_output.append("| " + " | ".join(header) + " |")
        md_output.append("| " + " | ".join(['---'] * len(header)) + " |")
        for row in markdown_rows[1:]:
            while len(row) < len(header): row.append("")
            row = row[:len(header)]
            md_output.append("| " + " | ".join(row) + " |")
        return "\n" + "\n".join(md_output) + "\n"

    @classmethod
    def parse_10q(cls, html_content: str) -> Dict:
        # Use 'lxml' for speed and robustness
        soup = BeautifulSoup(html_content, 'lxml')
        potential_headers = soup.find_all(['p', 'b', 'strong', 'div'])
        doc_headers = []
        for header in potential_headers:
            text = header.get_text(strip=True)
            if len(text) > 100: continue
            normalized_key = cls._normalize_header_text(text)
            if normalized_key and not header.find_parent('a'):
                doc_headers.append({'tag': header, 'key': normalized_key})
        if not doc_headers: return {}
        parsed_data = defaultdict(lambda: defaultdict(str))
        current_part_key = None
        for i, header_info in enumerate(doc_headers):
            current_key = header_info['key']
            if 'PART' in current_key:
                current_part_key = current_key
                continue
            if 'ITEM' in current_key:
                if not current_part_key: current_part_key = "PART I"
                start_node = header_info['tag']
                end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None
                content_parts = []
                element = start_node.next_element
                while element and element != end_node:
                    if isinstance(element, NavigableString):
                        if not element.find_parent('table'):
                            text = element.strip()
                            if text: content_parts.append(text)
                    elif element.name == 'table':
                        if not element.find_parent('table'):
                            table_markdown = cls._parse_html_table(element)
                            if table_markdown: content_parts.append(table_markdown)
                    element = element.next_element
                full_content = "\n".join(content_parts)
                clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()
                parsed_data[current_part_key][current_key] = clean_content
        return {part: dict(items) for part, items in parsed_data.items()}

In [4]:
# ===========================================================================
# PART 3: TEXT CHUNKING & EMBEDDING
# ===========================================================================

class DocumentProcessor:
    def __init__(self, embedding_model_name: str = Config.EMBEDDING_MODEL):
        print(f"\n Loading embedding model: {embedding_model_name}")
        self.model = SentenceTransformer(embedding_model_name)
        print(f"   ✓ Model loaded (dimension: {self.model.get_sentence_embedding_dimension()})")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            add_start_index=False,
        )
        print(f"   ✓ Initialized RecursiveCharacterTextSplitter")

    def generate_document_chunks(self, parsed_data: Dict, metadata: Dict,
                                 embed_batch_size: int = 1024):
        all_docs = []
        for part, items in parsed_data.items():
            for item, content in items.items():
                if not content: continue
                doc_metadata = {
                    'ticker': metadata['ticker'],
                    'company_name': metadata['company_name'],
                    'filing_date': metadata['filing_date'],
                    'filing_url': metadata['filing_url'],
                    'part': part,
                    'item': item
                }
                doc = Document(page_content=content, metadata=doc_metadata)
                all_docs.append(doc)
        if not all_docs: return
        print(f"     → Splitting {len(all_docs)} high-level 'Items' into smaller chunks...")
        chunked_docs = self.text_splitter.split_documents(all_docs)
        print(f"     → Generated {len(chunked_docs)} chunks")
        text_batch, metadata_batch = [], []
        for chunk in chunked_docs:
            text_batch.append(chunk.page_content)
            metadata_batch.append(chunk.metadata)
            if len(text_batch) >= embed_batch_size:
                embeddings = self.model.encode(text_batch, show_progress_bar=False)
                for txt, emb, meta in zip(text_batch, embeddings, metadata_batch):
                    payload = {'text': txt, **meta}
                    yield PointStruct(id=str(uuid.uuid4()), vector=emb.tolist(), payload=payload)
                text_batch, metadata_batch = [], []
        if text_batch:
            embeddings = self.model.encode(text_batch, show_progress_bar=False)
            for txt, emb, meta in zip(text_batch, embeddings, metadata_batch):
                payload = {'text': txt, **meta}
                yield PointStruct(id=str(uuid.uuid4()), vector=emb.tolist(), payload=payload)

In [5]:
# ===========================================================================
# PART 4: QDRANT VECTOR DATABASE
# ===========================================================================

class QdrantManager:
    def __init__(self):
        print(f"\nConnecting to Qdrant Cloud...")
        self.client = QdrantClient(url=Config.QDRANT_URL, api_key=Config.QDRANT_API_KEY)
        print(f"   ✓ Connected to Qdrant")

    def create_collection(self, collection_name: str = Config.COLLECTION_NAME,
                         vector_size: int = Config.VECTOR_SIZE):
        print(f"\n Setting up collection: {collection_name}")
        collections = self.client.get_collections().collections
        exists = any(col.name == collection_name for col in collections)
        if exists:
            print(f"   ⚠ Collection exists, recreating...")
            self.client.delete_collection(collection_name)
        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
        )
        print(f"   ✓ Collection created")
        print(f"   → Creating payload index for 'ticker'...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="ticker",
            field_schema=models.PayloadSchemaType.KEYWORD
        )
        print(f"   → Creating payload index for 'item'...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="item",
            field_schema=models.PayloadSchemaType.KEYWORD
        )
        print(f"   ✓ Payload indexes created.")

    def upsert_documents(self, points_generator,
                        collection_name: str = Config.COLLECTION_NAME,
                        batch_size: int = 2048) -> int:
        print(f" Uploading chunks to Qdrant in batches of {batch_size}...")
        batch, count = [], 0
        for point in points_generator:
            batch.append(point)
            if len(batch) >= batch_size:
                self.client.upsert(collection_name=collection_name, points=batch, wait=False)
                count += len(batch)
                print(f"     → Uploaded {count} chunks so far...")
                batch = []
        if batch:
            self.client.upsert(collection_name=collection_name, points=batch, wait=False)
            count += len(batch)
        print(f"  ✓ All chunks uploaded for this document. Total: {count}")
        return count

    def search(self, query_vector: List[float],
              collection_name: str = Config.COLLECTION_NAME,
              limit: int = Config.TOP_K,
              ticker_list: List[str] = None) -> List[Dict]:
        """
        Search for similar documents.
        Now accepts a list of tickers to create an 'OR' filter.
        """
        qdrant_filter = None
        if ticker_list:
            if len(ticker_list) == 1:
                # 'must' is efficient for a single item
                qdrant_filter = models.Filter(must=[
                    models.FieldCondition(key="ticker", match=models.MatchValue(value=ticker_list[0]))
                ])
            else:
                # 'should' acts as an 'OR' filter for multiple tickers
                qdrant_filter = models.Filter(should=[
                    models.FieldCondition(key="ticker", match=models.MatchValue(value=ticker))
                    for ticker in ticker_list
                ])
        
        results = self.client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=qdrant_filter,
            with_payload=True
        )
        return [{'score': result.score, 'payload': result.payload} for result in results]

In [6]:
# ===========================================================================
# PART 5: NER PIPELINE COMPONENTS
# ===========================================================================

class FinancialKnowledgeBase:
    """Comprehensive knowledge base for financial entity normalization"""
    def __init__(self):
        print("   → Initializing Financial Knowledge Base...")
        self.company_to_ticker = self._build_company_ticker_map()
        self.financial_metrics = {
            "revenue": "REVENUE", "revenues": "REVENUE", "total revenue": "REVENUE",
            "net income": "NET_INCOME", "net earnings": "NET_INCOME", "profit": "NET_INCOME",
            "operating income": "OPERATING_INCOME", "ebit": "EBIT", "ebitda": "EBITDA",
            "cash flow": "CASH_FLOW", "operating cash flow": "OPERATING_CASH_FLOW",
            "free cash flow": "FREE_CASH_FLOW", "fcf": "FREE_CASH_FLOW",
            "total assets": "TOTAL_ASSETS", "total liabilities": "TOTAL_LIABILITIES",
            "equity": "SHAREHOLDERS_EQUITY", "shareholders' equity": "SHAREHOLDERS_EQUITY",
            "earnings per share": "EPS", "eps": "EPS",
        }
        self.risk_types = {
            "market risk": "MARKET_RISK", "credit risk": "CREDIT_RISK",
            "operational risk": "OPERATIONAL_RISK", "liquidity risk": "LIQUIDITY_RISK",
            "interest rate risk": "INTEREST_RATE_RISK", "currency risk": "CURRENCY_RISK",
            "foreign exchange risk": "CURRENCY_RISK", "regulatory risk": "REGULATORY_RISK",
            "compliance risk": "COMPLIANCE_RISK", "legal risk": "LEGAL_RISK",
            "reputational risk": "REPUTATIONAL_RISK", "strategic risk": "STRATEGIC_RISK",
            "cybersecurity risk": "CYBERSECURITY_RISK", "technology risk": "TECHNOLOGY_RISK",
            "political risk": "POLITICAL_RISK", "climate risk": "CLIMATE_RISK",
        }

    def _build_company_ticker_map(self):
        try:
            response = requests.get(Config.CIK_MAP_URL, headers=Config.SEC_HEADERS)
            response.raise_for_status()
            company_data = response.json()
            mapping = {}
            for company in company_data.values():
                ticker = company['ticker']
                title = company['title']
                mapping[title] = ticker
                mapping[title.upper()] = ticker
                base_name = re.sub(r'\s+(Inc\.?|Corp\.?|Corporation|Company|Co\.?|Ltd\.?|LLC|L\.P\.)\s*,?', '', title, flags=re.IGNORECASE).strip()
                mapping[base_name] = ticker
                mapping[base_name.upper()] = ticker
                mapping[ticker] = ticker
                mapping[ticker.upper()] = ticker
            print(f"     ✓ Built company map with {len(mapping)} entries")
            return mapping
        except Exception as e:
            print(f"     ⚠ Warning: Could not build company map from SEC. Using fallback. Error: {e}")
            return {
                "Alphabet": "GOOGL", "Alphabet Inc.": "GOOGL", "Google": "GOOGL",
                "Microsoft": "MSFT", "Microsoft Corporation": "MSFT",
                "Tesla": "TSLA", "Tesla, Inc.": "TSLA",
                "Apple": "AAPL", "Apple Inc.": "AAPL",
                "Amazon": "AMZN", "Amazon.com": "AMZN", "AMZN": "AMZN",
                "Meta": "META", "Meta Platforms": "META", "Facebook": "META",
                "NVIDIA": "NVDA", "Nvidia": "NVDA", "NVDA": "NVDA",
                "AMD": "AMD", "Advanced Micro Devices": "AMD",
                "Oracle": "ORCL", "JPMorgan": "JPM",
            }

    def normalize_company(self, text):
        text_upper = text.upper()
        if text_upper in self.company_to_ticker:
            return self.company_to_ticker[text_upper]
        clean_text = re.sub(r'\s+(Inc\.?|Corp\.?|Corporation|Company|Co\.?|Ltd\.?|LLC)\s*,?', '', text, flags=re.IGNORECASE).strip().upper()
        if clean_text in self.company_to_ticker:
            return self.company_to_ticker[clean_text]
        return None

    def normalize_metric(self, text):
        return self.financial_metrics.get(text.lower().strip())

    def normalize_risk(self, text):
        return self.risk_types.get(text.lower().strip())

class FinancialEntityExtractor:
    """Enhanced entity extractor with financial domain knowledge."""
    def __init__(self, knowledge_base: FinancialKnowledgeBase):
        self.kb = knowledge_base
        print("   → Loading NER models...")
        try:
            self.spacy_nlp = spacy.load(Config.SPACY_MODEL)
            print(f"     ✓ Loaded spaCy model: {Config.SPACY_MODEL}")
        except Exception as e:
            print(f"     ✗ ERROR: Failed to load spaCy model '{Config.SPACY_MODEL}'.")
            print("     Please run: python -m spacy download en_core_web_lg")
            raise e

        try:
            self.transformer_ner = pipeline(
                "token-classification",
                model=Config.NER_MODEL,
                aggregation_strategy="simple",
                device=-1 # Use CPU (-1) for broader compatibility
            )
            print(f"     ✓ Loaded Transformer model: {Config.NER_MODEL}")
        except Exception as e:
            print(f"     ⚠ Warning: Could not load transformer model: {e}")
            self.transformer_ner = None

    def extract_entities_spacy(self, text):
        doc = self.spacy_nlp(text)
        return [{'text': ent.text, 'label': ent.label_, 'start': ent.start_char, 'end': ent.end_char, 'source': 'spacy'} for ent in doc.ents]

    def extract_entities_transformer(self, text):
        if not self.transformer_ner: return []
        try:
            # Chunking for long texts (max 512 tokens)
            max_length = 500 # Keep a buffer
            chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
            all_entities = []
            offset = 0
            for chunk in chunks:
                entities = self.transformer_ner(chunk)
                for ent in entities:
                    all_entities.append({'text': ent['word'], 'label': ent['entity_group'], 'start': offset + ent['start'], 'end': offset + ent['end'], 'score': ent['score'], 'source': 'transformer'})
                offset += len(chunk)
            return all_entities
        except Exception as e:
            print(f"     ⚠ Transformer NER error: {e}")
            return []

    def extract_financial_patterns(self, text):
        entities = []
        # Risk patterns (dynamic from KB)
        risk_keys = '|'.join(re.escape(k) for k in self.kb.risk_types.keys())
        risk_pattern = rf'({risk_keys})'
        for match in re.finditer(risk_pattern, text, re.IGNORECASE):
            entities.append({'text': match.group(1), 'label': 'RISK_MENTION', 'start': match.start(), 'end': match.end(), 'source': 'pattern'})
        # Monetary patterns
        monetary_pattern = r'(\$\s*\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d{1,3}(?:,\d{3})+(?!\d))'
        for match in re.finditer(monetary_pattern, text):
            if ',' in match.group(1) or '$' in match.group(1):
                entities.append({'text': match.group(1), 'label': 'MONETARY_VALUE', 'start': match.start(), 'end': match.end(), 'source': 'pattern'})
        return entities

    def normalize_and_link(self, entities):
        normalized = []
        seen_spans = set()
        for ent in sorted(entities, key=lambda x: x.get('score', 1.0), reverse=True):
            span = (ent['start'], ent['end'])
            if span in seen_spans: continue
            seen_spans.add(span)
            
            ent_text, ent_label = ent['text'].strip(), ent['label']
            result = {'original_text': ent_text, 'normalized_text': None, 'entity_type': ent_label, 'linked_id': None, 'confidence': ent.get('score', 1.0), 'source': ent.get('source', 'unknown'), 'span': span}

            if ent_label in ['ORG', 'ORGANIZATION']:
                ticker = self.kb.normalize_company(ent_text)
                if ticker:
                    result['normalized_text'] = ticker
                    result['entity_type'] = 'COMPANY'
                    result['linked_id'] = f"TICKER:{ticker}"
            elif ent_label == 'CARDINAL' and ',' in ent_text:
                result['entity_type'] = 'MONETARY_VALUE'
                result['normalized_text'] = ent_text.replace(',', '')
            elif ent_label in ['MONEY', 'MONETARY_VALUE']:
                result['entity_type'] = 'MONETARY_VALUE'
                numbers = re.findall(r'\d+(?:,\d{3})*(?:\.\d+)?', ent_text.replace('$', ''))
                if numbers: result['normalized_text'] = numbers[0].replace(',', '')
            elif ent_label == 'RISK_MENTION':
                risk_type = self.kb.normalize_risk(ent_text)
                if risk_type:
                    result['normalized_text'] = risk_type
                    result['entity_type'] = 'RISK_TYPE'
                    result['linked_id'] = f"RISK:{risk_type}"
            
            metric = self.kb.normalize_metric(ent_text)
            if metric:
                result['entity_type'] = 'FINANCIAL_METRIC'
                result['normalized_text'] = metric
                result['linked_id'] = f"METRIC:{metric}"
            
            normalized.append(result)
        return normalized

    def extract_and_normalize(self, text):
        spacy_entities = self.extract_entities_spacy(text)
        transformer_entities = self.extract_entities_transformer(text)
        pattern_entities = self.extract_financial_patterns(text)
        all_entities = spacy_entities + transformer_entities + pattern_entities
        normalized = self.normalize_and_link(all_entities)
        return normalized

In [7]:
# ===========================================================================
# PART 6: ENHANCED RAG QUERY ENGINE
# ===========================================================================

class EnhancedRAGEngine:
    """
    This is the new ENHANCED RAG query engine.
    It uses the NER components to automatically detect tickers
    in the question and filter the search.
    """
    def __init__(self, document_processor: DocumentProcessor, qdrant_manager: QdrantManager):
        print("\n Initializing Enhanced RAG Query Engine...")

        # 1. Get components from main pipeline
        self.embedding_model = document_processor.model
        print("   ✓ Using existing embedding model")
        self.qdrant_manager = qdrant_manager
        print("   ✓ Using existing QdrantManager for search")

        # 2. Build NER components
        self.kb = FinancialKnowledgeBase()
        self.extractor = FinancialEntityExtractor(self.kb)
        
        # 3. Initialize LLM and Prompt
        self.llm = ChatOpenAI(model=Config.LLM_MODEL, api_key=Config.OPENAI_API_KEY, temperature=0)
        template = """You are a helpful financial analyst assistant. Your role is to answer questions about SEC 10-Q filings based ONLY on the provided context.
- Base your answer strictly on the provided context from SEC filings
- Cite specific sections (e.g., "According to Item 1A...") when referencing information
- If the answer is not in the context, clearly state that

Context:
<context>
{context}
</context>

Question: {input}

Answer:"""
        self.prompt = ChatPromptTemplate.from_template(template)
        print("   ✓ Enhanced RAG Engine ready.")

    def _format_context(self, search_results: List[Dict]) -> str:
        context_str = ""
        for i, result in enumerate(search_results, 1):
            payload = result.get('payload', {})
            text = payload.get('text', 'No text found')
            item = payload.get('item', 'N/A')
            ticker = payload.get('ticker', 'N/A')
            context_str += f"Source {i} ({ticker} - {item}):\n\"{text}\"\n\n"
        return context_str.strip()

    def query(self, question: str, ticker_filter: str = None):
        print(f"\n Processing query with Enhanced Engine: '{question}'")

        # 1. Manually embed the query (same as before)
        print("   → Manually embedding query...")
        query_vector = self.embedding_model.encode(question).tolist()

        # 2. Use NER to find tickers, unless user provides a manual override.
        tickers_to_filter = []
        if ticker_filter:
            print(f"   → Using manual ticker override: {ticker_filter}")
            tickers_to_filter = [ticker_filter.upper()]
        else:
            print("   → Running NER on query to find companies...")
            normalized_entities = self.extractor.extract_and_normalize(question)
            tickers_from_ner = sorted(list(set(
                ent['normalized_text'] 
                for ent in normalized_entities 
                if ent['entity_type'] == 'COMPANY' and ent['normalized_text']
            )))
            if tickers_from_ner:
                print(f"   → NER detected and will filter for: {tickers_from_ner}")
                tickers_to_filter = tickers_from_ner
            else:
                print("   → NER found no companies, searching all documents.")
        
        # 3. Manually search Qdrant (using the new multi-ticker list)
        print("   → Manually searching Qdrant...")
        search_results = self.qdrant_manager.search(
            query_vector=query_vector,
            limit=Config.TOP_K,
            ticker_list=tickers_to_filter
        )

        if not search_results:
            return {'answer': 'No relevant context was found in the documents to answer this question.', 'sources': []}

        # 4. Manually format the prompt
        print("   → Formatting context and building prompt...")
        formatted_context = self._format_context(search_results)
        final_prompt_message = self.prompt.format_messages(context=formatted_context, input=question)

        # 5. Manually invoke the LLM
        print("   → Sending prompt to LLM...")
        llm_response = self.llm.invoke(final_prompt_message)
        answer = llm_response.content

        # 6. Format sources
        sources = [{'ticker': r['payload'].get('ticker'), 'company': r['payload'].get('company_name'), 'item': r['payload'].get('item'), 'part': r['payload'].get('part'), 'filing_date': r['payload'].get('filing_date'), 'score': r['score']} for r in search_results]

        return {'answer': answer, 'sources': sources}

In [8]:
# ===========================================================================
# PART 7: MAIN PIPELINE ORCHESTRATOR
# ===========================================================================

class SECFilingRAGPipeline:
    """Main pipeline orchestrator"""
    def __init__(self):
        print("=" * 70)
        print("ENHANCED SEC 10-Q FILING RAG SYSTEM (RAG + NER)")
        print("=" * 70)
        self.loader = SECDocumentLoader()
        self.processor = DocumentProcessor()
        self.qdrant_manager = QdrantManager()

        # Initialize the new query engine immediately
        self.query_engine = EnhancedRAGEngine(
            document_processor=self.processor,
            qdrant_manager=self.qdrant_manager
        )

    def load_and_index_filings(self, tickers: List[str] = Config.TICKERS, num_filings_per_ticker: int = 1):
        """
        MODIFIED: This function now only checks if the Golden Set exists.
        It does NOT perform indexing.
        """
        print(f"\\n{'=' * 70}")
        print(f"CHECKING DATABASE: {Config.COLLECTION_NAME}")
        print(f"{'=' * 70}")
        
        try:
            # We use num_filings_per_ticker to check if we need to index,
            # but for this test, we just check existence.
            count = self.qdrant_manager.client.count(Config.COLLECTION_NAME, exact=True)
            if count.count > 0:
                print(f"✓ Collection '{Config.COLLECTION_NAME}' found with {count.count} docs. Proceeding to query.")
            else:
                print(f"✗ ERROR: Collection '{Config.COLLECTION_NAME}' is empty or not found.")
                print("Please run '0_build_database.ipynb' first.")
                raise Exception("Database not found.")
        except Exception as e:
            print(f"✗ ERROR: Could not connect to or find collection '{Config.COLLECTION_NAME}'.")
            print("Please run '0_build_database.ipynb' first.")
            raise e
        
        print(f"{'=' * 70}\\n")

    def query(self, question: str, ticker_filter: str = None):
        """Query the indexed filings using the ENHANCED engine"""
        # The query engine is already initialized, so just use it
        result = self.query_engine.query(question, ticker_filter)

        # Print results
        print(f"\n{'=' * 70}\nANSWER\n{'=' * 70}")
        print(f"\n{result['answer']}\n")
        print(f"{'=' * 70}\nSOURCES ({len(result['sources'])} chunks)\n{'=' * 70}")
        for i, source in enumerate(result['sources'], 1):
            print(f"\n{i}. {source['company']} ({source['ticker']}) - {source['item']}")
            print(f"   Filing Date: {source['filing_date']}")
            print(f"   Relevance Score: {source['score']:.4f}")
        print(f"\n{'=' * 70}\n")
        return result

In [9]:
# ===========================================================================
# PART 8: USAGE EXAMPLE (Saving all results to one file)
# ===========================================================================

if __name__ == "__main__":

    # =======================================================================
    # 1. INITIALIZE PIPELINE
    # =======================================================================
    pipeline = SECFilingRAGPipeline()

    # =======================================================================
    # 2. LOAD AND INDEX FILINGS
    # =======================================================================
    # WARNING: This step can take a long time (10-20 minutes).
    # If you have already run this successfully, you can comment out
    # the line below to skip straight to querying.

    print("Starting the indexing process... This may take a long time.")
    pipeline.load_and_index_filings(num_filings_per_ticker=4)
    print("Indexing complete. Proceeding to queries.")

ENHANCED SEC 10-Q FILING RAG SYSTEM (RAG + NER)

 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
   ✓ Model loaded (dimension: 384)
   ✓ Initialized RecursiveCharacterTextSplitter

Connecting to Qdrant Cloud...
   ✓ Connected to Qdrant

 Initializing Enhanced RAG Query Engine...
   ✓ Using existing embedding model
   ✓ Using existing QdrantManager for search
   → Initializing Financial Knowledge Base...
     ✓ Built company map with 34397 entries
   → Loading NER models...
     ✓ Loaded spaCy model: en_core_web_lg


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


     ✓ Loaded Transformer model: dslim/bert-base-NER
   ✓ Enhanced RAG Engine ready.
Starting the indexing process... This may take a long time.
CHECKING DATABASE: sec_filings_10q_GOLDEN_BENCHMARK
✓ Collection 'sec_filings_10q_GOLDEN_BENCHMARK' found with 12807 docs. Proceeding to query.
Indexing complete. Proceeding to queries.


In [10]:
# =======================================================================
# DEFINE QUESTIONS AND SAVE ALL ANSWERS TO ONE FILE
# =======================================================================
import os
import json

# Define the output folder
DATA_FOLDER = "data"
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)
    print(f"Created data folder: {DATA_FOLDER}")

# --- Standardized Query Set ---
queries_to_run = [
    {"question": "What are the main risk factors mentioned by each companies?", "ticker_filter": None},
    {"question": "What risks did Apple disclose in their latest 10-Q?", "ticker_filter": None},
    {"question": "Compare the revenue trends of NVIDIA and AMD", "ticker_filter": None},
    {"question": "What was Tesla's R&D spending in the latest quarter?", "ticker_filter": None},
    {"question": "How has Microsoft's operating income changed over the last year?", "ticker_filter": None},
    {"question": "What was the gross profit margin for all companies?", "ticker_filter": None},
]

print(f"\\nRunning {len(queries_to_run)} queries with (Model 3) RAG + NER...")

all_results = []

for query in queries_to_run:
    print(f"\\n--- Running Query: {query['question']} ---")
    
    result = pipeline.query(
        question=query["question"],
        ticker_filter=query["ticker_filter"]
    )
    
    query_output = {
        "question": query["question"],
        "ticker_filter_manual": query["ticker_filter"],
        "response": result
    }
    all_results.append(query_output)

# --- Standardized Save Path ---
save_path = os.path.join(DATA_FOLDER, "results_model_3_ner.json")
print(f"\\n--- All queries complete. Saving all results to: {save_path} ---")

try:
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=4)
    print(f"✓ Successfully saved all answers.")
except Exception as e:
    print(f"✗ Failed to save results: {e}")

print("\\nPipeline run finished.")

\nRunning 6 queries with (Model 3) RAG + NER...
\n--- Running Query: What are the main risk factors mentioned by each companies? ---

 Processing query with Enhanced Engine: 'What are the main risk factors mentioned by each companies?'
   → Manually embedding query...
   → Running NER on query to find companies...
   → NER found no companies, searching all documents.
   → Manually searching Qdrant...
   → Formatting context and building prompt...
   → Sending prompt to LLM...

ANSWER

Based on the provided context, the main risk factors mentioned by each company are as follows:

**AMD:**
According to Item 1A, the main risk factors for AMD include:
- The highly competitive and rapidly evolving markets in which their products are sold.
- The cyclical nature of the semiconductor industry, which has experienced severe downturns.

**Apple (AAPL):**
According to Item 1A, the risk factors for Apple include:
- Various factors that can affect the company's business, reputation, results of opera