In [1]:
import os
import re
import uuid
import time
import requests
import gc
import json
import unicodedata
import pathlib
from typing import List, Dict, Tuple, Any, Optional
from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString
from dataclasses import dataclass, field

# External libraries
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, CrossEncoder # Import CrossEncoder
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams, PointStruct
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Imports for QueryProcessor
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# ===========================================================================
# PART 1: CONFIGURATION
# ===========================================================================
class Config:
    """Central settings for the pipeline: credentials, endpoints, models, and index parameters.

    Credentials are read from the environment when the class body executes;
    a variable that is not set yields None.
    """

    # --- Credentials / endpoints (from environment) ---
    QDRANT_URL = os.environ.get("QDRANT_URL")
    QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

    # --- SEC access ---
    SEC_HEADERS = {"User-Agent": "FullAdvancedRAG research@example.com"}
    CIK_MAP_URL = "https://www.sec.gov/files/company_tickers.json"

    # --- Models ---
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    LLM_MODEL = "gpt-4o"
    FLAN_T5_MODEL_NAME = "google/flan-t5-small"
    SPACY_MODEL = "en_core_web_lg"  # the large spaCy English model

    # --- Post-retrieval ---
    CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    RETRIEVAL_TOP_K = 20  # candidates fetched from the vector store
    FINAL_TOP_K = 5       # candidates kept after re-ranking

    # --- Collection / chunking ---
    # !! CRITICAL: New collection name. This pipeline MUST re-index.
    COLLECTION_NAME = "sec_filings_10q_v_full"
    VECTOR_SIZE = 384
    CHUNK_SIZE = 800
    CHUNK_OVERLAP = 200
    TICKERS = [
        "NVDA", "AAPL", "MSFT", "AMZN", "META",
        "GOOGL", "TSLA", "ORCL", "JPM", "AMD",
    ]

In [3]:
# ===========================================================================
# PART 2: DOCUMENT LOADING
# ===========================================================================
# (Using the table-aware parser from RAG_post_retrieval.ipynb)

class SECDocumentLoader:
    """Download recent 10-Q filings from SEC EDGAR and split them into Part/Item sections.

    Every method is a static or class method; the class keeps no per-instance state.
    HTML tables are rendered as Markdown text inside the section they belong to.
    """

    # Seconds allowed for a single HTTP request; without a timeout a stalled
    # connection would block the whole indexing run indefinitely.
    REQUEST_TIMEOUT = 30
    # Pause (seconds) inserted before follow-up requests to space them out.
    REQUEST_DELAY = 0.1

    @staticmethod
    def get_recent_10q_metadata(ticker: str, num_filings: int = 4) -> List[Dict[str, str]]:
        """Look up a ticker's CIK and return metadata for its most recent 10-Q filings.

        Args:
            ticker: Stock symbol; compared upper-cased against the SEC ticker map.
            num_filings: Stop after this many 10-Q entries have been collected.

        Returns:
            A list of dicts with keys 'ticker', 'company_name', 'filing_date',
            'cik' and 'filing_url', in the order the filings appear in the
            submissions feed.

        Raises:
            ValueError: The ticker is not in the SEC map, or it has no 10-Q in
                the 'recent' submissions.
            requests.HTTPError: An SEC endpoint returned an error status.
            requests.Timeout: An SEC endpoint did not answer within REQUEST_TIMEOUT.
        """
        symbol = ticker.upper()
        print(f"  → Fetching CIK for ticker: {ticker}...")
        response = requests.get(
            Config.CIK_MAP_URL,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        cik = None
        company_name = None
        for company in response.json().values():
            if company['ticker'] == symbol:
                # EDGAR URLs use the CIK zero-padded to 10 digits.
                cik = str(company['cik_str']).zfill(10)
                company_name = company['title']
                break
        if not cik:
            raise ValueError(f"Ticker '{ticker}' not found")
        print(f"  → Found CIK: {cik} ({company_name})")

        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        time.sleep(SECDocumentLoader.REQUEST_DELAY)
        response = requests.get(
            submissions_url,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        # 'recent' holds parallel arrays: index i describes the same filing in each.
        recent = response.json()['filings']['recent']

        filings_metadata = []
        for i, form in enumerate(recent['form']):
            if form != '10-Q':
                continue
            accession_number_clean = recent['accessionNumber'][i].replace('-', '')
            primary_document = recent['primaryDocument'][i]
            filing_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number_clean}/{primary_document}"
            filings_metadata.append({
                'ticker': symbol,
                'company_name': company_name,
                'filing_date': recent['filingDate'][i],
                'cik': cik,
                'filing_url': filing_url,
            })
            if len(filings_metadata) >= num_filings:
                break

        if not filings_metadata:
            raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'")
        print(f"  → Found {len(filings_metadata)} recent 10-Q filing metadata entries.")
        return filings_metadata

    @staticmethod
    def get_filing_html(filing_url: str) -> str:
        """Download one filing document and return its body as text.

        Raises:
            requests.HTTPError: The server returned an error status.
            requests.Timeout: No answer within REQUEST_TIMEOUT seconds.
        """
        time.sleep(SECDocumentLoader.REQUEST_DELAY)
        response = requests.get(
            filing_url,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.text

    @staticmethod
    def _normalize_header_text(text: str) -> Optional[str]:
        """Reduce a candidate header to a canonical key, or None if it is not a header.

        Text starting with "PART I"/"PART II" yields "PART I"/"PART II"; text
        starting with "ITEM <digit>[letter]" yields e.g. "ITEM 1A". Matching is
        case-insensitive and internal whitespace is collapsed to one space.
        """
        text = text.strip().upper()
        part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
        if part_match:
            return re.sub(r'\s+', ' ', part_match.group(1))
        item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
        if item_match:
            return re.sub(r'\s+', ' ', item_match.group(1))
        return None

    @staticmethod
    def _parse_html_table(table_tag) -> str:
        """Render a <table> element as a Markdown table string.

        The first non-empty row is used as the header. Later rows are padded
        with empty cells or truncated to the header width. Rows whose cells are
        all empty are dropped. Returns "" when the table has no usable rows;
        otherwise the Markdown is wrapped in a leading and trailing newline.
        """
        rows = []
        for tr in table_tag.find_all('tr'):
            # Collapse all runs of whitespace inside a cell to single spaces.
            cells = [" ".join(cell.get_text(strip=True).split()) for cell in tr.find_all(['td', 'th'])]
            if any(cells):
                rows.append(cells)
        if not rows:
            return ""

        header = rows[0]
        width = len(header)
        md = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(['---'] * width) + " |",
        ]
        for row in rows[1:]:
            padded = row + [""] * (width - len(row))
            md.append("| " + " | ".join(padded[:width]) + " |")
        return "\n" + "\n".join(md) + "\n"

    @classmethod
    def parse_10q(cls, html_content: str) -> Dict:
        """Split filing HTML into ``{part: {item: text}}`` using PART/ITEM headers.

        Header candidates are <p>, <b>, <strong> and <div> elements with at most
        100 characters of text that normalize to a PART/ITEM key and are not
        inside a link. Each item's text is everything between its header element
        and the next detected header; top-level tables are converted to Markdown
        and text inside tables is taken only through that conversion. When the
        same part/item key occurs more than once, the last occurrence wins.

        Returns:
            A plain nested dict, or {} when no header is found.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        headers = []
        for h in soup.find_all(['p', 'b', 'strong', 'div']):
            text = h.get_text(strip=True)
            if len(text) > 100:
                continue
            key = cls._normalize_header_text(text)
            if key and not h.find_parent('a'):
                headers.append({'tag': h, 'key': key})
        if not headers:
            return {}

        data = defaultdict(lambda: defaultdict(str))
        part_key = None
        for i, h_info in enumerate(headers):
            key = h_info['key']
            if 'PART' in key:
                part_key = key
                continue
            if 'ITEM' in key:
                if not part_key:
                    # Items seen before any PART header are filed under PART I.
                    part_key = "PART I"
                start_node = h_info['tag']
                end_node = headers[i + 1]['tag'] if i + 1 < len(headers) else None
                parts = []
                for elem in start_node.next_elements:
                    # Identity check: BeautifulSoup's == compares markup, so an
                    # unrelated element with identical markup would end the
                    # section early.
                    if elem is end_node:
                        break
                    if isinstance(elem, NavigableString) and not elem.find_parent('table'):
                        if txt := elem.strip():
                            parts.append(txt)
                    elif getattr(elem, 'name', None) == 'table' and not elem.find_parent('table'):
                        if md := cls._parse_html_table(elem):
                            parts.append(md)
                data[part_key][key] = re.sub(r'\n{3,}', '\n\n', "\n".join(parts)).strip()
        return {p: dict(i) for p, i in data.items()}

In [4]:
# ===========================================================================
# PART 3: TEXT CHUNKING & EMBEDDING
# ===========================================================================
# (Using the table-aware chunker from RAG_post_retrieval.ipynb)
class DocumentProcessor:
    """Split parsed 10-Q sections into chunks and embed them as Qdrant points."""

    def __init__(self, embedding_model_name: str = Config.EMBEDDING_MODEL):
        """Load the sentence-embedding model and configure the text splitter.

        Args:
            embedding_model_name: Model identifier passed to SentenceTransformer.
        """
        print(f"\n Loading embedding model: {embedding_model_name}")
        self.model = SentenceTransformer(embedding_model_name)
        print(f"   ✓ Model loaded (dimension: {self.model.get_sentence_embedding_dimension()})")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            add_start_index=False,
        )
        print(f"   ✓ Initialized RecursiveCharacterTextSplitter")

    @staticmethod
    def _looks_like_table(text: str) -> bool:
        """Heuristic: True when at least half of the non-empty lines are Markdown table rows.

        A table row is a line that, once stripped, starts and ends with '|'.
        Empty or whitespace-only text is never a table.
        """
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        if not lines:
            return False
        row_count = sum(1 for ln in lines if len(ln) > 1 and ln.startswith("|") and ln.endswith("|"))
        return row_count * 2 >= len(lines)

    def _embed_batch(self, texts: List[str], metadatas: List[Dict]):
        """Embed one batch of chunk texts and yield a PointStruct per chunk."""
        embeddings = self.model.encode(texts, show_progress_bar=False)
        for txt, emb, meta in zip(texts, embeddings, metadatas):
            yield PointStruct(
                id=str(uuid.uuid4()),
                vector=emb.tolist(),
                # is_table is decided per chunk: a flag computed on the whole
                # Item would mark every chunk of that Item the same way.
                payload={'text': txt, **meta, 'is_table': self._looks_like_table(txt)},
            )

    def generate_document_chunks(self, parsed_data: Dict, metadata: Dict, embed_batch_size: int = 1024):
        """Yield embedded, payload-tagged points for every chunk of a parsed filing.

        Args:
            parsed_data: ``{part: {item: text}}`` as produced by the 10-Q parser.
            metadata: Filing metadata; must contain 'ticker', 'company_name',
                'filing_date' and 'filing_url'.
            embed_batch_size: Number of chunks encoded per model call.

        Yields:
            PointStruct objects whose payload holds the chunk text, the filing
            metadata, the part/item keys, and a per-chunk 'is_table' boolean.
            Nothing is yielded when every item is empty.
        """
        all_docs = []
        for part, items in parsed_data.items():
            for item, content in items.items():
                if not content:
                    continue
                doc_metadata = {
                    'ticker': metadata['ticker'],
                    'company_name': metadata['company_name'],
                    'filing_date': metadata['filing_date'],
                    'filing_url': metadata['filing_url'],
                    'part': part,
                    'item': item,
                }
                all_docs.append(Document(page_content=content, metadata=doc_metadata))
        if not all_docs:
            return

        print(f"     → Splitting {len(all_docs)} high-level 'Items' into smaller chunks...")
        chunked_docs = self.text_splitter.split_documents(all_docs)
        print(f"     → Generated {len(chunked_docs)} chunks")

        # Guard against a non-positive batch size, which would otherwise break range().
        step = max(1, embed_batch_size)
        for start in range(0, len(chunked_docs), step):
            batch = chunked_docs[start:start + step]
            yield from self._embed_batch(
                [chunk.page_content for chunk in batch],
                [chunk.metadata for chunk in batch],
            )

In [7]:
# ===========================================================================
# PART 4: QDRANT VECTOR DATABASE
# ===========================================================================
# (With 'is_table' index and multi-filter search)
class QdrantManager:
    """Thin wrapper around QdrantClient: collection setup, batched upload, filtered search."""

    def __init__(self):
        """Open a client using the URL and API key from Config."""
        print(f"\nConnecting to Qdrant Cloud...")
        self.client = QdrantClient(url=Config.QDRANT_URL, api_key=Config.QDRANT_API_KEY)
        print(f"   ✓ Connected to Qdrant")

    def create_collection(self, collection_name: str = Config.COLLECTION_NAME, vector_size: int = Config.VECTOR_SIZE):
        """(Re)create the collection with cosine distance and its payload indexes.

        An existing collection with the same name is deleted first. Keyword
        indexes are created for 'ticker' and 'item', a boolean index for 'is_table'.
        """
        print(f"\n Setting up collection: {collection_name}")
        existing_names = {col.name for col in self.client.get_collections().collections}
        if collection_name in existing_names:
            print(f"   ⚠ Collection '{collection_name}' exists, recreating...")
            self.client.delete_collection(collection_name)

        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE),
        )
        print(f"   ✓ Collection created")

        for keyword_field in ("ticker", "item"):
            print(f"   → Creating payload index for '{keyword_field}' (Keyword)...")
            self.client.create_payload_index(
                collection_name=collection_name,
                field_name=keyword_field,
                field_schema=models.PayloadSchemaType.KEYWORD,
            )

        print(f"   → Creating payload index for 'is_table' (Boolean)...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="is_table",
            field_schema=models.PayloadSchemaType.BOOL,
        )
        print(f"   ✓ Payload indexes created.")

    def upsert_documents(self, points_generator, collection_name: str = Config.COLLECTION_NAME, batch_size: int = 2048) -> int:
        """Consume an iterable of points and upsert them in batches.

        Upserts are issued with wait=False. Returns the number of points sent.
        """
        print(f" Uploading chunks to Qdrant in batches of {batch_size}...")
        pending = []
        count = 0
        for point in points_generator:
            pending.append(point)
            if len(pending) < batch_size:
                continue
            self.client.upsert(collection_name=collection_name, points=pending, wait=False)
            count += len(pending)
            print(f"     → Uploaded {count} chunks...")
            pending = []
        if pending:
            # Final partial batch.
            self.client.upsert(collection_name=collection_name, points=pending, wait=False)
            count += len(pending)
        print(f"  ✓ All chunks uploaded. Total: {count}")
        return count

    def search(self, query_vector: List[float], collection_name: str = Config.COLLECTION_NAME, limit: int = Config.RETRIEVAL_TOP_K, filter_dict: Dict = None) -> List[Dict]:
        """Vector search with an optional payload filter.

        Args:
            query_vector: Query embedding.
            collection_name: Collection to search.
            limit: Maximum number of hits.
            filter_dict: Field -> value. A list value matches any of its
                elements; any other value must match exactly. All conditions
                must hold.

        Returns:
            One ``{'score': ..., 'payload': ...}`` dict per hit.
        """
        qdrant_filter = None
        if filter_dict:
            conditions = []
            for key, value in filter_dict.items():
                if isinstance(value, list):
                    match = models.MatchAny(any=value)
                else:
                    # Booleans and scalars share the same exact-match condition.
                    match = models.MatchValue(value=value)
                conditions.append(models.FieldCondition(key=key, match=match))
            qdrant_filter = models.Filter(must=conditions)

        hits = self.client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=qdrant_filter,
            with_payload=True,
        )
        return [{'score': hit.score, 'payload': hit.payload} for hit in hits]

In [8]:
# ===========================================================================
# PART 5: QUERY PROCESSOR MODULE
# ===========================================================================
# (Full QueryProcessor logic from query_processors.ipynb, with the BUG FIX)

# Feature flags read by the model loaders and helpers later in this cell.
# USE_SBERT gates loading the SentenceTransformer and computing query embeddings.
USE_SBERT = True
# USE_FLAN_T5 gates loading Flan-T5 and generating query paraphrases.
USE_FLAN_T5 = True

def normalize(text: str) -> str:
    """Return `text` NFKC-normalized, trimmed, with whitespace runs collapsed to one space."""
    nfkc = unicodedata.normalize("NFKC", text)
    trimmed = nfkc.strip()
    return re.sub(r"\s+", " ", trimmed)

# A token is either an alphanumeric word (optionally with one apostrophe
# suffix such as "don't" or "Tesla's") or a run of the symbols & $ % . -
TOKEN_RE = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?|[&$%.\-]+")
def simple_tokenize(text: str) -> List[str]:
    """Split `text` into word and symbol tokens, dropping possessive suffixes.

    A trailing "'s" is removed from a token regardless of case ("Tesla's" and
    "TESLA'S" both lose the suffix). Other apostrophe forms such as "don't"
    are kept intact. Characters outside TOKEN_RE are skipped.
    """
    tokens = []
    for tok in TOKEN_RE.findall(text):
        # TOKEN_RE never yields a bare "'s" (a word token must start with an
        # alphanumeric), so only the suffix case needs handling.
        if tok[-2:].lower() == "'s":
            tok = tok[:-2]
        tokens.append(tok)
    return tokens

# Lower-case trigger term -> phrases added to the query when that term appears.
DOMAIN_SYNONYMS = {
    "risk": ["risk factor", "risk factors", "uncertainty", "exposure", "threat"],
    "cyber": ["cybersecurity", "information security", "infosec", "data breach", "security incident"],
    "performance": ["revenue", "growth", "margin", "profit", "loss", "guidance", "results"],
    "strategy": ["roadmap", "plan", "initiative", "expansion", "capex", "restructuring", "acquisition"],
    "md&a": ["management discussion", "md&a", "results of operations"],
}
# Lower-case company alias -> ticker symbol.
COMPANY_TICKERS = {
    "tesla": "TSLA",
    "apple": "AAPL",
    "microsoft": "MSFT",
    "nvidia": "NVDA",
    "google": "GOOGL",
    "alphabet": "GOOGL",
    "meta": "META",
    "amazon": "AMZN",
    "amd": "AMD",
    "oracle": "ORCL",
    "jpmorgan": "JPM",
    "jpm": "JPM",
}

def keyword_expand(tokens: List[str]) -> List[str]:
    """Return the domain synonyms triggered by `tokens`, de-duplicated in first-seen order.

    Each token is stripped of leading/trailing '.' and '-' and lower-cased
    before it is looked up in DOMAIN_SYNONYMS.
    """
    triggered = (
        synonym
        for token in tokens
        for synonym in DOMAIN_SYNONYMS.get(token.strip(".-").lower(), [])
    )
    # dict.fromkeys keeps the first occurrence of each synonym, in order.
    return list(dict.fromkeys(triggered))

def build_keywords(tokens: List[str], expansions: List[str]) -> List[str]:
    """Merge tokens and expansions into a lower-cased, de-duplicated keyword list.

    Entries without any letter or digit (pure symbol tokens) are dropped.
    First-seen order is preserved.
    """
    has_alnum = re.compile(r"[a-z0-9]").search
    lowered = (term.lower() for term in tokens + expansions)
    return list(dict.fromkeys(term for term in lowered if has_alnum(term)))

# Load the spaCy pipeline used for ORG entity detection. On failure `_nlp`
# is set to None and entity extraction falls back to its non-spaCy rules.
try:
    _nlp = spacy.load(Config.SPACY_MODEL)
    print("✓ Loaded spaCy model for NER")
except Exception as e:
    # Report the underlying reason, and point at the configured model rather
    # than a hard-coded name so the hint stays correct if the config changes.
    print(f"✗ Failed to load spaCy model '{Config.SPACY_MODEL}'. NER will be limited. ({e})")
    print(f"  Run: python -m spacy download {Config.SPACY_MODEL}")
    _nlp = None

def extract_entities(raw_text: str) -> dict:
    """Extract quarters, years, company names and tickers from a query string.

    Returns a dict that contains only the keys for which something was found:
        'quarter': e.g. ["Q3 2024"], in order of appearance
        'year':    sorted unique four-digit years (1900-2049)
        'company': sorted names from spaCy ORG entities (when `_nlp` is loaded)
                   plus title-cased aliases from COMPANY_TICKERS
        'ticker':  sorted symbols from known company aliases and from the
                   patterns "$XXX", "(XXX)" and "NASDAQ: XXX" / "NYSE: XXX"
    """
    out = {}
    low = raw_text.lower()

    quarter_hits = re.findall(r"\b(q[1-4])\s*([12][0-9]{3})\b", low)
    if quarter_hits:
        out["quarter"] = [f"{p.upper()} {y}" for p, y in quarter_hits]

    years = re.findall(r"\b(20[0-4][0-9]|19[0-9]{2})\b", raw_text)
    if years:
        out["year"] = sorted(set(years))

    companies = set()
    if _nlp is not None:
        for ent in _nlp(raw_text).ents:
            if ent.label_ == "ORG":
                companies.add(ent.text.strip())

    # Match aliases as whole words only: a plain substring test would find
    # "meta" in "metadata" or "apple" in "pineapple" and create a bogus
    # ticker filter.
    for name in COMPANY_TICKERS:
        if re.search(rf"\b{re.escape(name)}\b", low):
            companies.add(name.title())

    if companies:
        out["company"] = sorted(companies)

    tickers = {COMPANY_TICKERS[c.lower()] for c in companies if c.lower() in COMPANY_TICKERS}
    tickers.update(re.findall(r"\$([A-Z]{1,5})\b", raw_text))
    tickers.update(re.findall(r"\(([A-Z]{1,5})\)", raw_text))
    tickers.update(re.findall(r"\b(?:NASDAQ|NYSE)\s*:\s*([A-Z]{1,5})\b", raw_text))
    tickers.discard("")
    if tickers:
        out["ticker"] = sorted(tickers)
    return out

# Load the query-embedding model. `_sbert` stays None when the feature is
# disabled or loading fails; sbert_embed() then returns None.
try:
    _sbert = SentenceTransformer(Config.EMBEDDING_MODEL) if USE_SBERT else None
    if _sbert is not None:
        print(f"✓ Loaded SentenceTransformer model '{Config.EMBEDDING_MODEL}'")
    else:
        # Do not claim success when the model was deliberately not loaded.
        print("• SentenceTransformer disabled (USE_SBERT=False)")
except Exception as e:
    _sbert = None
    print(f"✗ Failed to load SentenceTransformer: {e}")

def sbert_embed(text: str) -> Optional[List[float]]:
    """Return the normalized embedding of `text` as a list, or None when no model is loaded."""
    if _sbert is None:
        return None
    vectors = _sbert.encode([text], normalize_embeddings=True)
    return vectors[0].tolist()

# Load the Flan-T5 tokenizer and model used for paraphrasing. All three
# globals are always defined; tokenizer and model stay None when the feature
# is disabled or loading fails.
_flan_tok = _flan_mdl = None
_flan_device = "cpu"
if USE_FLAN_T5:
    try:
        _flan_device = "cuda" if torch.cuda.is_available() else "cpu"
        _flan_tok = AutoTokenizer.from_pretrained(Config.FLAN_T5_MODEL_NAME)
        _flan_mdl = AutoModelForSeq2SeqLM.from_pretrained(Config.FLAN_T5_MODEL_NAME).to(_flan_device).eval()
        print(f"✓ Loaded Flan-T5 model '{Config.FLAN_T5_MODEL_NAME}' on {_flan_device}")
    except Exception as e:
        # Reset everything so a half-loaded state (tokenizer without model) is never used.
        _flan_tok = _flan_mdl = None
        _flan_device = "cpu"
        print(f"✗ Failed to load Flan-T5 model: {e}")
else:
    # Do not claim success when the model was deliberately not loaded.
    print("• Flan-T5 paraphrasing disabled (USE_FLAN_T5=False)")

def t5_paraphrases_safe(q: str, num_return: int = 3, max_new_tokens: int = 48) -> List[str]:
    """Sample paraphrases of `q` from Flan-T5; returns [] when the model is unavailable.

    Outputs are normalized, de-duplicated in generation order, and any output
    that equals the query (ignoring case and non-word characters) is dropped.
    At most `num_return` paraphrases are returned. Generation uses sampling,
    so results vary between calls.
    """
    if not USE_FLAN_T5 or _flan_tok is None or _flan_mdl is None:
        return []

    def _canonical(s: str) -> str:
        # Comparison key: non-word runs become one space, lower-cased.
        return re.sub(r"\W+"," ", s).strip().lower()

    prompt = f"Rewrite the query into multiple short paraphrases without adding facts or numbers. Keep meaning.\nQuery: {q}"
    encoded = _flan_tok(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    encoded = {name: tensor.to(_flan_device) for name, tensor in encoded.items()}
    with torch.no_grad():
        generated = _flan_mdl.generate(
            **encoded,
            do_sample=True,
            top_k=50,
            top_p=0.92,
            temperature=0.9,
            num_return_sequences=num_return,
            max_new_tokens=max_new_tokens,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
        )
    decoded = _flan_tok.batch_decode(generated, skip_special_tokens=True)

    query_key = _canonical(q)
    unique = {}  # insertion-ordered set of accepted paraphrases
    for candidate in decoded:
        cleaned = normalize(candidate)
        if _canonical(cleaned) == query_key:
            continue
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)[:num_return]

# Intent labels, in the order used for the keyword priors.
INTENT_LABELS = ["risk","performance","strategy"]

# Tiny seed training set: two example queries per intent.
X_train = [
    "What new risk factors?",
    "Cybersecurity breach Tesla",
    "Explain Apple revenue growth",
    "Compare Microsoft profit guidance",
    "Outline Nvidia expansion strategy",
    "What restructuring plan?",
]
y_train = [
    "risk",
    "risk",
    "performance",
    "performance",
    "strategy",
    "strategy",
]

# TF-IDF (unigrams + bigrams) feeding a one-vs-rest logistic regression.
_intent_clf = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=1)),
    ("lr", LogisticRegression(max_iter=300, class_weight="balanced", multi_class="ovr")),
]).fit(X_train, y_train)

# Keyword sets used as priors alongside the classifier, one per intent.
RISK_KW = {
    "risk","risk factor","risk factors","uncertainty","cyber",
    "cybersecurity","breach","litigation","security",
}
PERF_KW = {
    "revenue","growth","margin","profit","loss","guidance",
    "results","compare","last quarter","quarterly",
}
STRAT_KW= {
    "strategy","plan","roadmap","expansion","acquisition",
    "restructuring","capex","data center","data centers",
}
def _kw_score(t: str, kws: set[str]) -> int: return sum(1 for k in kws if k in t)
def classify_intent(text: str) -> Tuple[str, float]:
    """Classify a query as one of INTENT_LABELS and return (label, confidence).

    The score blends the classifier probability (weight 0.6) with a keyword
    prior (weight 0.4), where the prior is each intent's share of keyword hits.
    The blended scores are renormalized to sum to 1; the confidence is the
    winning label's share.
    """
    tx = normalize(text)

    # predict_proba columns follow the classifier's `classes_` order (sorted
    # labels), which differs from INTENT_LABELS. Re-key by label so each
    # probability is paired with the right intent.
    raw = _intent_clf.predict_proba([tx])[0]
    proba_by_label = {str(lbl): float(p) for lbl, p in zip(_intent_clf.classes_, raw)}

    # Keyword sets are lower-case, so match them against lower-cased text.
    tx_low = tx.lower()
    hits_by_label = {
        "risk": _kw_score(tx_low, RISK_KW),
        "performance": _kw_score(tx_low, PERF_KW),
        "strategy": _kw_score(tx_low, STRAT_KW),
    }
    k_sum = max(1, sum(hits_by_label.values()))  # avoid dividing by zero when nothing matches

    alpha, beta = 0.6, 0.4
    blended = [
        alpha * proba_by_label.get(lbl, 0.0) + beta * (hits_by_label.get(lbl, 0) / k_sum)
        for lbl in INTENT_LABELS
    ]
    s = sum(blended) or 1.0
    blended = [b / s for b in blended]
    idx = max(range(len(INTENT_LABELS)), key=lambda i: blended[i])
    return INTENT_LABELS[idx], float(blended[idx])

def expand_query(query: str) -> dict:
    """Normalize a query and expand it with domain synonyms and model paraphrases.

    Returns a dict with keys 'normalized', 'tokens', 'expansions' (synonyms
    triggered by the query and by its paraphrases, de-duplicated in order),
    'paraphrases', and 'keywords'.
    """
    norm = normalize(query)
    toks = simple_tokenize(norm)
    lexical = keyword_expand(toks)

    if USE_FLAN_T5:
        paras = t5_paraphrases_safe(norm, num_return=3, max_new_tokens=48)
    else:
        paras = []

    # Unique tokens across all paraphrases, first-seen order.
    para_tokens = list(dict.fromkeys(
        tok for para in paras for tok in simple_tokenize(para)
    ))
    from_paraphrases = keyword_expand(para_tokens) if para_tokens else []

    # Query-derived synonyms come first, then paraphrase-derived ones.
    expansions = list(dict.fromkeys(lexical + from_paraphrases))

    return {
        "normalized": norm,
        "tokens": toks,
        "expansions": expansions,
        "paraphrases": paras,
        "keywords": build_keywords(toks, expansions),
    }

@dataclass
class QueryProcessorConfig:
    """Settings for QueryProcessor."""

    # Intent labels; the factory gives every instance its own list.
    labels: List[str] = field(
        default_factory=lambda: list(("risk", "performance", "strategy"))
    )
class QueryProcessor:
    """Turn a raw user query into normalized text, intent, expansions, entities and an embedding."""

    def __init__(self, config: Optional[QueryProcessorConfig] = None):
        """Store the configuration.

        Args:
            config: Settings object. A fresh QueryProcessorConfig is created
                when omitted, so instances never share one default object.
        """
        self.config = config if config is not None else QueryProcessorConfig()
        print("✓ QueryProcessor initialized")

    def process(self, query: str) -> Dict[str, Any]:
        """Run the full pre-retrieval pipeline on one query.

        Returns:
            A dict with keys 'normalized', 'label', 'confidence', 'expansions',
            'paraphrases', 'keywords', 'entities', 'filters' and 'embedding'.
            'filters' starts as a copy of 'entities'; 'embedding' is None when
            SBERT is disabled or unavailable.
        """
        ex = expand_query(query)
        # Entities are extracted from the raw query, not the normalized text.
        ents = extract_entities(query)
        label, conf = classify_intent(ex["normalized"])
        emb = sbert_embed(ex["normalized"]) if USE_SBERT else None
        # Copy list values too, so editing a filter cannot alter 'entities'.
        filters = {k: (list(v) if isinstance(v, list) else v) for k, v in ents.items()}
        return {
            "normalized": ex["normalized"],
            "label": label,
            "confidence": conf,
            "expansions": ex["expansions"],
            "paraphrases": ex["paraphrases"],
            "keywords": ex["keywords"],
            "entities": ents,
            "filters": filters,
            "embedding": emb,
        }

✓ Loaded spaCy model for NER
✓ Loaded SentenceTransformer model 'sentence-transformers/all-MiniLM-L6-v2'
✓ Loaded Flan-T5 model 'google/flan-t5-small' on cpu


In [9]:
# ===========================================================================
# PART 6: RAG QUERY ENGINE (MODEL 5)
# ===========================================================================
# This is the new, combined engine.
class FullAdvancedRAGEngine:
    """End-to-end query engine: query processing, filtered retrieval, re-ranking, generation.

    Pipeline per query:
      1. QueryProcessor extracts entities and an embedding.
      2. Filters are built from detected tickers and an LLM "table router",
         or from a manual ticker override.
      3. Qdrant returns Config.RETRIEVAL_TOP_K candidates.
      4. A cross-encoder re-ranks them down to Config.FINAL_TOP_K.
      5. The LLM answers from the re-ranked context only.
    """

    def __init__(self, document_processor: DocumentProcessor,
                 qdrant_manager: QdrantManager,
                 query_processor: QueryProcessor):
        """Attach the shared components and load the cross-encoder, LLM and prompts.

        Args:
            document_processor: Supplies the embedding model used as a fallback
                when the query processor returns no embedding.
            qdrant_manager: Vector store wrapper used for retrieval.
            query_processor: Pre-retrieval processor (entities, embedding).
        """
        print("\n Initializing Full Advanced RAG Engine (Model 5)...")

        # --- All Components ---
        self.embedding_model = document_processor.model
        self.qdrant_manager = qdrant_manager
        self.query_processor = query_processor
        print("   ✓ Processor, QdrantManager, and QueryProcessor attached.")

        # --- Post-Retrieval ---
        print(f"   → Loading Cross-Encoder: {Config.CROSS_ENCODER_MODEL}...")
        self.cross_encoder = CrossEncoder(Config.CROSS_ENCODER_MODEL)
        print("   ✓ Cross-Encoder model loaded.")

        # --- LLM & Prompts ---
        self.llm = ChatOpenAI(model=Config.LLM_MODEL, api_key=Config.OPENAI_API_KEY, temperature=0)
        print("   ✓ Initialized ChatOpenAI LLM")

        # Generation Prompt
        template = """You are a helpful financial analyst assistant. Your role is to answer questions about SEC 10-Q filings based ONLY on the provided context.
- Base your answer strictly on the provided context from SEC filings
- Cite specific sections (e.g., "According to Item 1A...") when referencing information
- If the answer is not in the context, clearly state that

Context:
<context>
{context}
</context>

Question: {input}

Answer:"""
        self.prompt = ChatPromptTemplate.from_template(template)

        # Router Prompt: asks the LLM whether the question wants table data.
        router_template = """Your job is to classify a user's question.
Does the question ask for a specific numerical value, metric, or data from a financial table?
(e.g., "What was revenue?", "How much was R&D spending?", "Show me the cash flow")

Answer only with 'yes' or 'no'.

Question: {question}
Answer:"""
        self.table_query_router_prompt = ChatPromptTemplate.from_template(router_template)
        self.query_router_chain = self.table_query_router_prompt | self.llm | StrOutputParser()
        print("   ✓ Full Advanced RAG Engine ready.")

    def _format_context(self, search_results: List[Dict]) -> str:
        """Render results as numbered "Source i (TICKER - ITEM)" blocks for the prompt.

        Table chunks are tagged with "[TABLE DATA]". Missing payload fields
        fall back to 'N/A' (ticker, item) or 'No text'.
        """
        blocks = []
        for i, result in enumerate(search_results, 1):
            payload = result.get('payload', {})
            table_tag = "[TABLE DATA] " if payload.get('is_table', False) else ""
            blocks.append(
                f"Source {i} ({payload.get('ticker','N/A')} - {payload.get('item','N/A')}) {table_tag}:\n\"{payload.get('text','No text')}\""
            )
        return "\n\n".join(blocks).strip()

    def _wants_table_data(self, question: str) -> bool:
        """Ask the router LLM whether the question targets table data; False on any error."""
        print("   → Routing for table data...")
        try:
            result = self.query_router_chain.invoke({"question": question})
            if 'yes' in result.lower():
                print("   → Query routed to TABLE search.")
                return True
            print("   → Query routed to GENERAL text search.")
        except Exception as e:
            # Best effort: a routing failure must not fail the whole query.
            print(f"   ⚠ Error in query routing: {e}. Defaulting to general search.")
        return False

    def _build_filters(self, question: str, q_obj: Dict[str, Any], ticker_filter: Optional[str]) -> Dict[str, Any]:
        """Build the payload filter for retrieval.

        A manual `ticker_filter` replaces every automatic filter, so the router
        LLM is not called in that case (its answer would be discarded).
        """
        if ticker_filter:
            print(f"   → MANUAL OVERRIDE: Applying filter: {ticker_filter}")
            return {"ticker": ticker_filter.upper()}

        filters = {}
        # a) Tickers detected by the query processor
        if "ticker" in q_obj["filters"]:
            filters["ticker"] = q_obj["filters"]["ticker"]
            print(f"   → NER auto-detected filter: {filters}")
        # b) Table router
        if self._wants_table_data(question):
            filters["is_table"] = True

        if not filters:
            print("   → No filters applied. Searching all documents.")
        return filters

    def query(self, question: str, ticker_filter: Optional[str] = None):
        """Answer a question from the indexed filings.

        Args:
            question: Natural-language question.
            ticker_filter: Optional ticker; when given it replaces all
                automatically derived filters.

        Returns:
            ``{'answer': str, 'sources': list}``. Each source has 'ticker',
            'company', 'item', 'is_table', 'filing_date', 'rerank_score' and
            'retrieval_score'. When nothing is retrieved the answer is
            'No relevant context found.' and sources is empty.
        """
        print(f"\n Processing query with Full Advanced Engine: '{question}'")

        # 1. --- PRE-RETRIEVAL + NER (QueryProcessor) ---
        print("   → Step 1: Processing query (NER, Intent, Embedding)...")
        q_obj = self.query_processor.process(question)

        query_vector = q_obj["embedding"]
        if query_vector is None:
            print("   → Warning: QP embedding failed, using base model embedding.")
            query_vector = self.embedding_model.encode(question).tolist()

        # 2. --- FILTER GENERATION (NER + ROUTER, or manual override) ---
        print("   → Step 2: Generating filters...")
        final_filter_dict = self._build_filters(question, q_obj, ticker_filter)

        # 3. --- RETRIEVAL ---
        print(f"   → Step 3: Searching Qdrant (Retrieving Top {Config.RETRIEVAL_TOP_K})...")
        search_results = self.qdrant_manager.search(
            query_vector=query_vector,
            limit=Config.RETRIEVAL_TOP_K,
            filter_dict=final_filter_dict if final_filter_dict else None
        )
        if not search_results and final_filter_dict.get("is_table"):
            # The router is only a guess; if no table chunk matches, fall back
            # to the same search without the table restriction.
            print("   → No table chunks matched. Retrying without the table filter...")
            relaxed_filter = {k: v for k, v in final_filter_dict.items() if k != "is_table"}
            search_results = self.qdrant_manager.search(
                query_vector=query_vector,
                limit=Config.RETRIEVAL_TOP_K,
                filter_dict=relaxed_filter if relaxed_filter else None
            )
        if not search_results:
            return {'answer': 'No relevant context found.', 'sources': []}
        print(f"   → Retrieved {len(search_results)} candidates.")

        # 4. --- POST-RETRIEVAL (Re-ranking) ---
        print(f"   → Step 4: Re-ranking candidates with Cross-Encoder...")
        query_passage_pairs = [(question, result['payload'].get('text', '')) for result in search_results]
        cross_encoder_scores = self.cross_encoder.predict(query_passage_pairs)

        # Stable sort: ties keep their original retrieval order.
        scored_results = sorted(
            zip(cross_encoder_scores, search_results),
            key=lambda pair: pair[0],
            reverse=True,
        )

        final_sources = []
        for score, result in scored_results[:Config.FINAL_TOP_K]:
            # Build a new dict instead of writing scores into the retrieved payload.
            final_sources.append({
                **result['payload'],
                'retrieval_score': result['score'],  # original vector score
                'rerank_score': float(score),        # cross-encoder score
            })
        print(f"   → Re-ranked. Final {len(final_sources)} sources selected.")

        # 5. --- GENERATION ---
        print("   → Step 5: Formatting context and sending to LLM...")
        context_results = [{'score': data['rerank_score'], 'payload': data} for data in final_sources]
        formatted_context = self._format_context(context_results)
        final_prompt_message = self.prompt.format_messages(context=formatted_context, input=question)

        llm_response = self.llm.invoke(final_prompt_message)
        answer = llm_response.content

        # 6. --- FORMAT OUTPUT ---
        sources_output = [
            {
                'ticker': source_data.get('ticker'),
                'company': source_data.get('company_name'),
                'item': source_data.get('item'),
                'is_table': source_data.get('is_table'),
                'filing_date': source_data.get('filing_date'),
                'rerank_score': source_data['rerank_score'],
                'retrieval_score': source_data['retrieval_score'],
            }
            for source_data in final_sources
        ]
        return {'answer': answer, 'sources': sources_output}

In [10]:
# ===========================================================================
# PART 7: MAIN PIPELINE ORCHESTRATOR
# ===========================================================================
class SECFilingRAGPipeline:
    """Top-level orchestrator: index SEC 10-Q filings into Qdrant and answer questions.

    The constructor wires together the loader, document processor, Qdrant
    manager and query processor. The query engine is created lazily on the
    first call to :meth:`query`.
    """

    def __init__(self):
        """Print the system banner and instantiate the pipeline components."""
        print("=" * 70)
        print("SEC 10-Q RAG SYSTEM (Model 5: Full Advanced)")
        print("=" * 70)
        self.loader = SECDocumentLoader()
        self.processor = DocumentProcessor()
        self.qdrant_manager = QdrantManager()
        self.query_processor = QueryProcessor()
        self.query_engine = None  # built on first query()

    @staticmethod
    def _format_score(score) -> str:
        """Render a score with four decimals, or 'N/A' when it is missing/non-numeric."""
        try:
            return f"{float(score):.4f}"
        except (TypeError, ValueError):
            return 'N/A'

    def load_and_index_filings(self, tickers: Optional[List[str]] = None, num_filings_per_ticker: int = 4):
        """Download, parse, chunk and upsert recent 10-Q filings for each ticker.

        Args:
            tickers: Ticker symbols to process. ``None`` (the default) means
                ``Config.TICKERS``, looked up at call time so that re-running
                the configuration cell takes effect without redefining this class.
            num_filings_per_ticker: How many recent 10-Q filings to request per ticker.

        Returns:
            None. Progress and a final summary are printed. Indexing is skipped
            entirely when the collection already reports at least one document.

        Errors for a single filing or a single ticker are caught, printed and
        recorded; they do not abort the remaining work.
        """
        if tickers is None:
            tickers = Config.TICKERS

        print(f"\n{'=' * 70}\nLOADING & INDEXING PHASE\n{'=' * 70}")
        print(f"\nProcessing {len(tickers)} companies: {', '.join(tickers)}")
        print(f"(Fetching {num_filings_per_ticker} filings per company)\n")

        # This pipeline *MUST* index into a new collection
        # NOTE(review): whether create_collection() keeps or recreates an existing
        # collection is not visible here; the skip check below only helps if it keeps it.
        self.qdrant_manager.create_collection()

        # Check if it's already indexed (e.g., if script was interrupted)
        try:
            count = self.qdrant_manager.client.count(Config.COLLECTION_NAME, exact=True)
            if count.count > 0:
                print(f"Collection '{Config.COLLECTION_NAME}' already has {count.count} docs. Skipping indexing.")
                print(f"{'=' * 70}\n")
                return
        except Exception as e:
            # An empty collection yields count == 0 rather than an exception, so
            # reaching this branch means the count call itself failed.
            print(f"Could not read the document count ({e}). Proceeding with indexing.")

        successful_tickers, failed_tickers = [], []
        total_chunks_indexed = 0
        for idx, ticker in enumerate(tickers, 1):
            print(f"\n[{idx}/{len(tickers)}] Processing {ticker}")
            print("-" * 70)
            ticker_chunks_count = 0
            try:
                filings_metadata_list = self.loader.get_recent_10q_metadata(ticker, num_filings=num_filings_per_ticker)
                for filing_metadata in filings_metadata_list:
                    html_content = parsed_data = chunks_generator = None
                    try:
                        print(f"  → Downloading filing from: {filing_metadata['filing_date']}...")
                        html_content = self.loader.get_filing_html(filing_metadata['filing_url'])
                        print(f"  → Parsing 10-Q structure...")
                        parsed_data = self.loader.parse_10q(html_content)
                        if not parsed_data:
                            print(f"  ⚠ Warning: No data parsed for {ticker}")
                            del html_content
                            gc.collect()
                            continue
                        print(f"  → Creating chunks and embeddings generator...")
                        chunks_generator = self.processor.generate_document_chunks(parsed_data, filing_metadata)
                        num_uploaded = self.qdrant_manager.upsert_documents(chunks_generator)
                        if num_uploaded > 0:
                            ticker_chunks_count += num_uploaded
                            total_chunks_indexed += num_uploaded
                        print(f"  → Cleaning up memory...")
                        del html_content, parsed_data, chunks_generator
                        gc.collect()
                        print(f"  ✓ Memory cleaned.")
                    except Exception as e:
                        print(f"  ✗ Error processing filing for {ticker}: {e}")
                        # Drop the references first; otherwise the collector cannot
                        # reclaim the filing's data until the next iteration rebinds them.
                        html_content = parsed_data = chunks_generator = None
                        gc.collect()
                if ticker_chunks_count > 0:
                    successful_tickers.append(ticker)
                    print(f"  ✓ Finished {ticker}. Chunks: {ticker_chunks_count}")
                else:
                    failed_tickers.append(ticker)
                    print(f"  ⚠ No chunks created for {ticker}")
            except Exception as e:
                print(f"  ✗ Error processing {ticker}: {e}")
                failed_tickers.append(ticker)
            # Short pause between tickers (presumably to go easy on the SEC endpoints).
            if idx < len(tickers):
                time.sleep(0.2)

        print(f"\n{'=' * 70}\nINDEXING COMPLETE\n{'=' * 70}")
        print(f"✓ Success: {len(successful_tickers)} companies ({', '.join(successful_tickers)})")
        if failed_tickers:
            print(f"✗ Failed: {len(failed_tickers)} companies ({', '.join(failed_tickers)})")
        print(f"\n Total chunks indexed: {total_chunks_indexed}\n{'=' * 70}\n")

    def query(self, question: str, ticker_filter: Optional[str] = None):
        """Answer a question, print the answer with its sources, and return the result.

        Args:
            question: Natural-language question forwarded to the query engine.
            ticker_filter: Optional value forwarded unchanged to the engine's ``query``.

        Returns:
            The dict produced by the engine; this method reads its ``'answer'``
            and ``'sources'`` entries for printing.
        """
        if self.query_engine is None:
            self.query_engine = FullAdvancedRAGEngine(
                document_processor=self.processor,
                qdrant_manager=self.qdrant_manager,
                query_processor=self.query_processor
            )
        result = self.query_engine.query(question, ticker_filter)
        print(f"\n{'=' * 70}\nANSWER\n{'=' * 70}\n\n{result['answer']}\n")
        print(f"{'=' * 70}\nSOURCES ({len(result['sources'])})\n{'=' * 70}")
        for i, source in enumerate(result['sources'], 1):
            print(f"\n{i}. {source['company']} ({source['ticker']}) - {source['item']}")
            print(f"   Filing Date: {source['filing_date']}")
            print(f"   Is Table: {source.get('is_table')}")
            # A missing score must not be pushed through a float format spec:
            # applying ':.4f' to the string 'N/A' raises ValueError.
            print(f"   Re-Rank Score: {self._format_score(source.get('rerank_score'))}")
            print(f"   Vector Score: {self._format_score(source.get('retrieval_score'))}")
        print(f"\n{'=' * 70}\n")
        return result

In [11]:
# ===========================================================================
# PART 8: USAGE EXAMPLE (Saving all results)
# ===========================================================================
if __name__ == "__main__":
    # Build the pipeline object. It is bound at notebook scope and the query
    # cell further down calls `pipeline.query(...)` on it.
    pipeline = SECFilingRAGPipeline()
    
    # NOTE: Indexing targets the 'sec_filings_10q_v_full' collection.
    # The indexing step has to run once before querying; on later runs
    # load_and_index_filings() returns early when the collection already
    # reports documents.
    print("Checking if indexing is needed for the 'v_full' collection...")
    pipeline.load_and_index_filings(num_filings_per_ticker=4)
    print("Indexing check complete. Proceeding to queries.")

SEC 10-Q RAG SYSTEM (Model 5: Full Advanced)

 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
   ✓ Model loaded (dimension: 384)
   ✓ Initialized RecursiveCharacterTextSplitter

Connecting to Qdrant Cloud...
   ✓ Connected to Qdrant
✓ QueryProcessor initialized
Checking if indexing is needed for the 'v_full' collection...

LOADING & INDEXING PHASE

Processing 10 companies: NVDA, AAPL, MSFT, AMZN, META, GOOGL, TSLA, ORCL, JPM, AMD
(Fetching 4 filings per company)


 Setting up collection: sec_filings_10q_v_full
   ✓ Collection created
   → Creating payload index for 'ticker' (Keyword)...
   → Creating payload index for 'item' (Keyword)...
   → Creating payload index for 'is_table' (Boolean)...
   ✓ Payload indexes created.

[1/10] Processing NVDA
----------------------------------------------------------------------
  → Fetching CIK for ticker: NVDA...
  → Found CIK: 0001045810 (NVIDIA CORP)
  → Found 4 recent 10-Q filing metadata entries.
  → Downloading filing from: 2

In [12]:
# Directory that receives the JSON dump of all question/answer results.
DATA_FOLDER = "data"
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)
    print(f"Created data folder: {DATA_FOLDER}")

# Evaluation questions. None of them supplies a manual ticker filter, so each
# entry carries ticker_filter=None.
queries_to_run = [
    {"question": question_text, "ticker_filter": None}
    for question_text in (
        "What are the main risk factors mentioned by each companies?",
        "What risks did Apple disclose in their latest 10-Q?",
        "Compare the revenue trends of NVIDIA and AMD",
        "What was Tesla's R&D spending in the latest quarter?",
        "How has Microsoft's operating income changed over the last year?",
        "What was the gross profit margin for all companies?",
    )
]

print(f"\nRunning {len(queries_to_run)} queries with (Model 5) Full Advanced RAG...")
all_results = []
for query in queries_to_run:
    print(f"\n--- Running Query: {query['question']} ---")
    # The dict keys match pipeline.query's parameter names, so it can be unpacked directly.
    result = pipeline.query(**query)
    all_results.append(
        dict(
            question=query["question"],
            ticker_filter_manual=query["ticker_filter"],
            response=result,
        )
    )

# Persist every collected response in a single JSON file.
save_path = os.path.join(DATA_FOLDER, "RAG_full_pipeline_results.json")
print(f"\n--- All queries complete. Saving all results to: {save_path} ---")
try:
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=4)
    print("✓ Successfully saved all answers.")
except Exception as e:
    print(f"✗ Failed to save results: {e}")
print("\nPipeline run finished.")


Running 6 queries with (Model 5) Full Advanced RAG...

--- Running Query: What are the main risk factors mentioned by each companies? ---

 Initializing Full Advanced RAG Engine (Model 5)...
   ✓ Processor, QdrantManager, and QueryProcessor attached.
   → Loading Cross-Encoder: cross-encoder/ms-marco-MiniLM-L-6-v2...
   ✓ Cross-Encoder model loaded.
   ✓ Initialized ChatOpenAI LLM
   ✓ Full Advanced RAG Engine ready.

 Processing query with Full Advanced Engine: 'What are the main risk factors mentioned by each companies?'
   → Step 1: Processing query (NER, Intent, Embedding)...
   → Step 2: Generating filters...
   → Routing for table data...
   → Query routed to GENERAL text search.
   → No filters applied. Searching all documents.
   → Step 3: Searching Qdrant (Retrieving Top 20)...
   → Retrieved 20 candidates.
   → Step 4: Re-ranking candidates with Cross-Encoder...
   → Re-ranked. Final 5 sources selected.
   → Step 5: Formatting context and sending to LLM...

ANSWER

According