In [17]:
import os
import re
import uuid
import time
import requests
import gc
import json
import unicodedata
import pathlib
from typing import List, Dict, Tuple, Any, Optional
from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString
from dataclasses import dataclass, field

# External libraries
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams, PointStruct
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Imports for QueryProcessor
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

load_dotenv()

True

In [18]:
# ===========================================================================
# PART 1: CONFIGURATION & SETUP
# ===========================================================================
# (From base_RAG.ipynb)

class Config:
    """Central settings: service credentials, model names and indexing parameters."""

    # --- Credentials / endpoints, read from the environment at class-definition time ---
    QDRANT_URL = os.environ.get("QDRANT_URL")
    QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

    # --- SEC access: header sent with every SEC request, and the ticker→CIK map URL ---
    SEC_HEADERS = {'User-Agent': 'EnhancedRAG-Project research@example.com'}
    CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

    # --- Models ---
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dimensional vectors
    LLM_MODEL = "gpt-4o"
    FLAN_T5_MODEL_NAME = "google/flan-t5-small"
    SPACY_MODEL = "en_core_web_sm"  # small, fast spaCy pipeline

    # --- Vector collection ---
    # Same collection name as base_RAG, so an existing index can be reused as-is.
    COLLECTION_NAME = "sec_filings_10q"
    VECTOR_SIZE = 384      # must match the embedding model's output dimension
    CHUNK_SIZE = 800       # passed to the text splitter as chunk_size
    CHUNK_OVERLAP = 200    # passed to the text splitter as chunk_overlap
    TOP_K = 5              # default number of search hits

    # Companies indexed by default.
    TICKERS = [
        'NVDA', 'AAPL', 'MSFT', 'AMZN', 'META',
        'GOOGL', 'TSLA', 'ORCL', 'JPM', 'AMD',
    ]

In [19]:
# ===========================================================================
# PART 2: DOCUMENT LOADING
# ===========================================================================
# (From base_RAG.ipynb)

class SECDocumentLoader:
    """Fetches recent 10-Q filings from the SEC and parses them into Part/Item sections.

    All methods are static or class methods; the class keeps no instance state.
    """

    # Seconds to wait on any SEC HTTP call. `requests` applies no timeout by
    # default, so without this a stalled connection would block indefinitely.
    REQUEST_TIMEOUT = 30

    @staticmethod
    def get_recent_10q_metadata(ticker: str, num_filings: int = 4) -> List[Dict[str, str]]:
        """Resolve a ticker to its CIK and list its most recent 10-Q filings.

        Args:
            ticker: Ticker symbol; compared upper-cased against the SEC mapping.
            num_filings: Stop collecting once this many 10-Q entries were found.

        Returns:
            One dict per filing with keys 'ticker', 'company_name',
            'filing_date', 'cik' and 'filing_url', in the order the
            submissions feed lists them.

        Raises:
            ValueError: Ticker missing from the SEC mapping, or no 10-Q among
                the recent filings.
            requests.HTTPError: Either SEC endpoint answered with an error status.
        """
        symbol = ticker.upper()
        print(f"  → Fetching CIK for ticker: {ticker}...")
        response = requests.get(
            Config.CIK_MAP_URL,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        company_data = response.json()

        cik = None
        company_name = None
        for company in company_data.values():
            if company['ticker'] == symbol:
                cik = str(company['cik_str']).zfill(10)
                company_name = company['title']
                break
        if not cik:
            raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")
        print(f"  → Found CIK: {cik} ({company_name})")

        submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        time.sleep(0.1)  # brief pause between consecutive SEC requests
        response = requests.get(
            submissions_url,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        recent = response.json()['filings']['recent']

        filings_metadata = []
        # The feed stores filings as parallel arrays; index i ties them together.
        for i, form in enumerate(recent['form']):
            if form != '10-Q':
                continue
            accession_number_clean = recent['accessionNumber'][i].replace('-', '')
            filing_url = (
                f"https://www.sec.gov/Archives/edgar/data/{cik}/"
                f"{accession_number_clean}/{recent['primaryDocument'][i]}"
            )
            filings_metadata.append({
                'ticker': symbol,
                'company_name': company_name,
                'filing_date': recent['filingDate'][i],
                'cik': cik,
                'filing_url': filing_url
            })
            if len(filings_metadata) >= num_filings:
                break
        if not filings_metadata:
            raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")
        print(f"  → Found {len(filings_metadata)} recent 10-Q filing metadata entries.")
        return filings_metadata

    @staticmethod
    def get_filing_html(filing_url: str) -> str:
        """Download one filing document and return its body as text.

        Raises:
            requests.HTTPError: The server answered with an error status.
        """
        time.sleep(0.1)  # brief pause between consecutive SEC requests
        response = requests.get(
            filing_url,
            headers=Config.SEC_HEADERS,
            timeout=SECDocumentLoader.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.text

    @staticmethod
    def _normalize_header_text(text: str) -> Optional[str]:
        """Map header text to a canonical key such as 'PART II' or 'ITEM 1A'.

        Returns None when the text does not start with a Part or Item heading.
        """
        text = text.strip().upper()
        part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
        if part_match:
            return re.sub(r'\s+', ' ', part_match.group(1))
        item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
        if item_match:
            return re.sub(r'\s+', ' ', item_match.group(1))
        return None

    @staticmethod
    def _parse_html_table(table_tag) -> str:
        """Render an HTML table as a Markdown table wrapped in newlines.

        The first non-empty row is used as the header; later rows are padded
        or truncated to the header's width. Returns "" when no row has text.
        """
        markdown_rows = []
        for tr in table_tag.find_all('tr'):
            cells = [" ".join(cell.get_text(strip=True).split())
                    for cell in tr.find_all(['td', 'th'])]
            if any(cells):
                markdown_rows.append(cells)
        if not markdown_rows:
            return ""

        header = markdown_rows[0]
        width = len(header)
        md_output = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(['---'] * width) + " |",
        ]
        for row in markdown_rows[1:]:
            # Pad short rows and cut long ones so every line has `width` cells.
            fitted = (row + [""] * (width - len(row)))[:width]
            md_output.append("| " + " | ".join(fitted) + " |")
        return "\n" + "\n".join(md_output) + "\n"

    @classmethod
    def parse_10q(cls, html_content: str) -> Dict:
        """Split a 10-Q HTML document into {part: {item: text}}.

        Header candidates are short <p>/<b>/<strong>/<div> elements whose text
        normalizes to a Part or Item key and which are not inside a link. Each
        Item's content runs from its header to the next detected header; tables
        are rendered as Markdown. Returns {} when no header is found.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        potential_headers = soup.find_all(['p', 'b', 'strong', 'div'])
        doc_headers = []
        for header in potential_headers:
            text = header.get_text(strip=True)
            if len(text) > 100:
                continue  # too long to be a section heading
            normalized_key = cls._normalize_header_text(text)
            if normalized_key and not header.find_parent('a'):
                doc_headers.append({'tag': header, 'key': normalized_key})
        if not doc_headers:
            return {}

        parsed_data = defaultdict(lambda: defaultdict(str))
        current_part_key = None
        for i, header_info in enumerate(doc_headers):
            current_key = header_info['key']
            if 'PART' in current_key:
                current_part_key = current_key
                continue
            if 'ITEM' in current_key:
                if not current_part_key:
                    current_part_key = "PART I"  # Item seen before any Part header
                start_node = header_info['tag']
                end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None
                content_parts = []
                element = start_node.next_element
                # Identity check, not `!=`: bs4 tag equality is structural, so an
                # earlier look-alike tag would otherwise end the section early.
                while element is not None and element is not end_node:
                    if isinstance(element, NavigableString):
                        # Text inside tables is emitted via the table branch instead.
                        if not element.find_parent('table'):
                            text = element.strip()
                            if text:
                                content_parts.append(text)
                    elif element.name == 'table':
                        # Only top-level tables; nested ones are covered by their parent.
                        if not element.find_parent('table'):
                            table_markdown = cls._parse_html_table(element)
                            if table_markdown:
                                content_parts.append(table_markdown)
                    element = element.next_element
                full_content = "\n".join(content_parts)
                clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()
                # A repeated key overwrites the earlier entry for the same item.
                parsed_data[current_part_key][current_key] = clean_content
        return {part: dict(items) for part, items in parsed_data.items()}

In [20]:
# ===========================================================================
# PART 3: TEXT CHUNKING & EMBEDDING
# ===========================================================================
# (From base_RAG.ipynb)

class DocumentProcessor:
    """Splits parsed filing sections into chunks and embeds them as Qdrant points."""

    def __init__(self, embedding_model_name: str = Config.EMBEDDING_MODEL):
        """Load the sentence-embedding model and configure the text splitter.

        Args:
            embedding_model_name: Model identifier passed to SentenceTransformer.
        """
        print(f"\n Loading embedding model: {embedding_model_name}")
        self.model = SentenceTransformer(embedding_model_name)
        print(f"   ✓ Model loaded (dimension: {self.model.get_sentence_embedding_dimension()})")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            add_start_index=False,
        )
        print(f"   ✓ Initialized RecursiveCharacterTextSplitter")

    def _embed_batch(self, chunks):
        """Embed one batch of chunk Documents and yield a PointStruct for each.

        Each point gets a fresh random UUID; its payload is the chunk text
        under 'text' merged with the chunk's metadata.
        """
        texts = [chunk.page_content for chunk in chunks]
        embeddings = self.model.encode(texts, show_progress_bar=False)
        for chunk, emb in zip(chunks, embeddings):
            payload = {'text': chunk.page_content, **chunk.metadata}
            yield PointStruct(id=str(uuid.uuid4()), vector=emb.tolist(), payload=payload)

    def generate_document_chunks(self, parsed_data: Dict, metadata: Dict,
                                 embed_batch_size: int = 1024):
        """Lazily yield embedded points for every non-empty Item of one filing.

        Args:
            parsed_data: Mapping of part -> {item -> text}.
            metadata: Filing metadata; must provide 'ticker', 'company_name',
                'filing_date' and 'filing_url'.
            embed_batch_size: Number of chunks embedded per model call.

        Yields:
            PointStruct objects ready for upload. Nothing is yielded when
            every item is empty.
        """
        all_docs = []
        for part, items in parsed_data.items():
            for item, content in items.items():
                if not content:
                    continue
                doc_metadata = {
                    'ticker': metadata['ticker'],
                    'company_name': metadata['company_name'],
                    'filing_date': metadata['filing_date'],
                    'filing_url': metadata['filing_url'],
                    'part': part,
                    'item': item
                }
                all_docs.append(Document(page_content=content, metadata=doc_metadata))
        if not all_docs:
            return
        print(f"     → Splitting {len(all_docs)} high-level 'Items' into smaller chunks...")
        chunked_docs = self.text_splitter.split_documents(all_docs)
        print(f"     → Generated {len(chunked_docs)} chunks")

        # One code path for full and trailing batches. A non-positive batch
        # size degrades to one chunk per call instead of an invalid range step.
        step = max(1, embed_batch_size)
        for start in range(0, len(chunked_docs), step):
            yield from self._embed_batch(chunked_docs[start:start + step])

In [21]:
# ===========================================================================
# PART 4: QDRANT VECTOR DATABASE
# ===========================================================================
# (From base_RAG.ipynb)

class QdrantManager:
    """Wrapper around a Qdrant client: collection setup, batched upload, and search."""

    def __init__(self):
        """Connect to Qdrant using the URL and API key from Config."""
        print(f"\nConnecting to Qdrant Cloud...")
        self.client = QdrantClient(url=Config.QDRANT_URL, api_key=Config.QDRANT_API_KEY)
        print(f"   ✓ Connected to Qdrant")

    def create_collection(self, collection_name: str = Config.COLLECTION_NAME,
                         vector_size: int = Config.VECTOR_SIZE):
        """Create the collection and its payload indexes unless it already exists.

        An existing collection is left untouched (no indexes are added to it).
        """
        print(f"\n Setting up collection: {collection_name}")
        collections = self.client.get_collections().collections
        exists = any(col.name == collection_name for col in collections)
        if exists:
            # If it exists, we assume it's already indexed and skip recreation
            print(f"   ✓ Collection '{collection_name}' already exists. Skipping creation.")
            return

        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
        )
        print(f"   ✓ Collection created")
        # Keyword indexes on the two payload fields this class indexes.
        print(f"   → Creating payload index for 'ticker'...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="ticker",
            field_schema=models.PayloadSchemaType.KEYWORD
        )
        print(f"   → Creating payload index for 'item'...")
        self.client.create_payload_index(
            collection_name=collection_name,
            field_name="item",
            field_schema=models.PayloadSchemaType.KEYWORD
        )
        print(f"   ✓ Payload indexes created.")

    def upsert_documents(self, points_generator,
                        collection_name: str = Config.COLLECTION_NAME,
                        batch_size: int = 2048) -> int:
        """Upload points from an iterable in batches and return how many were sent.

        Upserts are issued with wait=False, so the returned count is the number
        of points submitted, not a confirmation that they are searchable yet.
        """
        print(f" Uploading chunks to Qdrant in batches of {batch_size}...")
        batch, count = [], 0
        for point in points_generator:
            batch.append(point)
            if len(batch) >= batch_size:
                self.client.upsert(collection_name=collection_name, points=batch, wait=False)
                count += len(batch)
                print(f"     → Uploaded {count} chunks so far...")
                batch = []
        if batch:
            # Trailing partial batch.
            self.client.upsert(collection_name=collection_name, points=batch, wait=False)
            count += len(batch)
        print(f"  ✓ All chunks uploaded for this document. Total: {count}")
        return count

    @staticmethod
    def _build_filter(filter_dict: Optional[Dict]) -> "Optional[models.Filter]":
        """Translate {field: value-or-list} into a Qdrant `must` filter (None if empty)."""
        if not filter_dict:
            return None
        must_conditions = []
        for key, value in filter_dict.items():
            if isinstance(value, list):
                # A list means "match any of these values" (e.g. several tickers).
                match = models.MatchAny(any=value)
            else:
                match = models.MatchValue(value=value)
            must_conditions.append(models.FieldCondition(key=key, match=match))
        return models.Filter(must=must_conditions)

    def search(self, query_vector: List[float],
              collection_name: str = Config.COLLECTION_NAME,
              limit: int = Config.TOP_K,
              filter_dict: Optional[Dict] = None) -> List[Dict]:
        """Run a vector search and return hits as {'score', 'payload'} dicts.

        Args:
            query_vector: Query embedding.
            collection_name: Collection to search.
            limit: Maximum number of hits.
            filter_dict: Optional payload filter; every key must match, and a
                list value matches any of its members.
        """
        qdrant_filter = self._build_filter(filter_dict)

        # `search` is the legacy entry point of qdrant-client; use the
        # `query_points` API when the installed client offers it, and fall
        # back to `search` on clients that do not.
        if hasattr(self.client, "query_points"):
            hits = self.client.query_points(
                collection_name=collection_name,
                query=query_vector,
                limit=limit,
                query_filter=qdrant_filter,
                with_payload=True
            ).points
        else:
            hits = self.client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=limit,
                query_filter=qdrant_filter,
                with_payload=True
            )
        return [{'score': hit.score, 'payload': hit.payload} for hit in hits]

In [22]:
# ===========================================================================
# PART 5: QUERY PROCESSOR MODULE
# ===========================================================================
# (Full QueryProcessor logic from query_processors.ipynb)

# Feature flag: when True, a SentenceTransformer is loaded and used for query embeddings.
USE_SBERT = True
# Feature flag: when True, Flan-T5 is loaded and used to generate query paraphrases.
USE_FLAN_T5 = True

def normalize(text: str) -> str:
    """Return `text` in NFKC form, trimmed, with each whitespace run collapsed to one space."""
    canonical = unicodedata.normalize("NFKC", text)
    trimmed = canonical.strip()
    return re.sub(r"\s+", " ", trimmed)

# A token is either an alphanumeric word (optionally with one internal
# apostrophe part, e.g. "don't") or a run of the symbols & $ % . -
TOKEN_RE = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?|[&$%.\-]+")
def simple_tokenize(text: str) -> List[str]:
    """Split text into word/symbol tokens and drop possessive "'s" suffixes.

    Handles the possessive in any letter case ("TESLA'S") and with a
    typographic apostrophe ("Tesla’s"), both of which previously slipped
    through and left either the suffix or a stray "s" token behind.

    Args:
        text: Input string (any case; original casing of tokens is kept).

    Returns:
        Tokens in order of appearance; empty list for empty input.
    """
    # Map the curly apostrophe to ASCII so TOKEN_RE keeps "Tesla’s" in one token.
    text = text.replace("\u2019", "'")
    cleaned = []
    for tok in TOKEN_RE.findall(text):
        # TOKEN_RE tokens never start with an apostrophe, so a bare "'s" token
        # cannot occur; only the suffix case needs handling.
        if tok[-2:].lower() == "'s":
            tok = tok[:-2]
        cleaned.append(tok)
    return cleaned

# Single-token trigger (lowercase) -> related phrases added during query expansion.
# NOTE(review): the "md&a" key looks unreachable from simple_tokenize output,
# since TOKEN_RE splits "MD&A" into "MD", "&", "A" — confirm intended usage.
DOMAIN_SYNONYMS = {
    "risk": ["risk factor","risk factors","uncertainty","exposure","threat"],
    "cyber": ["cybersecurity","information security","infosec","data breach","security incident"],
    "performance": ["revenue","growth","margin","profit","loss","guidance","results"],
    "strategy": ["roadmap","plan","initiative","expansion","capex","restructuring","acquisition"],
    "md&a": ["management discussion","md&a","results of operations"],
}

# Lowercase company name / alias -> ticker symbol.
COMPANY_TICKERS = {
    "tesla": "TSLA",
    "apple": "AAPL",
    "microsoft": "MSFT",
    "nvidia": "NVDA",
    "google": "GOOGL",
    "alphabet": "GOOGL",
    "meta": "META",
    "amazon": "AMZN",
    "amd": "AMD",
    "oracle": "ORCL",
    "jpmorgan": "JPM",
    "jpm": "JPM",
}

def keyword_expand(tokens: List[str]) -> List[str]:
    """Collect the domain synonyms triggered by `tokens`, de-duplicated in first-seen order.

    Each token is stripped of leading/trailing '.' and '-' and lower-cased
    before being looked up in DOMAIN_SYNONYMS.
    """
    triggered = (
        synonym
        for token in tokens
        for synonym in DOMAIN_SYNONYMS.get(token.strip(".-").lower(), [])
    )
    # dict.fromkeys keeps the first occurrence of each phrase, in order.
    return list(dict.fromkeys(triggered))

def build_keywords(tokens: List[str], expansions: List[str]) -> List[str]:
    """Merge tokens and expansions into a lower-cased, de-duplicated keyword list.

    Entries with no letter or digit (pure punctuation tokens) are dropped;
    first-seen order is preserved.
    """
    lowered = (term.lower() for term in tokens + expansions)
    meaningful = (term for term in lowered if re.search(r"[a-z0-9]", term))
    return list(dict.fromkeys(meaningful))

# Load the spaCy pipeline used for ORG entity detection. On failure `_nlp`
# is set to None and entity extraction relies on its regex/alias rules only.
try:
    _nlp = spacy.load(Config.SPACY_MODEL)
    print("✓ Loaded spaCy model for NER")
except Exception as e:
    print(f"✗ Failed to load spaCy model '{Config.SPACY_MODEL}'. NER will be limited.")
    print(f"  Reason: {e}")
    # Point at the model that is actually configured (the hint previously
    # named a different model than the one being loaded).
    print(f"  Run: python -m spacy download {Config.SPACY_MODEL}")
    _nlp = None

def extract_entities(raw_text: str) -> dict:
    """Extract quarters, years, company names and tickers from a query.

    Returns a dict containing only the keys that were found:
        'quarter': list like ["Q3 2024"], in order of appearance
        'year':    sorted unique 4-digit years (1900-2049)
        'company': sorted names from spaCy ORG entities (when the model is
                   loaded) plus title-cased COMPANY_TICKERS aliases
        'ticker':  sorted tickers derived from the companies and from
                   "$XXX", "(XXX)" and "NASDAQ:/NYSE: XXX" patterns
    """
    out = {}
    low = raw_text.lower()

    quarters = re.findall(r"\b(q[1-4])\s*([12][0-9]{3})\b", low)
    if quarters:
        out["quarter"] = [f"{p.upper()} {y}" for p, y in quarters]

    years = re.findall(r"\b(20[0-4][0-9]|19[0-9]{2})\b", raw_text)
    if years:
        out["year"] = sorted(set(years))

    companies = set()
    if _nlp is not None:
        for ent in _nlp(raw_text).ents:
            if ent.label_ == "ORG":
                companies.add(ent.text.strip())
    for name in COMPANY_TICKERS:
        # Whole-word match: a plain substring test turned "metadata" into
        # Meta and "pineapple" into Apple, which then became ticker filters.
        if re.search(rf"\b{re.escape(name)}\b", low):
            companies.add(name.title())
    if companies:
        out["company"] = sorted(companies)

    tickers = {
        COMPANY_TICKERS[c.lower()]
        for c in companies
        if COMPANY_TICKERS.get(c.lower())
    }
    tickers.update(re.findall(r"\$([A-Z]{1,5})\b", raw_text))
    tickers.update(re.findall(r"\(([A-Z]{1,5})\)", raw_text))
    tickers.update(re.findall(r"\b(?:NASDAQ|NYSE)\s*:\s*([A-Z]{1,5})\b", raw_text))
    tickers = {t for t in tickers if t}
    if tickers:
        out["ticker"] = sorted(tickers)
    return out

# Load the query-embedding model (skipped when USE_SBERT is False).
# On failure `_sbert` is None and sbert_embed() returns None.
try:
    _sbert = SentenceTransformer(Config.EMBEDDING_MODEL) if USE_SBERT else None
    print(f"✓ Loaded SentenceTransformer model '{Config.EMBEDDING_MODEL}'")
except Exception as e:
    _sbert = None
    # Report the cause, matching the Flan-T5 loader; it used to be discarded.
    print(f"✗ Failed to load SentenceTransformer: {e}")

def sbert_embed(text: str) -> Optional[List[float]]:
    """Return the normalized embedding of `text` as a list of floats.

    Returns None when no embedding model is loaded.
    """
    if _sbert is None:
        return None
    vector = _sbert.encode([text], normalize_embeddings=True)[0]
    return vector.tolist()

# Load the Flan-T5 tokenizer and model used for paraphrasing. When the
# feature flag is off, nothing is loaded and the device stays "cpu".
# Any failure leaves tokenizer and model as None.
try:
    if USE_FLAN_T5:
        _flan_device = "cuda" if torch.cuda.is_available() else "cpu"
        _flan_tok = AutoTokenizer.from_pretrained(Config.FLAN_T5_MODEL_NAME)
        _flan_mdl = AutoModelForSeq2SeqLM.from_pretrained(Config.FLAN_T5_MODEL_NAME).to(_flan_device).eval()
    else:
        _flan_device = "cpu"
        _flan_tok = None
        _flan_mdl = None
    print(f"✓ Loaded Flan-T5 model '{Config.FLAN_T5_MODEL_NAME}' on {_flan_device}")
except Exception as e:
    _flan_tok = None
    _flan_mdl = None
    _flan_device = "cpu"
    print(f"✗ Failed to load Flan-T5 model: {e}")

def t5_paraphrases_safe(q: str, num_return: int = 3, max_new_tokens: int = 48) -> List[str]:
    """Sample paraphrases of `q` with Flan-T5.

    Returns [] when paraphrasing is disabled or the model is not loaded.
    Candidates are normalized; empty ones, exact repeats, and ones equal to
    the query (ignoring case and punctuation) are dropped. At most
    `num_return` paraphrases are returned. Generation uses sampling, so
    results vary between calls.
    """
    if not USE_FLAN_T5 or _flan_tok is None or _flan_mdl is None:
        return []

    prompt = f"Rewrite the query into multiple short paraphrases without adding facts or numbers. Keep meaning.\nQuery: {q}"
    encoded = _flan_tok(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    model_inputs = {name: tensor.to(_flan_device) for name, tensor in encoded.items()}

    sampling_args = dict(
        do_sample=True,
        top_k=50,
        top_p=0.92,
        temperature=0.9,
        num_return_sequences=num_return,
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.1,
        no_repeat_ngram_size=3,
    )
    with torch.no_grad():
        generated = _flan_mdl.generate(**model_inputs, **sampling_args)
    candidates = _flan_tok.batch_decode(generated, skip_special_tokens=True)

    def _comparable(s: str) -> str:
        # Punctuation- and case-insensitive form used to detect echoes of the query.
        return re.sub(r"\W+"," ", s).strip().lower()

    query_key = _comparable(q)
    unique = {}
    for candidate in candidates:
        cleaned = normalize(candidate)
        if not cleaned or _comparable(cleaned) == query_key:
            continue
        unique.setdefault(cleaned, None)  # dict keeps first-seen order
    return list(unique)[:num_return]

# Intent labels, in the order used for the keyword priors below.
INTENT_LABELS = ["risk","performance","strategy"]
# Tiny seed set for the TF-IDF + logistic-regression intent classifier.
X_train = ["What new risk factors?", "Cybersecurity breach Tesla", "Explain Apple revenue growth", "Compare Microsoft profit guidance", "Outline Nvidia expansion strategy", "What restructuring plan?"]
y_train = ["risk","risk","performance","performance","strategy","strategy"]
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the pinned version before upgrading.
_intent_clf = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=1)), ("lr", LogisticRegression(max_iter=300, class_weight="balanced", multi_class="ovr"))]).fit(X_train, y_train)
# Lowercase keyword sets used as per-intent priors.
RISK_KW = {"risk","risk factor","risk factors","uncertainty","cyber","cybersecurity","breach","litigation","security"}
PERF_KW = {"revenue","growth","margin","profit","loss","guidance","results","compare","last quarter","quarterly"}
STRAT_KW= {"strategy","plan","roadmap","expansion","acquisition","restructuring","capex","data center","data centers"}

def _kw_score(t: str, kws: set[str]) -> int:
    """Count how many keywords from `kws` occur as substrings of `t`."""
    return sum(1 for k in kws if k in t)

def classify_intent(text: str) -> Tuple[str, float]:
    """Classify a query as one of INTENT_LABELS.

    Blends the classifier probability (weight 0.6) with a keyword-count prior
    (weight 0.4), renormalizes, and returns (label, blended score).
    """
    tx = normalize(text)

    # predict_proba columns follow the classifier's `classes_` order, which is
    # not the INTENT_LABELS order; realign by label so each probability is
    # blended with its own prior and reported under its own name.
    raw_proba = _intent_clf.predict_proba([tx])[0].tolist()
    proba_by_label = {str(lbl): p for lbl, p in zip(_intent_clf.classes_, raw_proba)}
    proba = [proba_by_label.get(lbl, 0.0) for lbl in INTENT_LABELS]

    # The keyword sets are lowercase, so score against lowercased text.
    tx_low = tx.lower()
    k_r = _kw_score(tx_low, RISK_KW)
    k_p = _kw_score(tx_low, PERF_KW)
    k_s = _kw_score(tx_low, STRAT_KW)
    k_sum = max(1, (k_r + k_p + k_s))  # avoid dividing by zero when nothing matches
    priors = [k_r/k_sum, k_p/k_sum, k_s/k_sum]

    alpha, beta = 0.6, 0.4
    n_labels = len(INTENT_LABELS)
    blended = [alpha*proba[i] + beta*priors[i] for i in range(n_labels)]
    s = sum(blended) or 1.0
    blended = [b/s for b in blended]
    idx = max(range(n_labels), key=lambda i: blended[i])
    return INTENT_LABELS[idx], float(blended[idx])

def expand_query(query: str) -> dict:
    """Normalize, tokenize and expand a query.

    Returns a dict with keys 'normalized', 'tokens', 'expansions'
    (synonyms triggered by the query and by its paraphrases, de-duplicated
    in first-seen order), 'paraphrases' and 'keywords'.
    """
    norm = normalize(query)
    toks = simple_tokenize(norm)
    lexical_expansions = keyword_expand(toks)

    if USE_FLAN_T5:
        paras = t5_paraphrases_safe(norm, num_return=3, max_new_tokens=48)
    else:
        paras = []

    # Unique tokens across all paraphrases, in first-seen order.
    para_tokens = list(dict.fromkeys(
        tok for para in paras for tok in simple_tokenize(para)
    ))
    paraphrase_expansions = keyword_expand(para_tokens) if para_tokens else []

    expansions = list(dict.fromkeys(lexical_expansions + paraphrase_expansions))
    return {
        "normalized": norm,
        "tokens": toks,
        "expansions": expansions,
        "paraphrases": paras,
        "keywords": build_keywords(toks, expansions),
    }

@dataclass
class QueryProcessorConfig:
    """Settings for QueryProcessor."""
    # Intent labels; each config instance gets its own list.
    labels: List[str] = field(default_factory=lambda: ["risk","performance","strategy"])

class QueryProcessor:
    """Runs expansion, entity extraction, intent classification and embedding on a query."""

    def __init__(self, config: Optional[QueryProcessorConfig] = None):
        """Store the config, creating a fresh default when none is given.

        A `QueryProcessorConfig()` default in the signature would be built
        once at definition time and shared (with its mutable `labels` list)
        by every processor created without an explicit config.
        """
        self.config = config if config is not None else QueryProcessorConfig()
        print("✓ QueryProcessor initialized")

    def process(self, query: str) -> Dict[str, Any]:
        """Process a raw query into everything retrieval needs.

        Returns a dict with keys 'normalized', 'label', 'confidence',
        'expansions', 'paraphrases', 'keywords', 'entities', 'filters'
        (a copy of the entities) and 'embedding' (None when SBERT is off
        or unavailable).
        """
        expanded = expand_query(query)
        # Entities are extracted from the raw text, not the normalized form.
        entities = extract_entities(query)
        label, confidence = classify_intent(expanded["normalized"])
        embedding = sbert_embed(expanded["normalized"]) if USE_SBERT else None
        # Copy list values too, so editing a filter never alters `entities`.
        filters = {key: (list(val) if isinstance(val, list) else val)
                   for key, val in entities.items()}
        return {
            "normalized": expanded["normalized"],
            "label": label,
            "confidence": confidence,
            "expansions": expanded["expansions"],
            "paraphrases": expanded["paraphrases"],
            "keywords": expanded["keywords"],
            "entities": entities,
            "filters": filters,
            "embedding": embedding,
        }

✓ Loaded spaCy model for NER
✓ Loaded SentenceTransformer model 'sentence-transformers/all-MiniLM-L6-v2'
✓ Loaded Flan-T5 model 'google/flan-t5-small' on cpu


In [23]:
# ===========================================================================
# PART 6: RAG QUERY ENGINE (MODIFIED)
# ===========================================================================
# (From base_RAG.ipynb, modified to use QueryProcessor)

class ManualRAGEngine:
    """RAG engine that uses QueryProcessor output for both embedding and ticker filtering.

    Retrieval goes through QdrantManager.search; the answer comes from a
    ChatOpenAI model prompted with the retrieved chunks.
    """

    def __init__(self, document_processor: DocumentProcessor,
                 qdrant_manager: QdrantManager,
                 query_processor: QueryProcessor):
        """Wire up the shared components, the LLM client and the prompt template."""
        print("\n Initializing Manual RAG Engine (with Query Processor)...")

        # 1. Get components
        self.embedding_model = document_processor.model  # fallback embedder
        print("   ✓ Using existing embedding model from DocumentProcessor")
        self.qdrant_manager = qdrant_manager
        print("   ✓ Using existing QdrantManager for search")
        self.query_processor = query_processor
        print("   ✓ QueryProcessor attached")

        # 2. Initialize LLM
        self.llm = ChatOpenAI(model=Config.LLM_MODEL, api_key=Config.OPENAI_API_KEY, temperature=0)
        print("   ✓ Initialized ChatOpenAI LLM")

        # 3. Create prompt template
        template = """You are a helpful financial analyst assistant. Your role is to answer questions about SEC 10-Q filings based ONLY on the provided context.
- Base your answer strictly on the provided context from SEC filings
- Cite specific sections (e.g., "According to Item 1A...") when referencing information
- If the answer is not in the context, clearly state that

Context:
<context>
{context}
</context>

Question: {input}

Answer:"""
        self.prompt = ChatPromptTemplate.from_template(template)
        print("   ✓ Manual RAG Engine ready.")

    def _format_context(self, search_results: List[Dict]) -> str:
        """Render search hits as numbered, quoted source blocks for the prompt."""
        blocks = []
        for i, result in enumerate(search_results, 1):
            payload = result.get('payload', {})
            text = payload.get('text', 'No text found')
            item = payload.get('item', 'N/A')
            ticker = payload.get('ticker', 'N/A')
            blocks.append(f"Source {i} ({ticker} - {item}):\n\"{text}\"\n\n")
        return "".join(blocks).strip()

    def query(self, question: str, ticker_filter: Optional[str] = None):
        """Answer a question from the indexed filings.

        Args:
            question: Natural-language question.
            ticker_filter: Optional ticker that replaces any auto-detected filter.

        Returns:
            {'answer': str, 'sources': list of dicts with 'ticker', 'company',
            'item', 'part', 'filing_date', 'score'}. When the search returns
            nothing, a fixed message and an empty source list.
        """
        print(f"\n Processing query with Manual Engine: '{question}'")

        # 1. Run the query processor (NER, intent, embedding)
        print("   → Processing query (NER, Intent, Embedding)...")
        q_obj = self.query_processor.process(question)

        query_vector = q_obj["embedding"]
        if query_vector is None:
            # The processor returns None when its embedder is off or failed to
            # load; fall back to the base model (as PreRetrievalRAGEngine does)
            # rather than passing None to the vector search.
            print("   → Warning: QP embedding failed, using base model embedding.")
            query_vector = self.embedding_model.encode(question).tolist()

        filter_dict = {}

        # Use filters generated by the processor
        if "ticker" in q_obj["filters"]:
            filter_dict["ticker"] = q_obj["filters"]["ticker"]
            print(f"   → Auto-detected filter: {filter_dict}")

        # Manual override replaces whatever was auto-detected
        if ticker_filter:
            print(f"   → MANUAL OVERRIDE: Applying filter: {ticker_filter}")
            filter_dict = {"ticker": ticker_filter.upper()}

        if not filter_dict:
            print("   → No filters applied. Searching all documents.")

        # 2. Manually search Qdrant
        print("   → Manually searching Qdrant...")
        search_results = self.qdrant_manager.search(
            query_vector=query_vector,
            limit=Config.TOP_K,
            filter_dict=filter_dict if filter_dict else None
        )

        if not search_results:
            return {'answer': 'No relevant context was found in the documents to answer this question.', 'sources': []}

        # 3. Format prompt
        print("   → Formatting context and building prompt...")
        formatted_context = self._format_context(search_results)
        final_prompt_message = self.prompt.format_messages(
            context=formatted_context,
            input=question
        )

        # 4. Invoke LLM
        print("   → Sending prompt to LLM...")
        llm_response = self.llm.invoke(final_prompt_message)
        answer = llm_response.content

        # 5. Format sources
        sources = [
            {
                'ticker': result['payload'].get('ticker'),
                'company': result['payload'].get('company_name'),
                'item': result['payload'].get('item'),
                'part': result['payload'].get('part'),
                'filing_date': result['payload'].get('filing_date'),
                'score': result['score']
            }
            for result in search_results
        ]

        return {'answer': answer, 'sources': sources}

In [24]:
# ===========================================================================
# PART 6: RAG QUERY ENGINE (MODEL 4)
# ===========================================================================
# This engine is modified to *only* use Pre-Retrieval.
# It uses the QueryProcessor for its embedding, but *not* for filtering.

class PreRetrievalRAGEngine:
    """RAG engine that uses QueryProcessor only for the query embedding.

    Filters detected by the processor are ignored; only an explicit
    `ticker_filter` restricts the search.
    """

    def __init__(self, document_processor: DocumentProcessor,
                 qdrant_manager: QdrantManager,
                 query_processor: QueryProcessor):
        """Attach shared components, create the LLM client and the prompt."""
        print("\n Initializing Pre-Retrieval RAG Engine (Model 4)...")

        self.embedding_model = document_processor.model
        print("   ✓ Using existing embedding model (for fallback)")

        self.qdrant_manager = qdrant_manager
        print("   ✓ Using existing QdrantManager for search")

        self.query_processor = query_processor
        print("   ✓ QueryProcessor attached (for embedding generation)")

        self.llm = ChatOpenAI(
            model=Config.LLM_MODEL,
            api_key=Config.OPENAI_API_KEY,
            temperature=0,
        )
        print("   ✓ Initialized ChatOpenAI LLM")

        template = """You are a helpful financial analyst assistant...
Context:
<context>
{context}
</context>
Question: {input}
Answer:"""
        # The literal "..." is removed from the template before use.
        prompt_text = template.replace("...","").strip()
        self.prompt = ChatPromptTemplate.from_template(prompt_text)
        print("   ✓ Pre-Retrieval RAG Engine ready.")

    def _format_context(self, search_results: List[Dict]) -> str:
        """Render search hits as numbered, quoted source blocks for the prompt."""
        blocks = []
        for position, hit in enumerate(search_results, 1):
            payload = hit.get('payload', {})
            ticker = payload.get('ticker','N/A')
            item = payload.get('item','N/A')
            text = payload.get('text','No text')
            blocks.append(f"Source {position} ({ticker} - {item}):\n\"{text}\"\n\n")
        return "".join(blocks).strip()

    def query(self, question: str, ticker_filter: str = None):
        """Answer a question using the processed embedding and no automatic filters.

        Returns {'answer': str, 'sources': list of source dicts}; when the
        search returns nothing, a fixed message and an empty source list.
        """
        print(f"\n Processing query with Pre-Retrieval Engine: '{question}'")

        # Step 1: pre-retrieval processing; only the embedding is used here.
        print("   → Processing query (Normalization, Expansion, Embedding)...")
        processed = self.query_processor.process(question)

        query_vector = processed["embedding"]
        if query_vector is None:
            print("   → Warning: QP embedding failed, using base model embedding.")
            query_vector = self.embedding_model.encode(question).tolist()

        # Step 2: processor-detected filters are deliberately not applied.
        if ticker_filter:
            print(f"   → MANUAL OVERRIDE: Applying filter: {ticker_filter}")
            filter_dict = {"ticker": ticker_filter.upper()}
        else:
            print("   → No filters applied (NER filtering is OFF for this model).")
            filter_dict = None

        # Step 3: vector search.
        print("   → Manually searching Qdrant...")
        search_results = self.qdrant_manager.search(
            query_vector=query_vector,
            limit=Config.TOP_K,
            filter_dict=filter_dict
        )
        if not search_results:
            return {'answer': 'No relevant context found.', 'sources': []}

        # Step 4: build the prompt from the retrieved chunks.
        print("   → Formatting context and building prompt...")
        formatted_context = self._format_context(search_results)
        final_prompt_message = self.prompt.format_messages(
            context=formatted_context,
            input=question,
        )

        # Step 5: ask the LLM.
        print("   → Sending prompt to LLM...")
        answer = self.llm.invoke(final_prompt_message).content

        # Step 6: summarize where the context came from.
        sources = []
        for hit in search_results:
            payload = hit['payload']
            sources.append({
                'ticker': payload.get('ticker'),
                'company': payload.get('company_name'),
                'item': payload.get('item'),
                'part': payload.get('part'),
                'filing_date': payload.get('filing_date'),
                'score': hit['score'],
            })
        return {'answer': answer, 'sources': sources}

In [25]:
# ===========================================================================
# PART 7: MAIN PIPELINE ORCHESTRATOR
# ===========================================================================
class SECFilingRAGPipeline:
    """Orchestrates the Model 4 (Pre-Retrieval) SEC 10-Q RAG workflow.

    Owns one instance each of ``SECDocumentLoader``, ``DocumentProcessor``,
    ``QdrantManager`` and ``QueryProcessor``. The ``PreRetrievalRAGEngine``
    is built lazily on the first call to :meth:`query`.
    """

    # Pause (in seconds) inserted between consecutive tickers while indexing.
    _TICKER_PAUSE_SECONDS = 0.2

    def __init__(self):
        """Print the startup banner and construct the pipeline components."""
        print("=" * 70)
        print("SEC 10-Q RAG SYSTEM (Model 4: Pre-Retrieval)")
        print("=" * 70)
        self.loader = SECDocumentLoader()
        self.processor = DocumentProcessor()
        self.qdrant_manager = QdrantManager()
        self.query_processor = QueryProcessor()
        # Created on first use in query().
        self.query_engine = None

    def load_and_index_filings(self, tickers: List[str] = Config.TICKERS, num_filings_per_ticker: int = 4):
        """Download, parse, chunk and upsert recent 10-Q filings for each ticker.

        Indexing is skipped entirely when the target collection already
        contains documents. Errors are reported per filing and per ticker and
        do not abort the run. A ticker counts as successful when at least one
        chunk was uploaded for it.

        Args:
            tickers: Ticker symbols to process (defaults to ``Config.TICKERS``).
            num_filings_per_ticker: Number of recent 10-Q filings requested
                per ticker.

        Returns:
            None. Progress and a final summary are printed.
        """
        rule = "=" * 70
        print(f"\n{rule}\nLOADING & INDEXING PHASE\n{rule}")
        print(f"\nProcessing {len(tickers)} companies: {', '.join(tickers)}")
        print(f"(Fetching {num_filings_per_ticker} filings per company)\n")

        self.qdrant_manager.create_collection()  # Will skip if collection exists

        # If the collection already holds documents there is nothing to do.
        # A failed count is reported (not silently ignored) and indexing
        # proceeds, matching the previous best-effort behaviour.
        try:
            existing_docs = self.qdrant_manager.client.count(Config.COLLECTION_NAME, exact=True).count
        except Exception as e:
            print(f"  ⚠ Could not count documents in '{Config.COLLECTION_NAME}': {e}. Proceeding with indexing.")
            existing_docs = 0
        if existing_docs > 0:
            print(f"Collection '{Config.COLLECTION_NAME}' already has {existing_docs} docs. Skipping indexing.")
            print(f"{rule}\n")
            return

        successful_tickers, failed_tickers = [], []
        total_chunks_indexed = 0
        for idx, ticker in enumerate(tickers, 1):
            print(f"\n[{idx}/{len(tickers)}] Processing {ticker}")
            print("-" * 70)
            ticker_chunks_count = 0
            try:
                filings_metadata_list = self.loader.get_recent_10q_metadata(ticker, num_filings=num_filings_per_ticker)
                for filing_metadata in filings_metadata_list:
                    # Each filing is isolated so one bad document does not
                    # prevent the remaining filings from being indexed.
                    try:
                        print(f"  → Downloading filing from: {filing_metadata['filing_date']}...")
                        html_content = self.loader.get_filing_html(filing_metadata['filing_url'])
                        print("  → Parsing 10-Q structure...")
                        parsed_data = self.loader.parse_10q(html_content)
                        if not parsed_data:
                            print(f"  ⚠ Warning: No data parsed for {ticker}")
                            del html_content
                            gc.collect()
                            continue
                        print("  → Creating chunks and embeddings generator...")
                        chunks_generator = self.processor.generate_document_chunks(parsed_data, filing_metadata)
                        num_uploaded = self.qdrant_manager.upsert_documents(chunks_generator)
                        if num_uploaded > 0:
                            ticker_chunks_count += num_uploaded
                            total_chunks_indexed += num_uploaded
                        # Drop the large per-filing objects before the next download.
                        print("  → Cleaning up memory...")
                        del html_content
                        del parsed_data
                        del chunks_generator
                        gc.collect()
                        print("  ✓ Memory cleaned.")
                    except Exception as e:
                        print(f"  ✗ Error processing filing for {ticker}: {e}")
                        gc.collect()
                if ticker_chunks_count > 0:
                    successful_tickers.append(ticker)
                    print(f"  ✓ Finished {ticker}. Chunks: {ticker_chunks_count}")
                else:
                    failed_tickers.append(ticker)
                    print(f"  ⚠ No chunks created for {ticker}")
            except Exception as e:
                print(f"  ✗ Error processing {ticker}: {e}")
                failed_tickers.append(ticker)
            # No pause is needed after the last ticker.
            if idx < len(tickers):
                time.sleep(self._TICKER_PAUSE_SECONDS)

        print(f"\n{rule}\nINDEXING COMPLETE\n{rule}")
        print(f"✓ Success: {len(successful_tickers)} companies ({', '.join(successful_tickers)})")
        if failed_tickers:
            print(f"✗ Failed: {len(failed_tickers)} companies ({', '.join(failed_tickers)})")
        print(f"\n Total chunks indexed: {total_chunks_indexed}\n{rule}\n")

    def query(self, question: str, ticker_filter: Optional[str] = None):
        """Answer a question with the pre-retrieval engine and print the result.

        Args:
            question: Natural-language question forwarded to the engine.
            ticker_filter: Optional ticker forwarded to the engine as a manual
                filter; ``None`` means no manual filter.

        Returns:
            The dict returned by the engine; its ``'answer'`` and ``'sources'``
            entries are printed before it is returned.
        """
        if self.query_engine is None:
            self.query_engine = PreRetrievalRAGEngine(
                document_processor=self.processor,
                qdrant_manager=self.qdrant_manager,
                query_processor=self.query_processor
            )
        result = self.query_engine.query(question, ticker_filter)

        rule = "=" * 70
        print(f"\n{rule}\nANSWER\n{rule}\n\n{result['answer']}\n")
        print(f"{rule}\nSOURCES ({len(result['sources'])})\n{rule}")
        for i, source in enumerate(result['sources'], 1):
            print(f"\n{i}. {source['company']} ({source['ticker']}) - {source['item']}")
            print(f"   Filing Date: {source['filing_date']}")
            print(f"   Relevance Score: {source['score']:.4f}")
        print(f"\n{rule}\n")
        return result

In [26]:
# ===========================================================================
# PART 8: USAGE EXAMPLE (Saving all results to one file)
# ===========================================================================

if __name__ == "__main__":
    # Step 1 - build the pipeline and its components.
    pipeline = SECFilingRAGPipeline()

    # Step 2 - index filings only when required. The call checks whether the
    # 'sec_filings_10q' collection already holds data; if base_RAG.ipynb was
    # run beforehand, re-indexing is skipped.
    print("Checking if indexing is needed...")
    pipeline.load_and_index_filings(num_filings_per_ticker=4)
    print("Indexing check complete. Proceeding to queries.")

SEC 10-Q RAG SYSTEM (Model 4: Pre-Retrieval)

 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


   ✓ Model loaded (dimension: 384)
   ✓ Initialized RecursiveCharacterTextSplitter

Connecting to Qdrant Cloud...
   ✓ Connected to Qdrant
✓ QueryProcessor initialized
Checking if indexing is needed...

LOADING & INDEXING PHASE

Processing 10 companies: NVDA, AAPL, MSFT, AMZN, META, GOOGL, TSLA, ORCL, JPM, AMD
(Fetching 4 filings per company)


 Setting up collection: sec_filings_10q
   ✓ Collection 'sec_filings_10q' already exists. Skipping creation.
Collection 'sec_filings_10q' already has 9109 docs. Skipping indexing.

Indexing check complete. Proceeding to queries.


In [28]:
# Folder that receives the combined results file.
DATA_FOLDER = "data"
if not os.path.exists(DATA_FOLDER):
    # exist_ok=True keeps this safe if the folder appears between the check
    # above and the creation call.
    os.makedirs(DATA_FOLDER, exist_ok=True)
    print(f"Created data folder: {DATA_FOLDER}")

# Questions to run through the pipeline. "ticker_filter" is the optional manual
# ticker override passed to pipeline.query(); None means no manual filter.
queries_to_run = [
    {"question": "What are the main risk factors mentioned by each companies?", "ticker_filter": None},
    {"question": "What risks did Apple disclose in their latest 10-Q?", "ticker_filter": None},
    {"question": "Compare the revenue trends of NVIDIA and AMD", "ticker_filter": None},
    {"question": "What was Tesla's R&D spending in the latest quarter?", "ticker_filter": None},
    {"question": "How has Microsoft's operating income changed over the last year?", "ticker_filter": None},
    {"question": "What was the gross profit margin for all companies?", "ticker_filter": None},
]

print(f"\nRunning {len(queries_to_run)} queries with (Model 4) Pre-Retrieval RAG...")
all_results = []
for query in queries_to_run:
    print(f"\n--- Running Query: {query['question']} ---")
    result = pipeline.query(question=query["question"], ticker_filter=query["ticker_filter"])
    # Keep the question and the manual filter next to the response so the
    # saved file is self-describing.
    all_results.append({
        "question": query["question"],
        "ticker_filter_manual": query["ticker_filter"],
        "response": result,
    })

# Write every collected result to a single JSON file.
save_path = os.path.join(DATA_FOLDER, "RAG_query_processing_results.json")
print(f"\n--- All queries complete. Saving all results to: {save_path} ---")
try:
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=4)
    print("✓ Successfully saved all answers.")
except Exception as e:
    # Saving is best-effort: report the failure instead of aborting the cell.
    print(f"✗ Failed to save results: {e}")
print("\nPipeline run finished.")


Running 6 queries with (Model 4) Pre-Retrieval RAG...

--- Running Query: What are the main risk factors mentioned by each companies? ---

 Processing query with Pre-Retrieval Engine: 'What are the main risk factors mentioned by each companies?'
   → Processing query (Normalization, Expansion, Embedding)...
   → No filters applied (NER filtering is OFF for this model).
   → Manually searching Qdrant...
   → Formatting context and building prompt...
   → Sending prompt to LLM...

ANSWER

Based on the provided context, the main risk factors mentioned by each company are as follows:

**AMD (Advanced Micro Devices):**
1. The markets in which AMD's products are sold are highly competitive and rapidly evolving.
2. The semiconductor industry is highly cyclical and has experienced severe downturns.
3. Intel Corporation's dominance of the microprocessor market and its aggressive business practices may limit AMD's ability to compete effectively on a level playing field.

**AAPL (Apple Inc.):**
