## API Test

In [3]:
import sys
import subprocess
import time

# Upgrade pip first
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
except subprocess.CalledProcessError as e:
    print("Error upgrading pip:", e)

# Required packages for this code snippet
packages = ["pandas", "openpyxl", "requests"]

errors = []
start_time = time.time()

for pkg in packages:
    try:
        print(f"Installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", pkg])
    except subprocess.CalledProcessError as e:
        errors.append(f"{pkg}: {e}")
        print(f"Failed to install {pkg}. Continuing.")

elapsed_time = time.time() - start_time

print("\nInstallation completed.")
if errors:
    print("Encountered the following errors:")
    for err in errors:
        print(" -", err)
else:
    print("All packages installed successfully.")

print(f"Elapsed time: {elapsed_time:.2f} seconds.")


Installing pandas ...
Installing openpyxl ...
Installing requests ...

Installation completed.
All packages installed successfully.
Elapsed time: 6.53 seconds.


Data collection


In [6]:
import sys
import subprocess
import os
import json
import shutil
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

def install_if_needed(package):
    """Utility function to ensure a package is installed."""
    try:
        __import__(package)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", package])

# Ensure required packages for your environment
for pkg in ["pandas", "openpyxl", "requests", "beautifulsoup4", "tqdm"]:
    install_if_needed(pkg)

# Headers for requests (SEC recommends a User-Agent with your email)
HEADERS = {
    "User-Agent": "RAGFinancialReport/1.0 (your_email@example.com)"
}

# Set TEST_MODE = True to only process one ticker for one year (2023)
TEST_MODE = True

def get_all_tickers_info():
    """
    Fetch the SEC-provided JSON mapping of all known tickers to their CIKs.
    Returns a dict of the form:
      { '0': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'}, ... }
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        print("Error fetching tickers info:", e)
        return {}

def find_cik(ticker, tickers_data):
    """
    Look up the CIK for a given ticker using the data from get_all_tickers_info().
    Returns the CIK (as an integer) if found, else None.
    """
    try:
        for _, info in tickers_data.items():
            if info["ticker"].lower() == ticker.lower():
                return info["cik_str"]
    except Exception as e:
        print("Error finding CIK for", ticker, ":", e)
    return None

def extract_text_from_ixbrl(html_soup):
    """
    Extract text from Inline XBRL <ix:nonNumeric> and <ix:nonFraction> tags.
    Return a concatenated string of all text found.
    """
    text_segments = []

    # Gather text from <ix:nonNumeric> tags
    non_numeric_tags = html_soup.find_all(lambda tag: tag.name and "nonNumeric" in tag.name)
    for tag in non_numeric_tags:
        # Some tags might contain child tags or might be empty
        txt = tag.get_text(strip=True)
        if txt:
            text_segments.append(txt)

    # Gather text from <ix:nonFraction> tags
    non_fraction_tags = html_soup.find_all(lambda tag: tag.name and "nonFraction" in tag.name)
    for tag in non_fraction_tags:
        txt = tag.get_text(strip=True)
        if txt:
            text_segments.append(txt)

    # Join everything with a couple newlines
    return "\n\n".join(text_segments)

def is_ixbrl_document(html_text):
    """
    Heuristic check: If we see <ix: in the HTML text, it's likely an Inline XBRL doc.
    """
    return "<ix:" in html_text or "<ix:" in html_text.lower()

def fetch_10k_text_and_link_from_index(index_url):
    """
    Given the '-index.htm' URL for a particular filing:
      1) Parse the table (tableFile) to find the row where Type == '10-K' or '10-K/A'.
      2) Return the actual 10-K document's URL and download its text (handle iXBRL).
      3) Optionally, gather 'GRAPHIC' documents.
    
    Returns a dict:
    {
       "tenk_link": <URL to the actual 10-K doc>,
       "tenk_text": <the full text extracted>,
       "graphics": [
          {"filename": "...", "graphic_url": "...", "local_path": ""},
          ...
       ]
    }
    """
    result = {
        "tenk_link": None,
        "tenk_text": "",
        "graphics": []
    }
    try:
        # 1) Get the index page
        resp = requests.get(index_url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # 2) Find table rows that contain the docs
        rows = soup.select("table.tableFile tr")
        tenk_link = None

        for row in rows:
            cells = row.find_all("td")
            if len(cells) < 4:
                continue
            # 'Type' is typically in cells[3]
            doc_type = cells[3].get_text(strip=True)
            link_tag = cells[2].find("a")
            if link_tag:
                href = link_tag.get("href", "")
                if href.startswith("/"):
                    href = "https://www.sec.gov" + href

                # Identify the main 10-K document
                if doc_type in ["10-K", "10-K/A"]:
                    tenk_link = href

                # Identify graphics files
                if doc_type == "GRAPHIC":
                    filename = cells[2].get_text(strip=True)
                    result["graphics"].append({
                        "filename": filename,
                        "graphic_url": href,
                        "local_path": ""  # to be filled if downloaded
                    })

        if not tenk_link:
            print(f"Could not find a 10-K doc link on index page: {index_url}")
            return result

        # 3) Download the actual 10-K text
        r2 = requests.get(tenk_link, headers=HEADERS, timeout=20)
        r2.raise_for_status()

        # Check if this is an Inline XBRL doc
        text_html = r2.text
        text_soup = BeautifulSoup(text_html, "html.parser")

        if is_ixbrl_document(text_html):
            # Parse iXBRL tags
            full_text = extract_text_from_ixbrl(text_soup)
        else:
            # Fallback: Just get all text
            full_text = text_soup.get_text(separator="\n")

        result["tenk_link"] = tenk_link
        result["tenk_text"] = full_text

        return result

    except Exception as e:
        print("Error fetching full 10-K text:", e)
        return result

def chunk_text_by_paragraphs(text, max_chars=2000):
    """
    Splits text into chunks no larger than max_chars by paragraphs.
    """
    paragraphs = text.split("\n")
    chunks = []
    current_chunk = []
    current_length = 0

    for para in paragraphs:
        para_len = len(para) + 1  # +1 for newline or spacing
        if current_length + para_len > max_chars:
            # flush current_chunk
            chunks.append("\n".join(current_chunk))
            current_chunk = [para]
            current_length = para_len
        else:
            current_chunk.append(para)
            current_length += para_len

    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

def build_knowledge_graph(chunks):
    """
    Builds a simple knowledge graph from text chunks.
    Each chunk becomes a node and nodes are sequentially connected.
    
    Returns a dict with 'nodes' and 'edges'.
    """
    nodes = []
    edges = []
    for i, chunk in enumerate(chunks):
        node_id = f"chunk_{i+1}"
        nodes.append({
            "id": node_id,
            "type": "text_chunk",
            "content": chunk.strip()
        })
        if i > 0:
            edges.append({
                "source": f"chunk_{i}",
                "target": node_id,
                "relationship": "next"
            })
    return {"nodes": nodes, "edges": edges}

def fetch_10k_filings_for_year(cik, year):
    """
    Uses the SEC submissions JSON endpoint for the given CIK,
    filters for form == '10-K' (or '10-K/A') with a filing date in the given year,
    and fetches the 10-K text and links.
    
    Returns a list of dicts containing:
    {
       "form_type": ...,
       "filing_date": ...,
       "accession_no": ...,
       "doc_index_url": ...,
       "tenk_link": ...,
       "tenk_text": ...,
       "graphics": [...],
    }
    """
    base_url = f"https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json"
    results = []
    try:
        r = requests.get(base_url, headers=HEADERS, timeout=10)
        if r.status_code != 200:
            print(f"Failed to fetch data for CIK {cik} in {year}: status code {r.status_code}")
            return results
        data = r.json()
    except Exception as e:
        print(f"Error fetching data for CIK {cik} in {year}:", e)
        return results

    try:
        recent = data.get("filings", {}).get("recent", {})
        forms = recent.get("form", [])
        dates = recent.get("filingDate", [])
        accessions = recent.get("accessionNumber", [])

        for i in range(len(forms)):
            form_type = forms[i]
            filing_date = dates[i]
            accession_no = accessions[i]

            if form_type in ["10-K", "10-K/A"] and filing_date.startswith(str(year)):
                doc_index_url = (
                    f"https://www.sec.gov/Archives/edgar/data/"
                    f"{int(cik)}/{accession_no.replace('-', '')}/{accession_no}-index.htm"
                )
                tenk_data = fetch_10k_text_and_link_from_index(doc_index_url)
                results.append({
                    "form_type": form_type,
                    "filing_date": filing_date,
                    "accession_no": accession_no,
                    "doc_index_url": doc_index_url,
                    "tenk_link": tenk_data["tenk_link"],
                    "tenk_text": tenk_data["tenk_text"],
                    "graphics": tenk_data["graphics"]
                })
    except Exception as e:
        print(f"Error processing filings for CIK {cik} in {year}:", e)
    return results

def main():
    try:
        # For testing, set a flag to only process one ticker and one year
        if TEST_MODE:
            ticker = "AAPL"  # Example ticker
            year = 2023
            tickers_data = get_all_tickers_info()
            cik = find_cik(ticker, tickers_data)
            if not cik:
                print(f"CIK not found for {ticker}")
                return

            print(f"Processing ticker: {ticker}, CIK: {cik}, Year: {year}")
            filings = fetch_10k_filings_for_year(cik, year)
            if not filings:
                print(f"No filings found for {ticker} in {year}.")
                return

            # Process only the first filing in the list
            filing = filings[0]
            # Chunk the text
            chunks = chunk_text_by_paragraphs(filing["tenk_text"], max_chars=3000)
            # Build a simple knowledge graph
            kg = build_knowledge_graph(chunks)

            # Structure the final JSON
            structured_data = {
                "company": {
                    "ticker": ticker,
                    "cik": cik
                },
                "filing": {
                    "year": year,
                    "form_type": filing["form_type"],
                    "filing_date": filing["filing_date"],
                    "accession_no": filing["accession_no"],
                    "doc_index_url": filing["doc_index_url"],
                    "tenk_link": filing["tenk_link"],
                    "tenk_text": filing["tenk_text"],
                    "graphics": filing["graphics"],
                    "text_chunks": [
                        {"chunk_id": f"chunk_{i+1}", "content": chunk}
                        for i, chunk in enumerate(chunks)
                    ],
                    "knowledge_graph": kg
                }
            }

            # Save the structured JSON
            output_dir = "sec_10k_data"
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, f"{ticker}_{year}_10K_graph.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(structured_data, f, indent=2)
            print(f"Structured JSON saved to: {output_path}")

        else:
            # Normal mode: iterate multiple tickers/years from an Excel file, etc.
            excel_file = r"C:\path\to\your\Data_companies_list.xlsx"
            df = pd.read_excel(excel_file)
            tickers_data = get_all_tickers_info()
            if not tickers_data:
                print("No tickers data available. Exiting.")
                return
            years = range(2012, 2026)
            output_dir = "sec_10k_data"
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.makedirs(output_dir, exist_ok=True)

            for _, row in df.iterrows():
                ticker = str(row["Symbol"]).strip()
                cik = find_cik(ticker, tickers_data)
                if not cik:
                    print(f"CIK not found for {ticker}")
                    continue
                ticker_dir = os.path.join(output_dir, ticker)
                os.makedirs(ticker_dir, exist_ok=True)

                for y in tqdm(years, desc=f"Fetching 10-K for {ticker}"):
                    filings = fetch_10k_filings_for_year(cik, y)
                    if filings:
                        for filing in filings:
                            chunks = chunk_text_by_paragraphs(filing["tenk_text"], max_chars=3000)
                            kg = build_knowledge_graph(chunks)
                            structured_data = {
                                "company": {"ticker": ticker, "cik": cik},
                                "filing": {
                                    "year": y,
                                    "form_type": filing["form_type"],
                                    "filing_date": filing["filing_date"],
                                    "accession_no": filing["accession_no"],
                                    "doc_index_url": filing["doc_index_url"],
                                    "tenk_link": filing["tenk_link"],
                                    "tenk_text": filing["tenk_text"],
                                    "graphics": filing["graphics"],
                                    "text_chunks": [
                                        {"chunk_id": f"chunk_{i+1}", "content": chunk}
                                        for i, chunk in enumerate(chunks)
                                    ],
                                    "knowledge_graph": kg
                                }
                            }
                            # Save per filing
                            fname = f"{ticker}_{y}_{filing['accession_no']}_10K_graph.json"
                            save_path = os.path.join(ticker_dir, fname)
                            try:
                                with open(save_path, "w", encoding="utf-8") as f:
                                    json.dump(structured_data, f, indent=2)
                            except Exception as e:
                                print(f"Error saving file {save_path}:", e)
                    time.sleep(0.2)
    except Exception as e:
        print("Unexpected error in main execution:", e)

if __name__ == "__main__":
    main()
    print("Script executed successfully.")


Processing ticker: AAPL, CIK: 320193, Year: 2023
Structured JSON saved to: sec_10k_data\AAPL_2023_10K_graph.json
Script executed successfully.


Test

In [None]:
import os
import json
import subprocess
import sys
from time import sleep

# --- Auto-install required packages ---
def install_if_needed(pkg_name, import_name=None):
    try:
        __import__(import_name or pkg_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", pkg_name])

# Basic packages for our pipeline
for pkg, imp in [("langchain", None), ("langchain-community", None),
                 ("langchain-huggingface", "langchain_huggingface"),
                 ("transformers", None), ("sentence_transformers", "sentence_transformers"),
                 ("faiss-cpu", None)]:
    install_if_needed(pkg, imp)

# --- Import libraries ---
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
# Try importing the new HuggingFacePipeline from langchain_huggingface, fallback if needed.
try:
    from langchain_huggingface import HuggingFacePipeline
except ImportError:
    from langchain.llms import HuggingFacePipeline

from sentence_transformers import SentenceTransformer, util
import transformers

# --- Set up free LLM using HuggingFacePipeline ---
model_name = "google/flan-t5-small"
# Enable sampling to use temperature (even if 0 yields greedy behavior)
pipe = transformers.pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=model_name,
    max_length=256,
    do_sample=True,
    temperature=0
)
llm = HuggingFacePipeline(pipeline=pipe)

# --- Set up embeddings for vector store and similarity ---
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Function to load SEC 10-K documents from JSON files ---
def load_documents(data_dir="sec_10k_data"):
    documents = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        filings = json.load(f)
                        for filing in filings:
                            text = (
                                f"Form: {filing.get('form_type')}. "
                                f"Date: {filing.get('filing_date')}. "
                                f"Accession: {filing.get('accession_no')}. "
                                f"URL: {filing.get('doc_url')}."
                            )
                            metadata = {"source": file_path}
                            documents.append({"text": text, "metadata": metadata})
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
    return documents

docs = load_documents()
if not docs:
    print("No documents loaded. Check your 'sec_10k_data' directory.")
else:
    print(f"Loaded {len(docs)} documents.")

# --- Create FAISS Vector Store ---
try:
    texts = [doc["text"] for doc in docs]
    metadatas = [doc["metadata"] for doc in docs]
    vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    print("Vector store created successfully.")
except Exception as e:
    print("Error creating vector store:", e)

# --- Set up the RetrievalQA chain ---
try:
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever()
    )
    print("RetrievalQA chain is ready.")
except Exception as e:
    print("Error setting up the QA chain:", e)

# --- Function to compute cosine similarity ---
def compute_similarity(answer, documents):
    try:
        answer_embedding = similarity_model.encode(answer, convert_to_tensor=True)
        sims = []
        for doc in documents:
            doc_embedding = similarity_model.encode(doc, convert_to_tensor=True)
            cosine_sim = util.cos_sim(answer_embedding, doc_embedding)
            sims.append(cosine_sim.item())
        if sims:
            avg_sim = sum(sims) / len(sims)
            return avg_sim, sims
        else:
            return 0, []
    except Exception as e:
        print("Error computing similarity:", e)
        return 0, []

# --- Interactive Query Loop ---
def interactive_query():
    print("\n--- RAG Query System ---")
    base_query = input("Enter your query about 10-K filings: ")
    query = base_query.strip()
    while True:
        try:
            answer = qa_chain.run(query)
            print("\nLLM Answer:\n", answer)
            
            # Retrieve top 3 documents for context
            retrieved_docs = vector_store.similarity_search(query, k=3)
            print("\nTop Retrieved Documents:")
            for i, doc in enumerate(retrieved_docs, start=1):
                print(f"\nDocument {i} (Source: {doc.metadata.get('source', 'N/A')}):")
                print(doc.page_content)
            
            # Compute cosine similarity for faithfulness metrics
            retrieved_texts = [doc.page_content for doc in retrieved_docs]
            avg_similarity, sims = compute_similarity(answer, retrieved_texts)
            print("\nFaithfulness Metrics:")
            print("Average Cosine Similarity:", avg_similarity)
            print("Individual Similarities:", sims)
        except Exception as e:
            print("Error during query processing:", e)
        
        refine = input("\nWould you like to refine your query with additional details? (yes/no): ").lower().strip()
        if refine in ['yes', 'y']:
            extra = input("Enter additional details (e.g., specific year, company name, etc.): ").strip()
            query = base_query + " " + extra
        else:
            break

interactive_query()

# Future improvements:
# - Fine-tune the LLM on your SEC filings data.
# - Optimize query formulation and ask follow-up questions automatically if details are missing.
# - Track and display top-searched topics for further analysis.


Device set to use cpu


Loaded 2746 documents.
Vector store created successfully.
RetrievalQA chain is ready.

--- RAG Query System ---


Below is the final, merged code. It integrates the following enhancements:

• A reranker using a free cross‑encoder for better document selection.

• Query decomposition to break complex queries into sub‑questions.

• An interactive, React‑style query loop that lets you refine queries with extra details.

Run this cell in your notebook to launch the enhanced RAG system using only free tools.




How This Code Works:

Auto-installation: It ensures all necessary packages are installed.

Vector Store & RetrievalQA: It loads SEC 10-K filing metadata, builds a FAISS vector store, and sets up a retrieval chain using a free LLM (google/flan-t5-small).

Reranking: After initial retrieval, it reranks the top documents using a cross-encoder for improved relevance.

Query Decomposition: For complex queries, it decomposes the question into sub‑questions.

Interactive Loop: You can iteratively refine your query (e.g., add a year or company name) and the system displays the answer, top documents, and faithfulness metrics.

In [None]:
import os
import json
import subprocess
import sys
from time import sleep

# --- Auto-install required packages ---
def install_if_needed(pkg_name, import_name=None):
    try:
        __import__(import_name or pkg_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", pkg_name])

# Basic packages for our pipeline
packages = [
    ("langchain", None),
    ("langchain-community", None),
    ("langchain-huggingface", "langchain_huggingface"),
    ("transformers", None),
    ("sentence_transformers", "sentence_transformers"),
    ("faiss-cpu", None)
]
for pkg, imp in packages:
    install_if_needed(pkg, imp)

# --- Import libraries ---
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
# Try importing the new HuggingFacePipeline from langchain_huggingface; fallback if needed.
try:
    from langchain_huggingface import HuggingFacePipeline
except ImportError:
    from langchain.llms import HuggingFacePipeline

from sentence_transformers import SentenceTransformer, util, CrossEncoder
import transformers

# --- Set up free LLM using HuggingFacePipeline ---
model_name = "google/flan-t5-small"
pipe = transformers.pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=model_name,
    max_length=256,
    do_sample=True,
    temperature=0
)
llm = HuggingFacePipeline(pipeline=pipe)

# --- Set up embeddings for vector store and similarity ---
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Set up cross-encoder for reranking ---
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# --- Function to load SEC 10-K documents from JSON files ---
def load_documents(data_dir="sec_10k_data"):
    documents = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        filings = json.load(f)
                        for filing in filings:
                            text = (
                                f"Form: {filing.get('form_type')}. "
                                f"Date: {filing.get('filing_date')}. "
                                f"Accession: {filing.get('accession_no')}. "
                                f"URL: {filing.get('doc_url')}."
                            )
                            metadata = {"source": file_path}
                            documents.append({"text": text, "metadata": metadata})
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
    return documents

docs = load_documents()
if not docs:
    print("No documents loaded. Check your 'sec_10k_data' directory.")
else:
    print(f"Loaded {len(docs)} documents.")

# --- Create FAISS Vector Store ---
try:
    texts = [doc["text"] for doc in docs]
    metadatas = [doc["metadata"] for doc in docs]
    vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    print("Vector store created successfully.")
except Exception as e:
    print("Error creating vector store:", e)

# --- Set up the RetrievalQA chain ---
try:
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever()
    )
    print("RetrievalQA chain is ready.")
except Exception as e:
    print("Error setting up the QA chain:", e)

# --- Function to compute cosine similarity ---
def compute_similarity(answer, documents):
    try:
        answer_embedding = similarity_model.encode(answer, convert_to_tensor=True)
        sims = []
        for doc in documents:
            doc_embedding = similarity_model.encode(doc, convert_to_tensor=True)
            cosine_sim = util.cos_sim(answer_embedding, doc_embedding)
            sims.append(cosine_sim.item())
        if sims:
            avg_sim = sum(sims) / len(sims)
            return avg_sim, sims
        else:
            return 0, []
    except Exception as e:
        print("Error computing similarity:", e)
        return 0, []

# --- Function for reranking retrieved documents ---
def rerank_documents(query, docs, top_k=3):
    pairs = [(query, doc.page_content) for doc in docs]
    scores = reranker.predict(pairs)
    reranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, score in reranked[:top_k]]

# --- Function for query decomposition ---
def decompose_query(query):
    prompt = f"Decompose this query into sub-questions to get more detailed answers: '{query}'"
    subqueries = llm(prompt)
    return [sq.strip() for sq in subqueries.split("\n") if sq.strip()]

# --- Interactive Query Loop with React Reflection ---
def interactive_query():
    print("\n--- RAG Query System with Reranking, Decomposition, and Reflection ---")
    base_query = input("Enter your query about 10-K filings: ").strip()
    query = base_query
    while True:
        try:
            # If query is complex, decompose it into sub-questions.
            if len(query.split()) > 10:
                sub_queries = decompose_query(query)
                print("\nSub-queries generated:")
                for idx, sub in enumerate(sub_queries, 1):
                    print(f"{idx}. {sub}")
            else:
                sub_queries = [query]
            
            # Retrieve a broader set of documents.
            initial_docs = vector_store.similarity_search(query, k=10)
            if not initial_docs:
                print("No relevant documents found.")
                break
            
            # Rerank documents using the cross-encoder.
            top_docs = rerank_documents(query, initial_docs, top_k=3)
            
            # Generate an answer using the RetrievalQA chain.
            answer = qa_chain.run(query)
            print("\nLLM Answer:\n", answer)
            
            # Display top retrieved documents.
            print("\nTop Retrieved Documents after Reranking:")
            for i, doc in enumerate(top_docs, start=1):
                print(f"\nDocument {i} (Source: {doc.metadata.get('source', 'N/A')}):")
                print(doc.page_content)
            
            # Compute cosine similarity as a proxy for faithfulness.
            retrieved_texts = [doc.page_content for doc in top_docs]
            avg_similarity, sims = compute_similarity(answer, retrieved_texts)
            print("\nFaithfulness Metrics:")
            print("Average Cosine Similarity:", avg_similarity)
            print("Individual Similarities:", sims)
        except Exception as e:
            print("Error during query processing:", e)
        
        refine = input("\nWould you like to refine your query with additional details? (yes/no): ").lower().strip()
        if refine in ['yes', 'y']:
            extra = input("Enter additional details (e.g., specific year, company name, etc.): ").strip()
            query = base_query + " " + extra
            print("Refining query...")
        else:
            break

interactive_query()

# Future improvements:
# - Fine-tune the LLM on your SEC filings data.
# - Optimize query formulation and ask follow-up questions automatically if details are missing.
# - Track and display top-searched topics for further analysis.


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loaded 2746 documents.
Vector store created successfully.
RetrievalQA chain is ready.

--- RAG Query System with Reranking, Decomposition, and Reflection ---
