## API Test

In [3]:
import sys
import subprocess
import time

# Bring pip itself up to date before installing anything else; a failure
# here is reported but does not stop the rest of the cell.
try:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--upgrade", "pip"]
    )
except subprocess.CalledProcessError as e:
    print("Error upgrading pip:", e)

# Third-party packages needed by the data-collection code.
packages = ["pandas", "openpyxl", "requests"]

start_time = time.time()
errors = []

# One pip invocation per package, so a single failing package is recorded
# and the remaining ones are still attempted.
for pkg in packages:
    print(f"Installing {pkg} ...")
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--upgrade", pkg]
        )
    except subprocess.CalledProcessError as e:
        errors.append(f"{pkg}: {e}")
        print(f"Failed to install {pkg}. Continuing.")

elapsed_time = time.time() - start_time

# Summary: either the list of failures or a success line, then the timing.
print("\nInstallation completed.")
if not errors:
    print("All packages installed successfully.")
else:
    print("Encountered the following errors:")
    for err in errors:
        print(" -", err)

print(f"Elapsed time: {elapsed_time:.2f} seconds.")


Installing pandas ...
Installing openpyxl ...
Installing requests ...

Installation completed.
All packages installed successfully.
Elapsed time: 6.53 seconds.


In [6]:
import sys
import subprocess

def install_if_needed(package, import_name=None):
    """Install ``package`` with pip unless it can already be imported.

    Args:
        package: Distribution name handed to ``pip install --upgrade``.
        import_name: Module name used for the import probe. Defaults to
            ``package``; supply it when the importable module is named
            differently from the distribution.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits with a
            non-zero status.
    """
    try:
        __import__(import_name or package)
    except ImportError:
        # Use the running interpreter's pip so the package lands in the
        # same environment this code executes in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", package])

# Make sure each third-party dependency imported below is available.
for pkg in ("pandas", "openpyxl", "requests"):
    install_if_needed(pkg)

import pandas as pd
import requests
import os
import json
from time import sleep

# Headers sent with the SEC requests below. The User-Agent carries a contact
# email; replace it with your own before running.
HEADERS = {"User-Agent": "RAGFinancialReport/1.0 (niharskumar@gmail.com)"}

def get_all_tickers_info():
    """Download the SEC company-tickers JSON.

    Returns:
        The decoded JSON payload, or an empty dict when the request, the
        HTTP status check or the JSON decoding fails (the error is printed).
    """
    tickers_endpoint = "https://www.sec.gov/files/company_tickers.json"
    try:
        response = requests.get(tickers_endpoint, headers=HEADERS, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        print("Error fetching tickers info:", e)
        payload = {}
    return payload

def find_cik(ticker, tickers_data):
    """Look up the CIK for ``ticker`` (case-insensitive).

    Args:
        ticker: Ticker symbol to search for.
        tickers_data: Mapping whose values are dicts holding at least the
            ``"ticker"`` and ``"cik_str"`` keys.

    Returns:
        The ``"cik_str"`` value of the first matching entry, or ``None``
        when nothing matches or the inputs are unusable.
    """
    try:
        # Normalise once instead of on every iteration.
        wanted = ticker.lower()
        entries = tickers_data.values()
    except AttributeError as e:
        print("Error finding CIK for", ticker, ":", e)
        return None

    for info in entries:
        try:
            if info["ticker"].lower() == wanted:
                return info["cik_str"]
        except (KeyError, TypeError, AttributeError):
            # A malformed entry must not abort the whole search; skip it
            # and keep checking the remaining entries.
            continue
    return None

def fetch_10k(cik, year):
    """Return the 10-K filings of ``cik`` whose filing date starts with ``year``.

    The submissions JSON for the CIK is downloaded, then the parallel
    ``form`` / ``filingDate`` / ``accessionNumber`` lists under
    ``filings.recent`` are scanned.

    Args:
        cik: Company CIK; zero-padded to 10 digits for the request URL.
        year: Year matched against the beginning of each filing date.

    Returns:
        A list of dicts with the keys ``form_type``, ``filing_date``,
        ``accession_no`` and ``doc_url``; ``None`` when the download fails,
        the status is not 200, processing raises, or nothing matches.
    """
    submissions_url = f"https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json"
    try:
        response = requests.get(submissions_url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch data for CIK {cik} in {year}: status code {response.status_code}")
            return None
        data = response.json()
    except Exception as e:
        print(f"Error fetching data for CIK {cik} in {year}:", e)
        return None

    matches = []
    try:
        recent = data.get("filings", {}).get("recent", {})
        forms = recent.get("form", [])
        dates = recent.get("filingDate", [])
        accessions = recent.get("accessionNumber", [])

        for idx, form_type in enumerate(forms):
            filing_date = dates[idx]
            accession_no = accessions[idx]
            # Keep only annual reports filed in the requested year.
            if form_type != "10-K" or not filing_date.startswith(str(year)):
                continue
            # The archive folder is the accession number without dashes.
            doc_url = (
                f"https://www.sec.gov/Archives/edgar/data/"
                f"{int(cik)}/{accession_no.replace('-', '')}/{accession_no}-index.htm"
            )
            matches.append({
                "form_type": form_type,
                "filing_date": filing_date,
                "accession_no": accession_no,
                "doc_url": doc_url
            })
    except Exception as e:
        print(f"Error processing filings for CIK {cik} in {year}:", e)
        return None

    return matches or None

def main(
    excel_file=r"C:\Users\Nih4r\Documents\GitHub\RAG_Financial_Report\notebooks\data\Data_companies_list.xlsx",
    years=range(2012, 2026),
    output_dir="sec_10k_data",
):
    """Download 10-K filing metadata for every ticker in an Excel sheet.

    For each value of the ``Symbol`` column the CIK is resolved and, for
    each year, the matching filings are written to
    ``<output_dir>/<ticker>/<ticker>_<year>_10K.json``.

    Args:
        excel_file: Workbook with a ``Symbol`` column. Defaults to the
            original author's local path, so calling ``main()`` behaves
            as before.
        years: Iterable of years to query.
        output_dir: Directory that receives one sub-directory per ticker.

    Returns:
        None. All failures are reported with ``print`` rather than raised.
    """
    try:
        try:
            df = pd.read_excel(excel_file)
        except Exception as e:
            print("Error reading Excel file:", excel_file, e)
            return

        if "Symbol" not in df.columns:
            print("Excel file has no 'Symbol' column:", excel_file)
            return

        tickers_data = get_all_tickers_info()
        if not tickers_data:
            print("No tickers data available. Exiting.")
            return

        os.makedirs(output_dir, exist_ok=True)

        # dropna() skips blank cells, which would otherwise be turned into
        # the literal ticker "nan" by str().
        for symbol in df["Symbol"].dropna():
            ticker = str(symbol).strip()
            if not ticker:
                continue

            cik = find_cik(ticker, tickers_data)
            if not cik:
                print(f"CIK not found for {ticker}")
                continue

            ticker_dir = os.path.join(output_dir, ticker)
            os.makedirs(ticker_dir, exist_ok=True)

            # NOTE(review): fetch_10k builds its request URL from the CIK
            # only, so each year triggers another download of the same
            # submissions document; the sleep below spaces those requests.
            for y in years:
                filings = fetch_10k(cik, y)
                if filings:
                    save_path = os.path.join(ticker_dir, f"{ticker}_{y}_10K.json")
                    try:
                        with open(save_path, "w", encoding="utf-8") as f:
                            json.dump(filings, f, indent=2)
                    except Exception as e:
                        print(f"Error saving file {save_path}:", e)
                sleep(0.2)
    except Exception as e:
        print("Unexpected error in main execution:", e)

# Run the download pipeline when this module is the entry point.
# NOTE(review): the message below is printed unconditionally. main() reports
# its failures with print() and returns normally, so "executed successfully"
# also appears after a run that bailed out early.
if __name__ == "__main__":
    main()
    print("Script executed successfully.")

CIK not found for GATO
CIK not found for HYZN
CIK not found for FFIE
CIK not found for FFIEW
CIK not found for WESTW
CIK not found for PXD
CIK not found for CHKEZ
CIK not found for TELZ
CIK not found for CHKEL
CIK not found for NS
CIK not found for CEI
CIK not found for IVCP
CIK not found for NVTA
CIK not found for VAXX
CIK not found for AGLE
CIK not found for B
CIK not found for NCR
CIK not found for FRG
CIK not found for SUNW
CIK not found for CTIB
CIK not found for HTIA
CIK not found for WE
CIK not found for MDRRP
CIK not found for AYX
CIK not found for PAYOW
CIK not found for CPSI
CIK not found for MVLA
CIK not found for GSD
CIK not found for TBC
CIK not found for HEAR
CIK not found for ITI
CIK not found for AESC
CIK not found for PNM
CIK not found for PEGY
Script executed successfully.


In [None]:
import os
import json
import subprocess
import sys
from time import sleep

# --- Auto-install required packages ---
def install_if_needed(pkg_name, import_name=None):
    """Pip-install ``pkg_name`` when its module cannot be imported.

    Args:
        pkg_name: Distribution name passed to ``pip install --upgrade``.
        import_name: Module name to probe; falls back to ``pkg_name``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    module_name = import_name or pkg_name
    try:
        __import__(module_name)
    except ImportError:
        pip_command = [sys.executable, "-m", "pip", "install", "--upgrade", pkg_name]
        subprocess.check_call(pip_command)

# Basic packages for our pipeline, as (pip name, import name) pairs.
# The import name must be given whenever it differs from the pip name: a
# hyphenated name can never be imported, so without it the import probe
# always fails and the package is re-installed on every run.
for pkg, imp in [("langchain", None),
                 ("langchain-community", "langchain_community"),
                 ("langchain-huggingface", "langchain_huggingface"),
                 ("transformers", None),
                 ("sentence_transformers", "sentence_transformers"),
                 ("faiss-cpu", "faiss")]:
    install_if_needed(pkg, imp)

# --- Import libraries ---
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
# Try importing the new HuggingFacePipeline from langchain_huggingface, fallback if needed.
try:
    from langchain_huggingface import HuggingFacePipeline
except ImportError:
    from langchain.llms import HuggingFacePipeline

from sentence_transformers import SentenceTransformer, util
import transformers

# --- Set up free LLM using HuggingFacePipeline ---
model_name = "google/flan-t5-small"
# Use greedy decoding for deterministic answers. The previous combination
# (do_sample=True with temperature=0) is rejected by transformers when text
# is generated, because a sampling temperature must be strictly positive.
pipe = transformers.pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=model_name,
    max_length=256,
    do_sample=False
)
llm = HuggingFacePipeline(pipeline=pipe)

# --- Set up embeddings for vector store and similarity ---
# `embeddings` feeds the FAISS index; `similarity_model` is used by
# compute_similarity for the answer-vs-document comparison.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Function to load SEC 10-K documents from JSON files ---
def load_documents(data_dir="sec_10k_data"):
    """Collect one text document per filing from the JSON files under ``data_dir``.

    Every ``*.json`` file found while walking ``data_dir`` is expected to hold
    a list of filing dicts. Each filing becomes
    ``{"text": <summary sentence>, "metadata": {"source": <file path>}}``.
    A file that cannot be read or parsed is reported and skipped; filings
    appended before the failure are kept.

    Args:
        data_dir: Root directory to scan recursively.

    Returns:
        list[dict]: The collected documents (empty if nothing was found).
    """
    collected = []
    for folder, _subdirs, names in os.walk(data_dir):
        for name in names:
            if not name.endswith(".json"):
                continue
            file_path = os.path.join(folder, name)
            try:
                with open(file_path, "r", encoding="utf-8") as handle:
                    records = json.load(handle)
                for record in records:
                    summary = (
                        f"Form: {record.get('form_type')}. "
                        f"Date: {record.get('filing_date')}. "
                        f"Accession: {record.get('accession_no')}. "
                        f"URL: {record.get('doc_url')}."
                    )
                    collected.append({"text": summary, "metadata": {"source": file_path}})
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
    return collected

docs = load_documents()
if not docs:
    print("No documents loaded. Check your 'sec_10k_data' directory.")
else:
    print(f"Loaded {len(docs)} documents.")

# --- Create FAISS Vector Store ---
texts = [doc["text"] for doc in docs]
metadatas = [doc["metadata"] for doc in docs]
# Reset both handles first, so a failure below (or a re-run of this cell)
# never leaves an undefined or stale object for the later steps to pick up.
vector_store = None
qa_chain = None
if docs:
    try:
        vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        print("Vector store created successfully.")
    except Exception as e:
        print("Error creating vector store:", e)
else:
    print("Skipping vector store creation: there are no documents to index.")

# --- Set up the RetrievalQA chain ---
# The chain needs a retriever, so it is only built when the index exists.
if vector_store is not None:
    try:
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever()
        )
        print("RetrievalQA chain is ready.")
    except Exception as e:
        print("Error setting up the QA chain:", e)
else:
    print("Skipping QA chain setup: no vector store is available.")

# --- Function to compute cosine similarity ---
def compute_similarity(answer, documents):
    """Compare ``answer`` with each document using embedding cosine similarity.

    Args:
        answer: Text produced by the LLM.
        documents: Iterable of document texts to compare against.

    Returns:
        tuple: ``(average, per_document_scores)``; ``(0, [])`` when there are
        no documents or when encoding/scoring raises (the error is printed).
    """
    try:
        answer_vec = similarity_model.encode(answer, convert_to_tensor=True)
        scores = []
        for text in documents:
            text_vec = similarity_model.encode(text, convert_to_tensor=True)
            scores.append(util.cos_sim(answer_vec, text_vec).item())
        if not scores:
            return 0, []
        return sum(scores) / len(scores), scores
    except Exception as e:
        print("Error computing similarity:", e)
        return 0, []

# --- Interactive Query Loop ---
def interactive_query():
    """Run the question/answer loop on stdin/stdout.

    Reads a query, prints the chain's answer, the top 3 retrieved documents
    and the cosine-similarity figures, then asks whether to refine. A
    refinement replaces the query with the first query text plus the new
    details; any other reply ends the loop. Errors while processing a query
    are printed and the refine prompt is still shown.
    """
    print("\n--- RAG Query System ---")
    base_query = input("Enter your query about 10-K filings: ")
    query = base_query.strip()
    while True:
        try:
            answer = qa_chain.run(query)
            print("\nLLM Answer:\n", answer)

            # Fetch and show the 3 nearest documents for context.
            hits = vector_store.similarity_search(query, k=3)
            print("\nTop Retrieved Documents:")
            for rank, hit in enumerate(hits, start=1):
                print(f"\nDocument {rank} (Source: {hit.metadata.get('source', 'N/A')}):")
                print(hit.page_content)

            # Similarity between the answer and those documents is reported
            # as the faithfulness figure.
            hit_texts = [hit.page_content for hit in hits]
            avg_similarity, sims = compute_similarity(answer, hit_texts)
            print("\nFaithfulness Metrics:")
            print("Average Cosine Similarity:", avg_similarity)
            print("Individual Similarities:", sims)
        except Exception as e:
            print("Error during query processing:", e)

        reply = input("\nWould you like to refine your query with additional details? (yes/no): ").lower().strip()
        if reply not in ('yes', 'y'):
            break
        extra = input("Enter additional details (e.g., specific year, company name, etc.): ").strip()
        query = base_query + " " + extra

# Start the interactive session; it waits on input() and returns once the
# user declines to refine the query.
interactive_query()

# Future improvements:
# - Fine-tune the LLM on your SEC filings data.
# - Optimize query formulation and ask follow-up questions automatically if details are missing.
# - Track and display top-searched topics for further analysis.


Device set to use cpu


Loaded 2746 documents.
Vector store created successfully.
RetrievalQA chain is ready.

--- RAG Query System ---


Below is the final, merged code. It integrates the following enhancements:

• A reranker using a free cross‑encoder for better document selection.

• Query decomposition to break complex queries into sub‑questions.

• An interactive, ReAct‑style query loop that lets you refine queries with extra details.

Run this cell in your notebook to launch the enhanced RAG system using only free tools.




How This Code Works:

Auto-installation: It ensures all necessary packages are installed.

Vector Store & RetrievalQA: It loads SEC 10-K filing metadata, builds a FAISS vector store, and sets up a retrieval chain using a free LLM (google/flan-t5-small).

Reranking: After initial retrieval, it reranks the top documents using a cross-encoder for improved relevance.

Query Decomposition: For complex queries, it decomposes the question into sub‑questions.

Interactive Loop: You can iteratively refine your query (e.g., add a year or company name) and the system displays the answer, top documents, and faithfulness metrics.

In [None]:
import os
import json
import subprocess
import sys
from time import sleep

# --- Auto-install required packages ---
def install_if_needed(pkg_name, import_name=None):
    """Ensure a package is importable, installing it with pip if it is not.

    Args:
        pkg_name: Name given to ``pip install --upgrade``.
        import_name: Module name used for the import check; when omitted,
            ``pkg_name`` is used.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    try:
        __import__(import_name if import_name else pkg_name)
        return
    except ImportError:
        pass
    # Import failed: install into the environment of the running interpreter.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--upgrade", pkg_name]
    )

# Basic packages for our pipeline, as (pip name, import name) pairs.
# An import name is required whenever it differs from the pip name: a
# hyphenated name can never be imported, so without it the import probe
# always fails and the package is re-installed on every run.
packages = [
    ("langchain", None),
    ("langchain-community", "langchain_community"),
    ("langchain-huggingface", "langchain_huggingface"),
    ("transformers", None),
    ("sentence_transformers", "sentence_transformers"),
    ("faiss-cpu", "faiss")
]
for pkg, imp in packages:
    install_if_needed(pkg, imp)

# --- Import libraries ---
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
# Try importing the new HuggingFacePipeline from langchain_huggingface; fallback if needed.
try:
    from langchain_huggingface import HuggingFacePipeline
except ImportError:
    from langchain.llms import HuggingFacePipeline

from sentence_transformers import SentenceTransformer, util, CrossEncoder
import transformers

# --- Set up free LLM using HuggingFacePipeline ---
model_name = "google/flan-t5-small"
# Use greedy decoding for deterministic answers. The previous combination
# (do_sample=True with temperature=0) is rejected by transformers when text
# is generated, because a sampling temperature must be strictly positive.
pipe = transformers.pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=model_name,
    max_length=256,
    do_sample=False
)
llm = HuggingFacePipeline(pipeline=pipe)

# --- Set up embeddings for vector store and similarity ---
# `embeddings` feeds the FAISS index; `similarity_model` is used by
# compute_similarity for the answer-vs-document comparison.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Set up cross-encoder for reranking ---
# Scores (query, document) pairs in rerank_documents.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# --- Function to load SEC 10-K documents from JSON files ---
def load_documents(data_dir="sec_10k_data"):
    """Build text documents from the filing JSON files found under ``data_dir``.

    The directory tree is walked recursively. Each ``*.json`` file is expected
    to contain a list of filing dicts, and every filing yields one entry of
    the form ``{"text": ..., "metadata": {"source": <file path>}}``. Files
    that fail to open or parse are reported with ``print`` and skipped;
    entries already appended from that file are kept.

    Args:
        data_dir: Root directory to scan.

    Returns:
        list[dict]: One entry per filing, possibly empty.
    """
    documents = []
    for root, _dirs, files in os.walk(data_dir):
        json_paths = [os.path.join(root, fname) for fname in files if fname.endswith(".json")]
        for file_path in json_paths:
            try:
                with open(file_path, "r", encoding="utf-8") as src:
                    filings = json.load(src)
                for entry in filings:
                    form = entry.get('form_type')
                    filed = entry.get('filing_date')
                    accession = entry.get('accession_no')
                    link = entry.get('doc_url')
                    documents.append({
                        "text": f"Form: {form}. Date: {filed}. Accession: {accession}. URL: {link}.",
                        "metadata": {"source": file_path},
                    })
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
    return documents

docs = load_documents()
if not docs:
    print("No documents loaded. Check your 'sec_10k_data' directory.")
else:
    print(f"Loaded {len(docs)} documents.")

# --- Create FAISS Vector Store ---
texts = [doc["text"] for doc in docs]
metadatas = [doc["metadata"] for doc in docs]
# Reset both handles first, so a failure below (or a re-run of this cell)
# never leaves an undefined or stale object for the later steps to pick up.
vector_store = None
qa_chain = None
if docs:
    try:
        vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
        print("Vector store created successfully.")
    except Exception as e:
        print("Error creating vector store:", e)
else:
    print("Skipping vector store creation: there are no documents to index.")

# --- Set up the RetrievalQA chain ---
# The chain needs a retriever, so it is only built when the index exists.
if vector_store is not None:
    try:
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever()
        )
        print("RetrievalQA chain is ready.")
    except Exception as e:
        print("Error setting up the QA chain:", e)
else:
    print("Skipping QA chain setup: no vector store is available.")

# --- Function to compute cosine similarity ---
def compute_similarity(answer, documents):
    """Score how close ``answer`` is to each of ``documents``.

    Both sides are embedded with ``similarity_model`` and compared with
    cosine similarity.

    Args:
        answer: Generated answer text.
        documents: Iterable of document texts.

    Returns:
        tuple: ``(mean_score, scores)``. ``(0, [])`` is returned when
        ``documents`` is empty or when any step raises (the error is printed).
    """
    try:
        answer_embedding = similarity_model.encode(answer, convert_to_tensor=True)
        sims = [
            util.cos_sim(
                answer_embedding,
                similarity_model.encode(doc, convert_to_tensor=True),
            ).item()
            for doc in documents
        ]
        if not sims:
            return 0, []
        return sum(sims) / len(sims), sims
    except Exception as e:
        print("Error computing similarity:", e)
        return 0, []

# --- Function for reranking retrieved documents ---
def rerank_documents(query, docs, top_k=3):
    """Order ``docs`` by cross-encoder relevance to ``query`` and keep the best.

    Args:
        query: Query text paired with every document.
        docs: Retrieved documents exposing ``page_content``.
        top_k: Number of documents to return.

    Returns:
        list: Up to ``top_k`` documents, highest score first.
    """
    scores = reranker.predict([(query, candidate.page_content) for candidate in docs])
    scored = list(zip(docs, scores))
    # In-place stable sort, highest relevance first.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _score in scored[:top_k]]

# --- Function for query decomposition ---
def decompose_query(query):
    """Ask the LLM to split ``query`` into sub-questions.

    Args:
        query: The user's question.

    Returns:
        list[str]: The non-empty, stripped lines of the model output. When
        the model produces no usable line, ``[query]`` is returned so callers
        always get at least one question to work with.
    """
    prompt = f"Decompose this query into sub-questions to get more detailed answers: '{query}'"
    # Prefer the `invoke` entry point when the LLM object offers it; calling
    # the object directly is the legacy spelling and is kept as a fallback.
    generate = getattr(llm, "invoke", llm)
    raw_output = generate(prompt)
    sub_questions = [line.strip() for line in str(raw_output).splitlines() if line.strip()]
    return sub_questions or [query]

# --- Interactive Query Loop with React Reflection ---
def interactive_query():
    """Run the retrieve -> rerank -> answer loop on stdin/stdout.

    For each round:
      1. Queries longer than 10 words are decomposed into sub-questions.
      2. Candidates are retrieved for the query and for every sub-question,
         then de-duplicated.
      3. The candidates are reranked against the full query and the top 3
         are kept.
      4. The answer is generated from exactly those reranked documents, so
         the documents shown and scored are the ones the answer was built on.
      5. The user may add details; the refined query is the first query
         text plus the new details.

    Errors while processing a round are printed and the refine prompt is
    still shown. The loop ends when the user declines to refine or when no
    document is retrieved.
    """
    print("\n--- RAG Query System with Reranking, Decomposition, and Reflection ---")
    base_query = input("Enter your query about 10-K filings: ").strip()
    query = base_query
    while True:
        try:
            # If query is complex, decompose it into sub-questions.
            if len(query.split()) > 10:
                sub_queries = decompose_query(query)
                print("\nSub-queries generated:")
                for idx, sub in enumerate(sub_queries, 1):
                    print(f"{idx}. {sub}")
            else:
                sub_queries = [query]

            # Retrieve a broad candidate set for the query and each
            # sub-question. dict.fromkeys drops repeated search strings
            # while keeping their order.
            search_texts = list(dict.fromkeys([query, *sub_queries]))
            initial_docs = []
            seen = set()
            for search_text in search_texts:
                for doc in vector_store.similarity_search(search_text, k=10):
                    identity = (doc.page_content, doc.metadata.get("source"))
                    if identity not in seen:
                        seen.add(identity)
                        initial_docs.append(doc)
            if not initial_docs:
                print("No relevant documents found.")
                break

            # Rerank documents using the cross-encoder.
            top_docs = rerank_documents(query, initial_docs, top_k=3)

            # Generate the answer from the reranked documents by calling the
            # chain's document-combining step directly; qa_chain.run(query)
            # would run its own retrieval and ignore the reranking above.
            answer = qa_chain.combine_documents_chain.run(
                input_documents=top_docs, question=query
            )
            print("\nLLM Answer:\n", answer)

            # Display top retrieved documents.
            print("\nTop Retrieved Documents after Reranking:")
            for i, doc in enumerate(top_docs, start=1):
                print(f"\nDocument {i} (Source: {doc.metadata.get('source', 'N/A')}):")
                print(doc.page_content)

            # Compute cosine similarity as a proxy for faithfulness.
            retrieved_texts = [doc.page_content for doc in top_docs]
            avg_similarity, sims = compute_similarity(answer, retrieved_texts)
            print("\nFaithfulness Metrics:")
            print("Average Cosine Similarity:", avg_similarity)
            print("Individual Similarities:", sims)
        except Exception as e:
            print("Error during query processing:", e)

        refine = input("\nWould you like to refine your query with additional details? (yes/no): ").lower().strip()
        if refine in ['yes', 'y']:
            extra = input("Enter additional details (e.g., specific year, company name, etc.): ").strip()
            query = base_query + " " + extra
            print("Refining query...")
        else:
            break

# Start the interactive session; it waits on input() and returns once the
# user declines to refine the query or no document is retrieved.
interactive_query()

# Future improvements:
# - Fine-tune the LLM on your SEC filings data.
# - Optimize query formulation and ask follow-up questions automatically if details are missing.
# - Track and display top-searched topics for further analysis.
