## Installing Dependencies

In [None]:
!pip install langchain langchain-openai langchain-community
!pip install sentence-transformers
!pip install beautifulsoup4 requests numpy
!pip install faiss-cpu
!pip install numpy langchain-text-splitters langchain-google-genai scikit-learn
!pip install lxml
!pip install huggingface_hub[hf_xet]


In [1]:
import os
import time
import requests
import numpy as np
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from langchain_google_genai import ChatGoogleGenerativeAI
import json
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Credentials are read from the environment and only prompted for when missing,
# so real keys are never written into the notebook and an already-exported
# value is never overwritten.
import getpass

# Google API KEY (read by langchain_google_genai from the environment)
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("GOOGLE_API_KEY: ")

# HuggingFace TOKEN
# NOTE(review): presumably optional for a public model - left unset when the
# prompt is skipped, so os.getenv("HUGGINGFACE_HUB_TOKEN") yields None.
if not os.environ.get("HUGGINGFACE_HUB_TOKEN"):
    _hf_token_input = getpass.getpass("HUGGINGFACE_HUB_TOKEN (optional, press Enter to skip): ")
    if _hf_token_input:
        os.environ["HUGGINGFACE_HUB_TOKEN"] = _hf_token_input



# Optional

# Azure OpenAI credentials (only needed for the commented-out AzureChatOpenAI setup)
# api_key="xxxxxxxxxxxxxxxxxxxxXXXXXXXXXXX"
# endpoint="xXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# deployment="xxxxxxxxxxxxXXXXXXXXXXXXXXXXXXXXX"
# version="xXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

## Data Collection

In [3]:
# Ticker -> zero-padded SEC CIK used to build the EDGAR submissions URL.
COMPANIES = {
    "GOOGL": "0001652044",
    "MSFT": "0000789019",
    "NVDA": "0001045810",
}

# Report years to fetch, as strings because they are prefix-matched against
# the filing's report date.
YEARS = [
    "2022",
    "2023",
    "2024",
]

# Request headers sent with every SEC call.
HEADERS = {
    "User-Agent": "Company Research/1.0 (your.email@example.com)",
}


def download_10k(ticker, cik, year):
    """Download one company's 10-K for a given report year into ``filings/``.

    Queries the SEC EDGAR submissions endpoint for the company's recent
    filings, picks the first ``10-K`` whose ``reportDate`` starts with
    ``year`` and saves its primary document as ``filings/{ticker}_{year}.html``.

    Args:
        ticker: Ticker symbol; used for the output filename and log lines.
        cik: CIK string exactly as it appears in the submissions URL.
        year: Four-digit year string, prefix-matched against ``reportDate``.

    Returns:
        True if a filing was downloaded and written; False on any request or
        I/O error, or when no matching 10-K is found in the recent filings.
    """
    try:
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=HEADERS, timeout=30)
        # Fail with the HTTP status instead of a confusing JSON decode error.
        response.raise_for_status()
        filings = response.json()["filings"]["recent"]

        forms = filings["form"]
        report_dates = filings.get("reportDate", [None] * len(forms))

        for i, form in enumerate(forms):
            if form != "10-K":
                continue

            report_date = report_dates[i]
            if not (report_date and report_date.startswith(year)):
                continue

            acc_num = filings["accessionNumber"][i].replace("-", "")
            primary_doc = filings["primaryDocument"][i]

            # The archive path uses the CIK without its leading zeros.
            base_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_num}/"
            file_url = base_url + primary_doc

            response = requests.get(file_url, headers=HEADERS, timeout=30)
            response.raise_for_status()

            # Safe to call repeatedly; lets the function work on its own.
            os.makedirs("filings", exist_ok=True)
            filename = f"{ticker}_{year}.html"
            with open(f"filings/{filename}", "wb") as f:
                f.write(response.content)

            print(f"+=+=+=+= Downloaded {filename} (fiscal year {year}) +=+=+=+=")
            return True

    except Exception as e:
        print(f"ERROR {ticker} {year}: {e}")
        return False

    # Reached only when the loop finished without finding a matching filing.
    print(f"ERROR {ticker} {year}: no 10-K with a {year} report date in recent filings")
    return False

# The target directory must exist before the first filing is written.
os.makedirs("filings", exist_ok=True)

# Every (ticker, CIK, year) combination, companies outermost.
download_jobs = [
    (ticker, cik, year)
    for ticker, cik in COMPANIES.items()
    for year in YEARS
]

for ticker, cik, year in download_jobs:
    download_10k(ticker, cik, year)
    time.sleep(0.1)  # Be respectful to SEC servers

print("\nDownload complete!")

+=+=+=+= Downloaded GOOGL_2022.html (fiscal year 2022) +=+=+=+=
+=+=+=+= Downloaded GOOGL_2023.html (fiscal year 2023) +=+=+=+=
+=+=+=+= Downloaded GOOGL_2024.html (fiscal year 2024) +=+=+=+=
+=+=+=+= Downloaded MSFT_2022.html (fiscal year 2022) +=+=+=+=
+=+=+=+= Downloaded MSFT_2023.html (fiscal year 2023) +=+=+=+=
+=+=+=+= Downloaded MSFT_2024.html (fiscal year 2024) +=+=+=+=
+=+=+=+= Downloaded NVDA_2022.html (fiscal year 2022) +=+=+=+=
+=+=+=+= Downloaded NVDA_2023.html (fiscal year 2023) +=+=+=+=
+=+=+=+= Downloaded NVDA_2024.html (fiscal year 2024) +=+=+=+=

Download complete!


##  Parse HTML

In [4]:
# Turn every downloaded filing into one plain-text Document tagged with its
# company and year.
documents = []
# sorted(): os.listdir() returns entries in arbitrary order, and the document
# order determines the chunk order that the vector index relies on.
for fname in sorted(os.listdir("filings")):
    stem, ext = os.path.splitext(fname)
    # Only "<TICKER>_<YEAR>.html" files are filings; skip stray entries
    # (checkpoint folders, OS metadata files, ...) instead of crashing on them.
    if ext != ".html" or stem.count("_") != 1:
        continue
    ticker, year = stem.split("_")

    path = os.path.join("filings", fname)
    with open(path, "rb") as f:
        html = f.read()
    soup = BeautifulSoup(html, "lxml")

    # Drop non-content markup before extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n")

    # Strip each line and discard the empty ones.
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    content = "\n".join(lines)
    documents.append(Document(page_content=content, metadata={"company": ticker, "year": year}))
    print(f"Extracted text for {ticker} {year} (chars={len(content)})")



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "lxml")


Extracted text for GOOGL 2022 (chars=349560)
Extracted text for GOOGL 2023 (chars=371213)
Extracted text for GOOGL 2024 (chars=381674)
Extracted text for MSFT 2022 (chars=378985)
Extracted text for MSFT 2023 (chars=373753)
Extracted text for MSFT 2024 (chars=390462)
Extracted text for NVDA 2022 (chars=325456)
Extracted text for NVDA 2023 (chars=339219)
Extracted text for NVDA 2024 (chars=358853)


## Chunking: Split documents into overlapping character-based chunks

In [5]:

# Recursive character splitter: chunks of up to 1500 with an overlap of 50.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=1500,
)

# Each chunk keeps the metadata of the document it was cut from.
chunks = text_splitter.split_documents(documents)

print(f"Total chunks created: {len(chunks)} ")


Total chunks created: 2660 


## TF-IDF

In [6]:
# Raw text of every chunk; row i of each embedding matrix corresponds to chunks[i].
chunk_texts = [doc.page_content for doc in chunks]

# Lexical view of the corpus: unigrams + bigrams, capped vocabulary,
# log-scaled term frequency, unit-length rows.
tfidf_options = dict(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    max_features=5000,
    norm="l2",
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary on the chunks and densify the sparse result.
tfidf_sparse = tfidf.fit_transform(chunk_texts)
tfidf_embeddings = tfidf_sparse.toarray()


## Embeddings & FAISS Vector Store

In [7]:

# Sentence-embedding model for the semantic half of the fused vectors.
# The token comes from the environment and is None when the variable is unset.
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    token=hf_token,
)
# Alternative model:
# model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", token=os.getenv("HUGGINGFACE_HUB_TOKEN"))


In [8]:

# Semantic embedding of every chunk, returned as a NumPy array.
sem_embeddings = model.encode(chunk_texts, convert_to_numpy=True)

# Hybrid representation: TF-IDF columns first, sentence-embedding columns after.
fused_embeddings = np.concatenate((tfidf_embeddings, sem_embeddings), axis=1)


In [9]:
# FAISS operates on C-contiguous float32 matrices. Stacking the float64 TF-IDF
# matrix with the sentence embeddings yields float64, so convert explicitly
# instead of relying on the binding to coerce (or reject) the input.
index_vectors = np.ascontiguousarray(fused_embeddings, dtype=np.float32)

d = index_vectors.shape[1]  # dimensionality of the fused vectors
index = faiss.IndexFlatL2(d)  # exact (brute-force) L2 search
index.add(index_vectors)
print(f"FAISS index populated with {index.ntotal} vectors.")

FAISS index populated with 2660 vectors.


In [10]:
def retriver(sq, k = 3):
    """Return the ``k`` indexed chunks closest to the query string ``sq``.

    The query is embedded the same way as the corpus: its TF-IDF vector and
    its sentence embedding are concatenated, then searched in the FAISS index.

    Args:
        sq: Query text.
        k: Number of nearest chunks to return (default 3).

    Returns:
        List of up to ``k`` chunk Documents, nearest first.
    """
    t_emb = tfidf.transform([sq]).toarray()
    s_emb = model.encode([sq])
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, d).
    q_emb = np.ascontiguousarray(np.hstack([t_emb, s_emb]), dtype=np.float32)
    # Honour the caller's k (it was previously hard-coded to 3 here).
    _, neighbor_ids = index.search(q_emb, k)
    # FAISS pads with -1 when fewer than k vectors exist; skip those slots so
    # they are not misread as chunks[-1].
    return [chunks[idx] for idx in neighbor_ids[0] if idx >= 0]

retriver("MSFT Segment revenue and operating income")

[Document(metadata={'company': 'MSFT', 'year': '2022'}, page_content='•\nSearch and news advertising.\n93\nPART II\nItem 8\nRevenue and costs are generally directly attributed to our segments. However, due to the integrated structure of our business, certain revenue recognized and costs incurred by one segment may benefit other segments. Revenue from certain contracts is allocated among the segments based on the relative value of the underlying products and services, which can include allocation based on actual prices charged, prices when sold separately, or estimated costs plus a profit margin. Cost of revenue is allocated in certain cases based on a relative revenue methodology. Operating expenses that are allocated primarily include those relating to marketing of products and services from which multiple segments benefit and are generally allocated based on relative gross margin.\nIn addition, certain costs incurred at a corporate level that are identifiable and that benefit our seg

## LLM Model initialization

In [11]:

# Chat model used for both sub-query generation and final answering.
# NOTE(review): temperature=0.7 looks high for answers that must stay grounded
# in retrieved context (the commented Azure alternative uses 0.3) - confirm.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)

# Optional alternative backend; needs the Azure variables from the credentials cell.
# from langchain_openai import AzureChatOpenAI
# llm = AzureChatOpenAI(
#             temperature=0.3,
#             azure_endpoint=endpoint,
#             api_key=api_key,
#             deployment_name=deployment,
#             api_version=version
#         )

# Smoke test: confirms the model and credentials work before the pipeline runs.
llm.invoke("hi")


AIMessage(content='Hi there! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []}, id='run--eb6c034a-33b3-45ce-8f29-f34bee5bef4b-0', usage_metadata={'input_tokens': 2, 'output_tokens': 34, 'total_tokens': 36, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 24}})

In [12]:
# One expanded sub-query; used as the element type of QueriesExpension.queries.
# Documented with comments rather than a docstring because a model docstring
# would presumably be included in the schema description sent to the LLM.
class Query(BaseModel):
    query: str = Field(description = "context rich relevant queries")
    
# Structured output schema for the query-expansion call in answer_query.
# NOTE(review): this description asks for 5-7 sub-queries while the system
# prompt in answer_query asks for 7-10 - confirm which range is intended.
class QueriesExpension(BaseModel):
    queries: List[Query] = Field(description = "List of 5-7 unique subqueries")


# Structured output schema for the final answering call in answer_query:
# the answer text plus the model's justification for it.
class finalanswer(BaseModel):
    answer: str = Field(description = "Final answer to user's query based on context")
    reasoning: str = Field(description = "reasoning behind the answer")
    

In [13]:
def answer_query(query):
    """Answer a question about the indexed 10-K filings (query expansion + RAG).

    Steps:
      1. Ask the LLM to decompose ``query`` into focused sub-queries.
      2. Retrieve the nearest chunks for every sub-query and merge them,
         dropping chunks that were already retrieved by an earlier sub-query.
      3. Ask the LLM for a structured answer grounded in the merged context.

    Args:
        query: The user's natural-language question.

    Returns:
        dict with keys ``query``, ``answer``, ``reasoning``, ``sub_queries``
        and ``sources``. Each source is a dict with ``company``, ``year``,
        ``excerpt`` (chunk text with newlines replaced by spaces) and ``page``
        (always None).
    """
    # +=+=+=+=+=+=+=+=+=+=+=+=+=+= Sub-query generation

    # The user query is passed as a template variable, not embedded in the
    # template itself, so braces in the question cannot break prompt formatting.
    prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an expert SEC EDGAR filing analyst specializing in 10-K documents for GOOGL, MSFT, and NVDA. 

    Your task is to decompose user queries into 7-10 focused sub-queries that will retrieve the most relevant information from SEC 10-K filings. Each sub-query should:

    1. Use precise SEC filing terminology (e.g., "revenue recognition", "operating segments", "risk factors", "material agreements", "liquidity and capital resources")
    2. Target specific 10-K sections when relevant (Item 1A Risk Factors, Item 7 MD&A, Item 8 Financial Statements, etc.)
    3. Be company-specific when beneficial (mention ticker symbols, business segments, key products)
    4. Focus on quantifiable metrics and financial data points
    5. Include temporal context (year-over-year, quarterly trends, fiscal periods)

    Format each sub-query as a standalone search that would find relevant passages in 10-K documents. Prioritize queries that would surface:
    - Financial performance metrics and KPIs
    - Risk disclosures and regulatory matters  
    - Business segment analysis and geographic revenue
    - Competitive positioning and market dynamics
    - Capital allocation and investment strategies
    - Operational challenges and growth drivers

    Example sub-query style: "NVDA data center revenue growth fiscal 2024 geographic breakdown Asia-Pacific, MSFT revenue by operating segments fiscal 2023 Item 7 MD&A"
    
    """),
        ("human", "{query}")
    ])
    response = llm.with_structured_output(QueriesExpension).invoke(
        prompt.format_prompt(query=query)
    )

    # The structured call may yield nothing usable; fall back to the original
    # question so retrieval still has something to search with.
    sub_queries = []
    if response:
        sub_queries = [s.query.strip() for s in response.queries if s.query.strip()]
    if not sub_queries:
        sub_queries = [query]

    # +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= Context retrieval
    sources = []
    seen = set()
    for sq in sub_queries:
        for chunk in retriver(sq, 5):
            # Overlapping sub-queries often hit the same chunk; keep only the
            # first occurrence so the prompt is not padded with repeats.
            key = (chunk.metadata["company"], chunk.metadata["year"], chunk.page_content)
            if key in seen:
                continue
            seen.add(key)
            sources.append(chunk)
    print("Generated Subqueries...")
    print(sub_queries)

    context = "".join(
        '{} {} filing excerpt: "{}"\n'.format(
            chunk.metadata["company"], chunk.metadata["year"], chunk.page_content
        )
        for chunk in sources
    )

    # +=+=+=+=+=+=+=+=+=+=+=+=+=   LLM invocation
    # The context is a template variable too: filing text containing "{" or "}"
    # would otherwise be parsed as placeholders and fail at format time.
    prompt_answer = ChatPromptTemplate.from_messages([
        ("system", "Please provide the answer for the user query using given context :`{context}`, answer must be grounded in context, and must provide the reasoning behind the answer."),
        ("human", "{query}")
    ])
    final_resp = llm.with_structured_output(finalanswer).invoke(
        prompt_answer.format_prompt(context=context, query=query)
    )

    if not final_resp:
        final_resp = finalanswer(answer="N/A",reasoning="N/A")

    answer_text = final_resp.answer.strip()
    sources_list = [
        {
            "company": chunk.metadata["company"],
            "year": chunk.metadata["year"],
            "excerpt": chunk.page_content.replace("\n", " "),
            "page": None,
        }
        for chunk in sources
    ]
    return {
        "query": query,
        "answer": answer_text,
        "reasoning": final_resp.reasoning,
        "sub_queries": sub_queries,
        "sources": sources_list,
    }


In [14]:
# Evaluation questions, from cross-company comparisons to single-fact lookups.
queries = [
    "Which company had the highest operating margin in 2023?",
    "How did NVIDIA's data center revenue grow from 2022 to 2023?",
    "What percentage of Google's revenue came from cloud in 2023?",
    "Compare cloud revenue growth rates across all three companies from 2022 to 2023.",
    "Compare AI investments mentioned by all three companies in their 2024 10-Ks",
    "What was MSFT total revenue in 2023?",
]
os.makedirs("./output",exist_ok = True)

for q in queries:
    result = answer_query(q)

    # One JSON file per question, named after the first 15 characters.
    # NOTE(review): questions sharing a 15-character prefix would overwrite
    # each other, and the raw prefix may contain characters that are awkward
    # in filenames - confirm this naming is intended.
    out_path = f"./output/{q[:15]}.json"
    with open(out_path, "w") as out_file:
        json.dump(result, out_file)

    print("\n+=+=+=+=+= Answer +=+=+=+=+=\n",result["answer"])
    print("\n+=+=+=+=+= Reasoning +=+=+=+=+=\n",result["reasoning"])


### Results are saved under ./output dir in desired format 

Generated Subqueries...
['GOOGL operating income total revenue fiscal 2023 Item 8 Financial Statements', 'GOOGL operating margin trends fiscal 2023 Item 7 MD&A', 'MSFT operating income total revenue fiscal 2023 Item 8 Financial Statements', 'MSFT operating margin analysis fiscal 2023 Item 7 MD&A', 'NVDA operating income total net revenue fiscal 2023 Item 8 Financial Statements', 'NVDA operating margin performance fiscal 2023 Item 7 MD&A', 'GOOGL MSFT NVDA consolidated statements of operations fiscal 2023']

+=+=+=+=+= Answer +=+=+=+=+=
 Based on the provided context, only NVIDIA's operating margin for the fiscal year ended January 29, 2023 (which corresponds to 2023) is explicitly stated as 15.6%. The operating margins for Google (GOOGL) and Microsoft (MSFT) for 2023 are not available in the given excerpts, as the necessary total revenue figures to calculate them are not provided.

+=+=+=+=+= Reasoning +=+=+=+=+=
 The NVDA 2024 filing excerpt states that for the 'Year Ended Jan 29, 202