In [3]:
# %pip install secedgar

In [9]:
import pandas as pd
import numpy as np
import requests

In [127]:
# Request header passed to every HTTP call in this notebook;
# the User-Agent value carries a contact e-mail address.
headers = {"User-Agent": "Kevin.Peterson@bainbridge.com"}

# Company to look up and the kind of periodic report to pull.
ticker = "NVDA"
filing_type = "10-Q"  # supported values: "10-Q" or "10-K"

# Row label used to pick a filing from the filtered filings table (0 = first row).
latest_nth_report = 0

In [128]:
# filing = WebScrape(headers, ticker, filing_type, latest_nth_report)
# final_text = filing.extract_mda_section()

In [None]:
# Download the SEC's ticker -> CIK mapping for all companies.
companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers
    )
# Surface HTTP errors here instead of as an opaque JSON decoding error below.
companyTickers.raise_for_status()

# One row per record of the JSON payload (the payload's keys become the index).
ticker_json = pd.DataFrame.from_dict(companyTickers.json(), orient='index')

# add leading zeros to CIK (the submissions URL built later embeds the 10-character form)
ticker_json['cik_str'] = ticker_json['cik_str'].astype(str).str.zfill(10)
ticker_cik_dic = dict(zip(ticker_json['ticker'], ticker_json['cik_str']))

# Same exception type as a plain dict lookup, but with an actionable message.
if ticker not in ticker_cik_dic:
    raise KeyError(f"Ticker {ticker!r} not found in SEC company_tickers.json")
cik = ticker_cik_dic[ticker]

In [130]:
# Download the submission metadata (filing index) for the selected company.
filingMetadata = requests.get(
    f'https://data.sec.gov/submissions/CIK{cik}.json',
    headers=headers
    )
# Surface HTTP errors here instead of as a JSON decoding error below.
filingMetadata.raise_for_status()

recent_forms = pd.DataFrame.from_dict(
             filingMetadata.json()['filings']['recent']
             )
# The archive URL built below uses the accession number without dashes.
recent_forms['accessionNumber'] = recent_forms['accessionNumber'].str.replace("-","")

# sort_values returns a new frame, so the result must be kept; the index is
# reset AFTER sorting so that label 0 is the most recent report date.
# kind='stable' keeps the original relative order of rows with equal dates.
target_forms = (
    recent_forms.loc[recent_forms.form == filing_type]
    .sort_values('reportDate', ascending=False, kind='stable')
    .reset_index(drop=True)
)

# Same exception type .loc would raise, but with an actionable message.
if latest_nth_report not in target_forms.index:
    raise KeyError(
        f"No {filing_type} filing at position {latest_nth_report}; "
        f"{len(target_forms)} available for {ticker}"
    )

accesion_number = target_forms.loc[latest_nth_report,'accessionNumber']
document_suffix = target_forms.loc[latest_nth_report,'primaryDocument']
url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accesion_number}/{document_suffix}'
# Download the primary document of the selected filing.
single_filing = requests.get(url, headers=headers)
single_filing.raise_for_status()
filing_summary_response = single_filing.content.decode("utf-8")

In [131]:
import re
import requests
from bs4 import BeautifulSoup

# def extract_table_sections_from_response(response):
#     """
#     Uses response.content (with the lxml parser) to preserve style attributes,
#     then finds all <div> elements whose style attribute matches the regular expression for "margin-bottom:1pt".
#     For each matching div, it extracts its inner HTML (all direct children) as an HTML fragment and removes the div.
    
#     Args:
#         response (requests.Response): The HTTP response from requests.get.
    
#     Returns:
#         tuple: (modified_soup, list_of_table_section_fragments)
#             - modified_soup: a BeautifulSoup object (with the extracted divs removed)
#             - list_of_table_section_fragments: a list of HTML strings representing each extracted table section.
#     """
#     import warnings
#     warnings.filterwarnings("ignore")
    
#     # Use the lxml parser to better preserve style attributes.
#     soup = BeautifulSoup(response.content, "lxml")
#     table_sections = []
#     candidate_divs = []
#     margin_bottom_re = re.compile(r"margin-bottom\s*:\s*1pt;?", re.IGNORECASE)
    
#     all_divs = soup.find_all("div")
#     for div in all_divs:
#         style = div.get("style", "")
#         if style and margin_bottom_re.search(style):
#             candidate_divs.append(div)
    
#     # For each candidate, extract its inner HTML and remove the div.
#     for idx, div in enumerate(candidate_divs, start=1):
#         inner_xml = div.decode_contents().strip()
#         table_sections.append(inner_xml)
#         div.decompose()
    
#     return soup, table_sections

def extract_table_sections_from_response(response):
    """
    Strips styled tables out of an HTML response, keeping the ones that look like section headings.

    The response body is parsed with the lxml parser and every <table> is visited.
    A table whose ``style`` attribute contains a ``margin-bottom: <N>pt`` declaration
    is removed from the tree and its inner HTML is collected, unless one of its
    <span> elements mentions an "Item <digit>" heading or
    "Management’s Discussion and Analysis" -- those tables are left in place.

    Args:
        response: Object exposing the raw HTML as ``.content`` (e.g. a requests.Response).

    Returns:
        tuple: (soup, fragments)
            - soup: the parsed document with the matching tables removed.
            - fragments: list of inner-HTML strings, one per removed table.
    """
    style_marker = re.compile(r"margin-bottom\s*:\s*\d+pt", re.IGNORECASE)
    heading_marker = re.compile(r"Item\s*\d|Management’s Discussion and Analysis", re.IGNORECASE)

    document = BeautifulSoup(response.content, "lxml")
    removed_fragments = []

    for candidate in document.find_all("table"):
        # Only tables carrying the margin-bottom style are candidates for removal.
        if not style_marker.search(candidate.get("style", "")):
            continue

        # A heading phrase in any span protects the table from removal.
        span_texts = (span.get_text() for span in candidate.find_all("span"))
        if any(heading_marker.search(span_text) for span_text in span_texts):
            continue

        removed_fragments.append(candidate.decode_contents().strip())
        candidate.decompose()

    return document, removed_fragments


def find_last_div_containing_text(soup, text_pattern):
    """
    Returns the final styled <div> whose text matches a regex, in document order.

    Only <div> elements that carry a ``style`` attribute are considered, and the
    pattern is applied case-insensitively to each div's full text.

    Args:
        soup (BeautifulSoup): The parsed HTML document.
        text_pattern (str): The regex pattern to search for.

    Returns:
        BeautifulSoup element or None: The last matching div, or None if no div matches.
    """
    last_hit = None
    for candidate in soup.find_all("div", style=True):
        if re.search(text_pattern, candidate.get_text(), re.IGNORECASE):
            # Keep overwriting so the value left at the end is the last match.
            last_hit = candidate
    return last_hit

def extract_text_until_next_item(start_div, pattern):
    """
    Collects the text of start_div and of its following sibling <div>s up to a stop marker.

    The starting div is always included. Siblings are then walked in order and
    their stripped text appended until one matches ``pattern`` (case-insensitive);
    that sibling is excluded. The pieces are joined with single spaces and
    non-breaking spaces (``\\xa0``) are deleted from the result.

    Args:
        start_div (BeautifulSoup element): The starting <div> (e.g., the MD&A header).
        pattern (str): Regex pattern that marks the stopping point (e.g., the next item header).

    Returns:
        str: The concatenated text, or "Desired section not found." when start_div is falsy.
    """
    if not start_div:
        return "Desired section not found."

    pieces = [start_div.get_text(strip=True)]
    node = start_div.find_next_sibling("div")
    # Stop when the siblings run out or the next section header is reached.
    while node and not re.search(pattern, node.get_text(), re.IGNORECASE):
        pieces.append(node.get_text(strip=True))
        node = node.find_next_sibling("div")

    joined = " ".join(pieces).strip()
    return joined.replace("\xa0", "")

def retain_text_after_last_occurrence(text, pattern):
    """
    Drops everything before the last occurrence of a marker, keeping the marker itself.

    The pattern is matched case-insensitively. The returned text starts at the
    beginning of the final match, so the marker (e.g. the "Item 2" heading) stays
    at the front of the result regardless of how long the matched text is.

    Args:
        text (str): The full extracted text.
        pattern (str): Regex pattern to locate the marker.

    Returns:
        str: The text from the last occurrence of the marker onward, stripped of
        surrounding whitespace; the unchanged input when the marker never occurs.
    """
    # Walk the matches and remember only the final one.
    last_match = None
    for last_match in re.finditer(pattern, text, re.IGNORECASE):
        pass

    if last_match is None:
        return text

    # Slice from the match start rather than a fixed offset back from its end:
    # a fixed offset truncates markers longer than the offset and can produce a
    # negative index (which wraps to the tail of the text) for shorter ones.
    return text[last_match.start():].strip()

In [132]:
# Determine the regex patterns based on the form type:
#   current_item_pattern - header of the MD&A section
#   next_item_pattern    - header of the section that follows it (stop marker)
#   last_item_pattern    - item marker used to trim leading text
selected_form = target_forms.loc[latest_nth_report, 'form']
if selected_form == '10-Q':
    current_item_pattern = r"Item\s*2[^\w]+Management.*?Discussion.*?and.*?Analysis"
    next_item_pattern = r"Item\s*3"
    last_item_pattern = r"Item\s*2"
elif selected_form == '10-K':
    current_item_pattern = r"Item\s*7[^\w]+Management.*?Discussion.*?and.*?Analysis"
    next_item_pattern = r"Item\s*8"
    last_item_pattern = r"Item\s*7"
else:
    # Without this branch the patterns would be undefined and the steps below
    # would fail later with a NameError.
    raise ValueError(
        f"Unsupported form type {selected_form!r}; expected '10-Q' or '10-K'"
    )

# Fetch and parse the document
try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"Error fetching URL: {e}")
    # Re-raise: continuing without a response would only fail on the next
    # statement with a less informative error.
    raise

# Step 1: Extract table sections and remove them from the HTML
modified_soup, table_fragments = extract_table_sections_from_response(response)

# Step 2: Find the last <div> containing the MD&A header text.
last_item_div = find_last_div_containing_text(modified_soup, current_item_pattern)

# Step 3: Extract text from the found div until the next item (e.g., "Item 3")
extracted_text = extract_text_until_next_item(last_item_div, next_item_pattern)

# Step 4: Trim the text relative to the last occurrence of the item marker
final_text = retain_text_after_last_occurrence(extracted_text, last_item_pattern)



In [None]:

# Show the extracted section text as this cell's output.
final_text

In [98]:
# Download the submission metadata (filing index) for the selected company.
filingMetadata = requests.get(
    f'https://data.sec.gov/submissions/CIK{cik}.json',
    headers=headers
    )
# Surface HTTP errors here instead of as a JSON decoding error below.
filingMetadata.raise_for_status()

# dictionary to dataframe: one column per key of the 'recent' filings payload
allForms = pd.DataFrame.from_dict(
             filingMetadata.json()['filings']['recent']
             )





# # filing metadata
# companyFacts.json()['facts']['dei'][
#     'EntityCommonStockSharesOutstanding']
# companyFacts.json()['facts']['dei'][
#     'EntityCommonStockSharesOutstanding'].keys()
# companyFacts.json()['facts']['dei'][
#     'EntityCommonStockSharesOutstanding']['units']
# companyFacts.json()['facts']['dei'][
#     'EntityCommonStockSharesOutstanding']['units']['shares']
# companyFacts.json()['facts']['dei'][
#     'EntityCommonStockSharesOutstanding']['units']['shares'][0]

# # concept data // financial statement line items
# companyFacts.json()['facts']['us-gaap']
# companyFacts.json()['facts']['us-gaap'].keys()

# # different amounts of data available per concept
# companyFacts.json()['facts']['us-gaap']['AccountsPayable']
# companyFacts.json()['facts']['us-gaap']['Revenues']
# companyFacts.json()['facts']['us-gaap']['Assets']

# # get company concept data
# companyConcept = requests.get(
#     (
#     f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}'
#      f'/us-gaap/Assets.json'
#     ),
#     headers=headers
#     )

# # review data
# companyConcept.json().keys()
# companyConcept.json()['units']
# companyConcept.json()['units'].keys()
# companyConcept.json()['units']['USD']
# companyConcept.json()['units']['USD'][0]

# # parse assets from single filing
# companyConcept.json()['units']['USD'][0]['val']

# # get all filings data 
# assetsData = pd.DataFrame.from_dict((
#                companyConcept.json()['units']['USD']))

# # review data
# assetsData.columns
# assetsData.form

# # get assets from 10Q forms and reset index
# assets10Q = assetsData[assetsData.form == '10-Q']
# assets10Q = assets10Q.reset_index(drop=True)

# # plot 
# assets10Q.plot(x='end', y='val')