In [1]:
!pip install sec_api
!pip install requests beautifulsoup4 lxml

Collecting sec_api
  Downloading sec_api-1.0.32-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading sec_api-1.0.32-py3-none-any.whl (24 kB)
Installing collected packages: sec_api
Successfully installed sec_api-1.0.32


In [2]:
# Final version

import requests
import re
from bs4 import BeautifulSoup, NavigableString
from collections import defaultdict

# --- Configuration ---
# Request headers sent with every SEC request made by get_latest_10q().
# The SEC EDGAR API requires a custom User-Agent header.
# Replace 'YourAppName' and 'youremail@example.com' with your information
# before running the notebook.
HEADERS = {'User-Agent': 'YourAppName youremail@example.com'}
# URL of the SEC's ticker mapping file (JSON); each entry is read for its
# 'ticker' and 'cik_str' fields to resolve a ticker symbol to a CIK.
CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

def get_latest_10q(ticker: str, timeout: float = 30.0) -> str:
    """
    Fetches the HTML content of the latest 10-Q filing for a given stock ticker.

    The lookup is done in three HTTP requests: the SEC ticker-to-CIK mapping,
    the company's submission history, and finally the primary document of the
    first 10-Q listed in the 'recent' filings.

    Args:
        ticker: The stock ticker symbol (e.g., 'AAPL', 'MSFT'). Matching is
            case-insensitive and ignores surrounding whitespace.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout, `requests` can block indefinitely on a
            stalled connection.

    Returns:
        The HTML content of the latest 10-Q filing as a string.

    Raises:
        ValueError: If the ticker is not found or no 10-Q filing is available.
        requests.exceptions.RequestException: For network-related errors,
            including timeouts and non-2xx responses.
    """
    # Normalise once instead of on every loop iteration.
    symbol = ticker.strip().upper()

    print(f"1. Fetching CIK for ticker: {ticker}...")
    # Get the CIK mapping from the SEC
    response = requests.get(CIK_MAP_URL, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    company_data = response.json()

    # Find the CIK for the given ticker
    cik = None
    for company in company_data.values():
        if company['ticker'] == symbol:
            # The submissions endpoint expects the CIK zero-padded to 10 digits.
            cik = str(company['cik_str']).zfill(10)
            break

    if not cik:
        raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")

    print(f"   Found CIK: {cik}")

    # Fetch the company's submission history
    print("2. Fetching submission history from SEC EDGAR...")
    submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    response = requests.get(submissions_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    submissions = response.json()

    # 'recent' holds parallel lists (form, accessionNumber, ...) indexed by filing.
    recent = submissions['filings']['recent']

    # Find the latest 10-Q filing: the first '10-Q' entry in the list is used.
    latest_10q = None
    for i, form in enumerate(recent['form']):
        if form == '10-Q':
            latest_10q = {
                # The archive path uses the accession number without dashes.
                'accession_number': recent['accessionNumber'][i].replace('-', ''),
                'primary_document': recent['primaryDocument'][i],
                'date': recent['filingDate'][i]
            }
            break

    if not latest_10q:
        raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")

    print(f"   Found latest 10-Q filed on: {latest_10q['date']}")

    # Construct the URL for the 10-Q HTML document
    filing_url = (
        f"https://www.sec.gov/Archives/edgar/data/{cik}/"
        f"{latest_10q['accession_number']}/{latest_10q['primary_document']}"
    )

    # Fetch the filing's HTML content
    print(f"3. Fetching 10-Q document from: {filing_url}")
    response = requests.get(filing_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()

    print("   Successfully fetched document.")
    return response.text

def _normalize_header_text(text: str) -> str | None:
    """Normalizes header text to a standard format (e.g., 'PART I', 'ITEM 1A')."""
    text = text.strip().upper()

    # Match "PART I" or "PART II"
    part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
    if part_match:
        return re.sub(r'\s+', ' ', part_match.group(1))

    # Match "ITEM 1", "ITEM 1A", etc.
    item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
    if item_match:
        return re.sub(r'\s+', ' ', item_match.group(1))

    return None

def _parse_html_table(table_tag: BeautifulSoup) -> str:
    """Converts a BeautifulSoup table Tag into a Markdown formatted string."""
    markdown_rows = []

    # Process all rows in the table
    for tr in table_tag.find_all('tr'):
        # Get all cells (th and td) in the row, clean up whitespace and newlines
        cells = [" ".join(cell.get_text(strip=True).split()) for cell in tr.find_all(['td', 'th'])]
        if any(cells):  # Only add rows that have some content
            markdown_rows.append(cells)

    if not markdown_rows:
        return ""

    # Convert rows to Markdown table format
    md_output = []
    # Header row
    header = markdown_rows[0]
    md_output.append("| " + " | ".join(header) + " |")
    # Separator
    md_output.append("| " + " | ".join(['---'] * len(header)) + " |")
    # Body rows
    for row in markdown_rows[1:]:
        # Pad row if it has fewer columns than the header (handles simple colspan)
        while len(row) < len(header):
            row.append("")
        # Truncate if it has more (rare)
        row = row[:len(header)]
        md_output.append("| " + " | ".join(row) + " |")

    return "\n" + "\n".join(md_output) + "\n"


def parse_10q(html_content: str) -> dict:
    """
    Parses the HTML of a 10-Q filing to extract Parts and Items.

    Header candidates are short (<= 100 characters) <p>, <b>, <strong> and
    <div> elements whose text starts with a Part or Item label and that are
    not nested inside a link. The content of an Item is everything between
    its header and the next detected header, in document order: free text
    is collected line by line and top-level tables are rendered as Markdown.

    Args:
        html_content: The HTML content of the 10-Q filing.

    Returns:
        A dictionary structured with Parts and Items of the 10-Q report,
        shaped as {part_key: {item_key: content}}. When the same Item key is
        found more than once under a Part, the last occurrence wins. Items
        seen before any Part header are filed under 'PART I'. An empty dict
        is returned when no header is detected.
    """
    print("4. Parsing HTML content...")
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all potential header elements in the document
    potential_headers = soup.find_all(['p', 'b', 'strong', 'div'])

    # Filter for valid headers, ignoring table of contents links.
    # Uniqueness is not enforced here so that documents where headers
    # appear in a TOC and then again in the body are still handled.
    doc_headers = []
    for header in potential_headers:
        text = header.get_text(strip=True)
        if len(text) > 100:  # Skip long paragraphs
            continue

        normalized_key = _normalize_header_text(text)
        if normalized_key:
            # Ignore headers that are part of a link (likely TOC)
            if not header.find_parent('a'):
                doc_headers.append({'tag': header, 'key': normalized_key})

    if not doc_headers:
        print("   Warning: Could not find any standard Part/Item headers.")
        return {}

    parsed_data = defaultdict(lambda: defaultdict(str))
    current_part_key = None

    for i, header_info in enumerate(doc_headers):
        current_key = header_info['key']

        if 'PART' in current_key:
            current_part_key = current_key
            continue

        if 'ITEM' in current_key:
            if not current_part_key:
                current_part_key = "PART I" # Default to Part I if an item appears first

            start_node = header_info['tag']
            end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None

            content_parts = []
            element = start_node.next_element

            # Compare by identity, not equality: BeautifulSoup tags compare
            # equal when their name, attributes and contents match, so `!=`
            # could end the walk early at an unrelated but structurally
            # identical tag (e.g. a repeated header line).
            while element is not None and element is not end_node:
                # If it's text, add it, but only if it's not inside a table.
                if isinstance(element, NavigableString):
                    if not element.find_parent('table'):
                        text = element.strip()
                        if text:
                            content_parts.append(text)

                # If it's a table, parse it as a whole unit, but only if it's not nested.
                elif element.name == 'table':
                    if not element.find_parent('table'):
                        table_markdown = _parse_html_table(element)
                        if table_markdown:
                            content_parts.append(table_markdown)

                element = element.next_element

            # Join content with newlines to preserve table formatting
            full_content = "\n".join(content_parts)
            # Collapse excess newlines but preserve paragraph breaks
            clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()

            # Overwrite content for the key. This ensures the version from the
            # document body (which comes later and has content) replaces any
            # entry from the table of contents.
            parsed_data[current_part_key][current_key] = clean_content

    print("   Parsing complete.")
    # Convert defaultdicts to regular dicts for cleaner output
    return {part: dict(items) for part, items in parsed_data.items()}


def print_10q_summary(parsed_data: dict, content_length: int = 500):
    """
    Prints a summary of the parsed 10-Q data.

    Parts are printed in sorted key order; within a part, items are ordered
    by their item number and then by the full key (Item 1, Item 1A, Item 2).
    Each item shows at most `content_length` characters of its content, with
    a trailing "..." only when the content was actually cut short.

    Args:
        parsed_data: The dictionary returned by the parse_10q function.
        content_length: The number of characters to display for each item's content.
    """
    print("\n--- 10-Q Report Summary ---")
    if not parsed_data:
        print("No data was parsed from the document.")
        return

    def _item_sort_key(item_key):
        """Orders items by number, then key; keys without a number sort last."""
        number = re.search(r'\d+', item_key)
        return (int(number.group()) if number else float('inf'), item_key)

    # Sort parts (Part I before Part II)
    for part in sorted(parsed_data):
        items = parsed_data[part]
        print(f"\n====================\n{part}\n====================")
        if not items:
            print("  (No items found for this part)")
            continue

        for item in sorted(items, key=_item_sort_key):
            content = items[item]
            print(f"\n--- {item} ---")
            summary = content[:content_length].strip()
            if not summary:
                print("  (No content extracted for this item)")
            elif len(content) > content_length:
                # The ellipsis signals that more text exists beyond the preview.
                print(f"{summary}...")
            else:
                print(summary)
    print("\n--- End of Summary ---")


if __name__ == "__main__":
    # --- Main Execution ---
    # Fetches, parses and summarises the latest 10-Q for one ticker. The names
    # assigned here are module-level, so `report_data` stays available to the
    # cells that follow for closer inspection.
    # Specify the ticker you want to analyze
    target_ticker = 'GOOGL' # Change this to another ticker symbol, e.g. 'AAPL' or 'MSFT'.

    try:
        # Step 1: Get the latest 10-Q filing's HTML
        html = get_latest_10q(target_ticker)

        # Step 2: Parse the HTML into a structured dictionary
        report_data = parse_10q(html)

        # Step 3: Print a summary of the parsed content
        print_10q_summary(report_data)

    # Raised by get_latest_10q when the ticker is unknown or has no 10-Q.
    except ValueError as e:
        print(f"\nError: {e}")
    # Any failure of the HTTP requests made to the SEC (including bad status codes).
    except requests.exceptions.RequestException as e:
        print(f"\nNetwork Error: Failed to fetch data from the SEC. Details: {e}")
    # Catch-all: only the message is printed, so the traceback is not shown.
    # NOTE: if any handler runs, `report_data` may be left undefined (or stale
    # from an earlier run) for later cells.
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

1. Fetching CIK for ticker: GOOGL...
   Found CIK: 0001652044
2. Fetching submission history from SEC EDGAR...
   Found latest 10-Q filed on: 2025-07-24
3. Fetching 10-Q document from: https://www.sec.gov/Archives/edgar/data/0001652044/000165204425000062/goog-20250630.htm
   Successfully fetched document.
4. Parsing HTML content...
   Parsing complete.

--- 10-Q Report Summary ---

PART I

--- ITEM 1 ---
ITEM 1.
FINANCIAL STATEMENTS
Alphabet Inc.
CONSOLIDATED BALANCE SHEETS
(in millions, except par value per share amounts)

|  | As ofDecember 31, 2024 |  | As ofJune 30, 2025 |
| --- | --- | --- | --- |
|  |  |  | (unaudited) |
| Assets |  |  |  |
| Current assets: |  |  |  |
| Cash and cash equivalents | $ | 23,466 |  |
| Marketable securities | 72,191 |  |  |
| Total cash, cash equivalents, and marketable securities | 95,657 |  |  |
| Accounts receivable, net | 52,340 |  |  |
| Other current as...

--- ITEM 2 ---
ITEM 2.
MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND 

In [4]:
# Show the full extracted text of Part II, Item 2. Depends on `report_data`
# having been assigned by the main-execution cell above; run that cell first.
report_data['PART II']['ITEM 2']

"ITEM 2.\nUNREGISTERED SALES OF EQUITY SECURITIES AND USE OF PROCEEDS\nIssuer Purchases of Equity Securities\nThe following table presents information with respect to Alphabet's repurchases of Class A and Class C stock during the quarter ended June\xa030, 2025.\n\n| Period |  | Total Number of Class A Shares Purchased(in thousands)(1) |  | Total Number of Class C Shares Purchased(in thousands)(1) |  | Average Price Paid per Class A Share(2) |  | Average Price Paid per Class C Share(2) |  | Total Number of Shares Purchased as Part of Publicly Announced Programs(in thousands)(1) |  | Approximate Dollar Value of Shares that May Yet Be Purchased Under the Program(in millions) |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| April 1 - 30 |  | 8,471 |  |  | 24,689 |  |  | $ | 155.60 |  |  | $ |\n| May 1 - 31 |  | 4,529 |  |  | 22,447 |  |  | $ | 164.95 |  |  | $ |\n| June 1 - 30 |  | 2,710 |  |  | 17,853 |  |  | $ | 174.18 |  |  | $ |\n| Total |  | 15,710