In [15]:
from bs4 import BeautifulSoup
import re
from pathlib import Path
import pandas as pd
from datetime import datetime

from bs4 import BeautifulSoup
import re

def clean_html(raw_html, remove_tags=True):
    """ This function cleans the raw HTML content using BeautifulSoup.

    - If `remove_tags` is True, removes all HTML tags.
    - If `remove_tags` is False, preserves structural tags like <br>, <div>, <p>, etc.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove URLs using BeautifulSoup
    for a_tag in soup.find_all('a', href=True):
        a_tag.decompose()  # Removes the entire anchor tag

    # Decode HTML entities (e.g., &nbsp; → space, &amp; → &)
    cleaned_html = soup.get_text(separator=' ')

    if not remove_tags:
        # Preserve some structural elements
        for tag in soup.find_all(['br', 'div', 'p', 'span', 'li', 'table']):
            tag.insert_after('\n')
        cleaned_html = soup.get_text()

    # Remove extra whitespace
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html)

    # Fix /s/ splitting issue
    cleaned_html = re.sub(r'/\s*S\s*/', '/s/', cleaned_html, flags=re.IGNORECASE)

    # Fix incorrect splitting of uppercase words (e.g., "A LICE" → "ALICE")
    cleaned_html = re.sub(r'(\b[A-Z])\s([A-Z]{2,}\b)', r'\1\2', cleaned_html)

    # Fix signature section issue
    cleaned_html = re.sub(r'SIGNAT\s*URES', 'SIGNATURES', cleaned_html, flags=re.IGNORECASE)

    return cleaned_html.strip()


# The rest of the code remains the same
def convert_date(date_str):
    """ This is a helper function that converts date to ISO format """
    try:
        date = datetime.strptime(date_str.strip(), "%B %d, %Y").date().isoformat()
    except ValueError:
        date = re.sub(r"\s+,", ",", date_str) # e.g.: 'For the fiscal year ended December 31 , 2023'
        date = datetime.strptime(date, "%B %d, %Y").date().isoformat()
    return date

def find_signature_pattern(text):
    """ This is a helper function that is used to find the SIGNATURES sections"""
    signature_pattern = r"SIGNATURES\s*Pursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"
    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if signature_section_match:
        signature_section = signature_section_match.group(0)
        return signature_section

    return ''

def extract_fiscal_year(text):
    """ 1. The date of the fiscal year-end (ensure it is formatted in ISO-format)
    Currently, relevant information is in the beginning of the document, e.g.:
     'For the fiscal year ended January 31, 2024, or'
     Search for the fiscal year and return it in ISO-format.
     """
    match = re.search(r"for the fiscal year ended (\w+\s\d{1,2}\s*,\s\d{4})", text, re.IGNORECASE)
    if match:
        fiscal_year_str = match.group(1)
        fiscal_year_iso = convert_date(fiscal_year_str)
        return fiscal_year_iso

    return 'N/A'

def extract_legal_proceedings(text):
    """ 2. The content of “Item 3. LEGAL PROCEEDINGS”."""
    match = re.search(r"Item 3\. Legal Proceedings(.*?)(Item 4\.|$)", text, re.IGNORECASE|re.DOTALL)
    return match.group(1).strip() if match else 'N/A'

def extract_signature_date(text):
    """ 3. The date of signature(s) (ensure it is formatted in ISO-format)."""
    # find the signature section
    signature_section = find_signature_pattern(text)
    if signature_section:
        signature_date_match = re.search(r"\s*(\w+\s\d{1,2}\s*,\s*\d{4})", signature_section)
        if signature_date_match:
            signature_date = convert_date(signature_date_match.group(1))
            return signature_date
    return 'N/A'

def extract_signers(text):
    """ 4. Who signed the report?
    If there are multiple signatures, all of them have to be listed (comma separated).
    Do not include the audit firm – in case it is given in the report.
    """
    # find the signature section
    signature_section = find_signature_pattern(text)
    if signature_section:
        cleaned_signers = []
        raw_signers = re.findall(r"/s/\s*([A-Z][a-zA-Z.\-]+\s[A-Z][a-zA-Z.\-]+(?:\s[A-Z][a-zA-Z.\-]+)?)", signature_section)
        for signer in raw_signers:
            signer = signer.strip()
            words = signer.split()
            # remove the html tag I initially kept
            if words[-1] == '</td>' or words[-1] == '</tr>':
                signer = ' '.join(words[:-2])
            # avoid duplicate occurrences of the same signer
            if signer.lower() not in [s.lower() for s in cleaned_signers]:
                cleaned_signers.append(signer)
        return cleaned_signers
    return []

def process_file(file):
    """ This function processes the raw content of a file and extracts the necessary information """
    clean_content_one = clean_html(file)
    clean_content_two = clean_html(file, remove_tags=False)

    fiscal_year = extract_fiscal_year(clean_content_one)
    legal_proceedings = extract_legal_proceedings(clean_content_one)
    signature_date = extract_signature_date(clean_content_one)
    signers = extract_signers(clean_content_two)

    if not signers:
        signers = extract_signers(clean_content_one)
    return {
        "fiscal_year": fiscal_year,
        "legal_proceedings": legal_proceedings,
        "signature_date": signature_date,
        "signers": signers
    }

current_directory = Path.cwd()
results = []
for html_file in current_directory.glob('*.html'):
    with html_file.open('r', encoding='utf-8') as f:
        filename = html_file.name
        raw_content = f.read()
        result = process_file(raw_content)
        result['file_name'] = filename

        if result:
            results.append(result)

df = pd.DataFrame(results)
df.to_csv('10k_results.csv', index=False)
df.head(10)

Unnamed: 0,fiscal_year,legal_proceedings,signature_date,signers,file_name
0,2024-01-31,I. SUPPLEMENTAL INFORMATION: The Company is in...,2024-03-15,"[C. Douglas McMillon, Gregory B. Penner, John ...",WMT_10K.html
1,2024-12-31,For a description of our material pending lega...,2025-02-04,"[SUNDAR PICHAI Sundar, SUNDAR PICHAI Chief, AN...",GOOG_10K.html
2,2023-12-31,The information called for by this item is inc...,2024-02-16,"[J. Duato J., J. Duato Chairman, J. J. Wolk, R...",JNJ_10K.html
3,2024-12-31,As a multinational company with a complex and ...,2025-01-29,"[Susan Li Susan, Mark Zuckerberg Chairman, Sus...",META_10K.html
4,2023-12-31,"See Item 8 of Part II, “Financial Statements a...",2024-02-01,"[Andrew R. Jassy, Brian T. Olsavsky, Shelley L...",AMZN_10K.html
5,2023-12-31,” of this report. Increased or new indirect ta...,2024-02-20,"[JAMES QUINCEY James, JAMES QUINCEY, JOHN MURP...",KO_10K.html
6,2024-06-30,". The Company is subject, from time to time, t...",2024-08-05,"[JON R. MOELLER, ANDRE SCHULTEN, MATTHEW W. JA...",PG_10K.html
7,2024-06-30,Refer to Note 15 – Contingencies of the Notes ...,2024-07-30,"[ALICE L. JOLLA, SATYA NADELLA Chairman, REID ...",MSFT_10K.html
8,2024-09-28,Digital Markets Act Investigations On March 25...,2024-11-01,"[Luca Maestri Luca, Timothy D. Cook, Luca Maes...",AAPL_10K.html
9,2024-12-31,For a description of our material pending lega...,2025-01-29,"[Elon Musk Elon, Elon Musk Chief, Vaibhav Tane...",TSLA_10K.html
