In [14]:
"""
Step 1: Import necessary libraries
I have moved the iterations to the very end
"""
import re
from pathlib import Path
import pandas as pd
from datetime import datetime

"""
Step 2: Prepare a function to clean the raw content, removing html tags and entities
"""
def clean_html(raw_html, remove_tags=True):
    """ This function cleans the raw html content by removing html tags and entities
    - remove_tags: this is an optional parameter,
                if True, remove html tags, otherwise keep them
                by default, it is set to True.
    """
    if remove_tags:
        # remove HTML tags
        cleaned_html = re.sub(r'<.*?>', ' ', raw_html)
        # remove HTML entities
        cleaned_html = re.sub(r'&\w+;', ' ', cleaned_html)
        cleaned_html = re.sub(r"&[a-z]+;", " ", cleaned_html)

    else:
        cleaned_html = re.sub(r'(<br\s*/?>|</div>|</p>|</tr>|</li>|</table>|</td>)', '\n', raw_html, flags=re.IGNORECASE)

        cleaned_html = re.sub(r'<(?!/)[^>]+>', '', cleaned_html) # do not remove <\span>
        # cleaned_html = re.sub(r'<span>(?=\w)', '', cleaned_html)  # Removes <span> if directly before a word character without a leading space
        # cleaned_html = re.sub(r'(?<=\w)</span>', '', cleaned_html)  # Removes </span> if directly after a word character without a trailing space


    # replace HTML entities (&#160;）with space
    cleaned_html = re.sub(r'&#\d+;|nbsp', ' ', cleaned_html)
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html)

    # remove urls
    cleaned_url = re.sub(r"\(http[s]?://\S+\)", "", cleaned_html)
    cleaned_url = re.sub(r"http[s]?://\S+", "", cleaned_url)

    # remove multiple spaces
    cleaned_space = re.sub(r'\s+', ' ', cleaned_url)

    # fix the error that /s/ is splitted into / s / in GOOG_10-K_2021.html
    clean_spe = re.sub(r'/\s*S\s*/', '/s/', cleaned_space, flags=re.IGNORECASE)

    # fix the error: ALICE is splitted into multiple A LICE in MSFT_10-K_2021.html
    clean_spe = re.sub(r'(\b[A-Z])\s([A-Z]{2,}\b)', r'\1\2', clean_spe)

    # fix the error: can not find the signature section in MSFT_10-K_2021.html
    cleaned = re.sub(r'SIGNAT\s*URES', 'SIGNATURES', clean_spe, flags=re.IGNORECASE)

    return cleaned.strip()


"""
Step 3: Extract all necessary information from the cleaned text
"""
def convert_date(date_str):
    """ This is a helper function that converts date to ISO format """

    try:
        date = datetime.strptime(date_str.strip(), "%B %d, %Y").date().isoformat()
    except ValueError:
        date = re.sub(r"\s+,", ",", date_str) # e.g.: 'For the fiscal year ended December 31 , 2023'
        date = datetime.strptime(date, "%B %d, %Y").date().isoformat()
    return date


def extract_fiscal_year(text):
    """ 1. The date of the fiscal year-end (ensure it is formatted in ISO-format)

    Currently, relevant information is in the beginning of the document, e.g.:
     'For the fiscal year ended January 31, 2024, or'
     Search for the fiscal year and return it in ISO-format.

     """
    match = re.search(r"for the fiscal year ended (\w+\s\d{1,2}\s*,\s\d{4})", text, re.IGNORECASE)
    if match:
        fiscal_year_str = match.group(1)
        fiscal_year_iso = convert_date(fiscal_year_str)
        return fiscal_year_iso

    return 'N/A'


def extract_legal_proceedings(text):
    """ 2. The content of “Item 3. LEGAL PROCEEDINGS”."""

    match = re.search(r"Item 3\. Legal Proceedings(.*?)(Item 4\.|$)", text, re.IGNORECASE|re.DOTALL)
    return match.group(1).strip() if match else 'N/A'


def extract_signature_date(text):
    """ 3. The date of signature(s) (ensure it is formatted in ISO-format).

    Find the SIGNATURES section

    """
    signature_pattern = r"SIGNATURES\s*Pursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"
    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if signature_section_match:
        signature_section = signature_section_match.group(0)
        signature_date_match = re.search(r"\s*(\w+\s\d{1,2}\s*,\s*\d{4})", signature_section)
        # (?:Date:|as of | ) removed from the regex
        if signature_date_match:
            signature_date = convert_date(signature_date_match.group(1))
            return signature_date
    return 'N/A'


def extract_signers(text):
    """ 4. Who signed the report?
    If there are multiple signatures, all of them have to be listed (comma separated).
    Do not include the audit firm – in case it is given in the report."""

    positions = ['Chief Executive Officer', 'Chief Financial Officer', 'Chief Accounting Officer', 'President', 'Chairman', 'Director']

    positions_str = ('Chief', 'Officer', 'President', 'Chairman', 'Director', 'Executive', 'Financial', 'Accounting', 'Manager')

    # find the signature section
    signature_pattern = r"SIGNATURES(?:</span>)?\sPursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"

    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if not signature_section_match:
        print('No signature section found')

    if signature_section_match:
        print('found')
        signature_section = signature_section_match.group(0)

        cleaned_signers = []
        # TODO: 这个方法太粗暴了，如果四字人名就完蛋，最好的方法应该是写个新的clean text
        # TODO： 确认一下到底是list还是set
        # TODO: audit firm怎么去掉，hard-coded？
        raw_signers = re.findall(r"/s/\s*([A-Z][a-zA-Z.\-]+\s[A-Z][a-zA-Z.\-]+(?:\s[A-Z][a-zA-Z.\-]+)?)(?:</span>)?", signature_section)

        # TODO: GOOG和jnj都是找不到raw_signers，而msft和meta是连section都找不到
        if not raw_signers:
            print('section found, but no signers found')

        for signer in raw_signers:
            # 清理多余空格
            signer = signer.strip()

            # 去除职位
            for position in positions_str:
                if position in signer:
                    signer = signer.replace(position, '')

            # 检查是否有重复名字，例如 "Cesar Conde Cesar" -> "Cesar Conde"
            words = signer.split()
            if len(words) > 2 and words[-1].lower() == words[0].lower():  # 如果最后一个单词与第一个单词相同
                signer = ' '.join(words[:-1])  # 去除最后一个重复单词

            signer = signer.strip()
            # 避免重复，保持顺序
            if signer.lower() not in [s.lower() for s in cleaned_signers]:
                cleaned_signers.append(signer)

        return cleaned_signers

    return []


def process_file(file):
    clean_content_one = clean_html(file)
    clean_content_two = clean_html(file, remove_tags=False)

    fiscal_year = extract_fiscal_year(clean_content_one)
    legal_proceedings = extract_legal_proceedings(clean_content_one)
    signature_date = extract_signature_date(clean_content_one)
    signers = extract_signers(clean_content_two)

    if not signers:
        signers = extract_signers(clean_content_one)
    return {
        "fiscal_year": fiscal_year,
        "legal_proceedings": legal_proceedings,
        "signature_date": signature_date,
        "signers": signers
    }

current_directory = Path.cwd()
results = []
for html_file in current_directory.glob('*.html'):
    with html_file.open('r', encoding='utf-8') as f:
        filename = html_file.name
        raw_content = f.read()
        result = process_file(raw_content)
        result['file_name'] = filename

        if result:
            results.append(result)

df = pd.DataFrame(results)
df.to_csv('10k_results.csv', index=False)
df.head(10)

found
found
section found, but no signers found
found
found
section found, but no signers found
found
No signature section found
found
found
found
found
No signature section found
found
found
found


Unnamed: 0,fiscal_year,legal_proceedings,signature_date,signers,file_name
0,2024-01-31,I. SUPPLEMENTAL INFORMATION: The Company is in...,2024-03-15,"[C. Douglas McMillon, Gregory B. Penner, John ...",WMT_10K.html
1,2024-12-31,26,2025-02-04,"[SUNDAR PICHAI, ANAT ASHKENAZI Senior, AMIE TH...",GOOG_10K.html
2,2023-12-31,The information called for by this item is inc...,2024-02-16,"[J. Duato, J. J. Wolk, R. J. Decker, D. Adamcz...",JNJ_10K.html
3,2024-12-31,51,2025-01-29,"[Susan Li, Mark Zuckerberg, Aaron Anderson, Pe...",META_10K.html
4,2023-12-31,18,2024-02-01,"[Andrew R. Jassy, Brian T. Olsavsky, Shelley L...",AMZN_10K.html
5,2023-12-31,28,2024-02-20,"[JAMES QUINCEY, JOHN MURPHY, ERIN MAY, MARK RA...",KO_10K.html
6,2024-06-30,10,2024-08-05,"[JON R. MOELLER, ANDRE SCHULTEN, MATTHEW W. JA...",PG_10K.html
7,2024-06-30,36,2024-07-30,"[ALICE L. JOLLA, SATYA NADELLA, REID HOFFMAN, ...",MSFT_10K.html
8,2024-09-28,18,2024-11-01,"[Luca Maestri, Timothy D. Cook, Chris Kondo, W...",AAPL_10K.html
9,2024-12-31,29,2025-01-29,"[Elon Musk, Vaibhav Taneja, Robyn Denholm, Ira...",TSLA_10K.html
