In [None]:

def extract_legal_proceedings2(text):
    """ 2. The content of “Item 3. LEGAL PROCEEDINGS”."""
    # TODO: Not sure which implementation is correct
    legal_proceedings = re.search(r"Item 3. LEGAL PROCEEDINGS(.*?)Item 4", text, re.IGNORECASE|re.DOTALL).group(1)

    # legal_proceedings = re.search(r"Item 3\. Legal Proceedings(.*?)(Item 4\.|$)", text, re.DOTALL).group(1).strip()
    return legal_proceedings

def extract_signature_date2(text):
    """ 3. The date of signature(s) (ensure it is formatted in ISO-format)."""
    # TODO: Not sure which implementation is correct
    # signature_date = re.search(r"SIGNATURES(.*?)</html>", text, re.IGNORECASE|re.DOTALL).group(1)

    signature_date = re.search(r"Date:\s*(\d{4}-\d{2}-\d{2})", text).group(1)

    return signature_date

def extract_signatures2(text):
    """ 4. Who signed the report? If there are multiple signatures, all of them have to be listed (comma separated).
    Do not include the audit firm – in case it is given in the report."""
    # TODO: Not sure which implementation is correct
    signers = re.findall(r"(?:By|Signed by):\s*([A-Za-z ]+)\n", text)
    # signers = ", ".join([s.strip() for s in signers if "audit" not in s.lower()])

    signatures = re.findall(r"SIGNATURES(.*?)</html>", text, re.IGNORECASE|re.DOTALL)
    return signatures

In [33]:
"""
Step 1: Import necessary libraries
I have moved the iterations to the very end
"""
import re
from pathlib import Path
import pandas as pd
from datetime import datetime

"""
Step 2: Prepare a function to clean the raw content, removing html tags and entities
"""
def clean_html(raw_html):
    # remove HTML tags
    clean_one = re.sub(r'<.*?>', ' ', raw_html)

    # remove HTML entities
    clean_two = re.sub(r'&\w+;', ' ', clean_one)
    clean_three = re.sub(r"&[a-z]+;", " ", clean_two)

    # replace HTML entities (&#160;）with space
    clean_three = re.sub(r'&#\d+;|nbsp', ' ', clean_three)
    clean_four = re.sub(r'\s+', ' ', clean_three)

    # Fix the error: ALICE is splitted into multiple A LICE in MSFT_10-K_2021.html

    clean = re.sub(r'(\b[A-Z])\s([A-Z]{2,}\b)', r'\1\2', clean_four)
    # Fix the error: can not find the signature section in MSFT_10-K_2021.html
    text = re.sub(r'SIGNAT\s*URES', 'SIGNATURES', clean, flags=re.IGNORECASE)
    return text.strip()


"""
Step 3: Extract all necessary information from the cleaned text
"""
def convert_date(date_str):
    """ This is a helper function that converts date to ISO format """

    try:
        date = datetime.strptime(date_str.strip(), "%B %d, %Y").date().isoformat()
    except ValueError:
        date = re.sub(r"\s+,", ",", date_str) # e.g.: 'For the fiscal year ended December 31 , 2023'
        date = datetime.strptime(date, "%B %d, %Y").date().isoformat()
    return date


def extract_fiscal_year(text):
    """ 1. The date of the fiscal year-end (ensure it is formatted in ISO-format)

    Currently, relevant information is in the beginning of the document, e.g.:
     'For the fiscal year ended January 31, 2024, or'
     Search for the fiscal year and return it in ISO-format.

     """
    match = re.search(r"for the fiscal year ended (\w+\s\d{1,2}\s*,\s\d{4})", text, re.IGNORECASE)
    if match:
        fiscal_year_str = match.group(1)
        fiscal_year_iso = convert_date(fiscal_year_str)
        return fiscal_year_iso

    return 'N/A'


def extract_legal_proceedings(text):
    """ 2. The content of “Item 3. LEGAL PROCEEDINGS”."""

    match = re.search(r"Item 3\. Legal Proceedings(.*?)(Item 4\.|$)", text, re.IGNORECASE|re.DOTALL)
    return match.group(1).strip() if match else 'N/A'


def extract_signature_date(text):
    """ 3. The date of signature(s) (ensure it is formatted in ISO-format).

    Find the SIGNATURES section

    """
    signature_pattern = r"SIGNATURES\sPursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"
    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if signature_section_match:
        signature_section = signature_section_match.group(0)
        signature_date_match = re.search(r"\s*(\w+\s\d{1,2}\s*,\s*\d{4})", signature_section)
        # (?:Date:|as of | ) removed from the regex
        if signature_date_match:
            signature_date = convert_date(signature_date_match.group(1))
            return signature_date
    return 'N/A'


def extract_signers(text):
    """ 4. Who signed the report?
    If there are multiple signatures, all of them have to be listed (comma separated).
    Do not include the audit firm – in case it is given in the report."""

    # find the signature section
    signature_pattern = r"SIGNATURES\sPursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"
    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if signature_section_match:
        signature_section = signature_section_match.group(0)
        print(signature_section)
        # find all the signers, put them in a list, join them with a comma, and exclude the audit firm
        # remember to eliminate duplicates

        # TODO: 这个方法太粗暴了，如果四字人名就完蛋，最好的方法应该是写个新的clean text
        # TODO： 确认一下到底是list还是set
        # TODO: audit firm怎么去掉，hard-coded？
        raw_signers = re.findall(r"/s/\s*([A-Z][a-zA-Z.\-]+\s[A-Z][a-zA-Z.\-]+(?:\s[A-Z][a-zA-Z.\-]+)?)", signature_section)

        cleaned_signers = []

        for signer in raw_signers:
            # 清理多余空格
            signer = signer.strip()

            # 检查是否有重复名字，例如 "Cesar Conde Cesar" -> "Cesar Conde"
            words = signer.split()
            if len(words) > 2 and words[-1] == words[0]:  # 如果最后一个单词与第一个单词相同
                signer = ' '.join(words[:-1])  # 去除最后一个重复单词

            # 避免重复，保持顺序
            if signer not in cleaned_signers:
                cleaned_signers.append(signer)

        return cleaned_signers

    return []


def process_file(file):
    clean_content = clean_html(file)

    fiscal_year = extract_fiscal_year(clean_content)
    legal_proceedings = extract_legal_proceedings(clean_content)
    signature_date = extract_signature_date(clean_content)
    signers = extract_signers(clean_content)
    return {
        "fiscal_year": fiscal_year,
        "legal_proceedings": legal_proceedings,
        "signature_date": signature_date,
        "signers": signers
    }

current_directory = Path.cwd()
results = []
for html_file in current_directory.glob('*.html'):
    with html_file.open('r', encoding='utf-8') as file:
        print(f"Processing {html_file}")
        raw_content = file.read()
        result = process_file(raw_content)
        if result:
            results.append(result)

df = pd.DataFrame(results)
df.to_csv('10k_results.csv', index=False)
df.head(10)

Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/WMT_10K.html
SIGNATURES Pursuant to the requirements of Section 13 or 15(d) of the Securities Exchange Act of 1934, the registrant has duly caused this report to be signed on its behalf by the undersigned, thereunto duly authorized. Walmart Inc. Date: March 15, 2024 By /s/ C. Douglas McMillon C. Douglas McMillon President and Chief Executive Officer Pursuant to the requirements of the Securities Exchange Act of 1934, this report has been signed below by the following persons on behalf of the registrant and in the capacities and on the dates indicated: Date: March 15, 2024 By /s/ C. Douglas McMillon C. Douglas McMillon President and Chief Executive Officer and Director (Principal Executive Officer) Date: March 15, 2024 By /s/ Gregory B. Penner Gregory B. Penner Chairman of the Board and Director Date: March 15, 2024 By /s/ John David Rainey John David Rainey Executive Vice President and Chief Financial Officer (Prin

Unnamed: 0,fiscal_year,legal_proceedings,signature_date,signers
0,2024-01-31,I. SUPPLEMENTAL INFORMATION: The Company is in...,2024-03-15,"[C. Douglas McMillon, Gregory B. Penner, John ..."
1,2024-12-31,26,2025-02-04,[]
2,2023-12-31,The information called for by this item is inc...,2024-02-16,"[J. Duato, J. Duato Chairman, J. J. Wolk, R. J..."
3,2024-12-31,51,2025-01-29,"[Susan Li, Mark Zuckerberg Chairman, Susan Li ..."
4,2023-12-31,18,2024-02-01,"[Andrew R. Jassy, Brian T. Olsavsky, Shelley L..."
5,2023-12-31,28,2024-02-20,"[JAMES QUINCEY James, JAMES QUINCEY, JOHN MURP..."
6,2024-06-30,10,2024-08-05,"[JON R. MOELLER, ANDRE SCHULTEN, MATTHEW W. JA..."
7,2024-06-30,36,2024-07-30,"[ALICE L. JOLLA, SATYA NADELLA Chairman, REID ..."
8,2024-09-28,18,2024-11-01,"[Luca Maestri, Timothy D. Cook, Luca Maestri S..."
9,2024-12-31,29,2025-01-29,"[Elon Musk, Elon Musk Chief, Vaibhav Taneja Ch..."
