In [None]:

def extract_legal_proceedings2(text):
    """ 2. The content of “Item 3. LEGAL PROCEEDINGS”."""
    # TODO: Not sure which implementation is correct
    legal_proceedings = re.search(r"Item 3. LEGAL PROCEEDINGS(.*?)Item 4", text, re.IGNORECASE|re.DOTALL).group(1)

    # legal_proceedings = re.search(r"Item 3\. Legal Proceedings(.*?)(Item 4\.|$)", text, re.DOTALL).group(1).strip()
    return legal_proceedings

def extract_signature_date2(text):
    """ 3. The date of signature(s) (ensure it is formatted in ISO-format)."""
    # TODO: Not sure which implementation is correct
    # signature_date = re.search(r"SIGNATURES(.*?)</html>", text, re.IGNORECASE|re.DOTALL).group(1)

    signature_date = re.search(r"Date:\s*(\d{4}-\d{2}-\d{2})", text).group(1)

    return signature_date

def extract_signatures2(text):
    """ 4. Who signed the report? If there are multiple signatures, all of them have to be listed (comma separated).
    Do not include the audit firm – in case it is given in the report."""
    # TODO: Not sure which implementation is correct
    signers = re.findall(r"(?:By|Signed by):\s*([A-Za-z ]+)\n", text)
    # signers = ", ".join([s.strip() for s in signers if "audit" not in s.lower()])

    signatures = re.findall(r"SIGNATURES(.*?)</html>", text, re.IGNORECASE|re.DOTALL)
    return signatures

In [6]:
"""
Step 1: Import necessary libraries
I have moved the iterations to the very end
"""
import re
from pathlib import Path
import pandas as pd
from datetime import datetime

"""
Step 2: Prepare a function to clean the raw content, removing html tags and entities
"""
def clean_html(raw_html):
    clean_one = re.sub(r'<.*?>', ' ', raw_html)
    clean_two = re.sub(r'&\w+;', ' ', clean_one)
    clean_three = re.sub(r"&[a-z]+;", " ", clean_two)
    clean_three = re.sub(r'&#\d+;', ' ', clean_three)
    clean_four = re.sub(r'\s+', ' ', clean_three)
    return clean_four.strip()

"""
Step 3: Extract all necessary information from the cleaned text
"""
def convert_date(date_str):
    """ This is a helper function that converts date to ISO format """

    try:
        date = datetime.strptime(date_str.strip(), "%B %d, %Y").date().isoformat()
    except ValueError:
        date = re.sub(r"\s+,", ",", date_str) # e.g.: 'For the fiscal year ended December 31 , 2023'
        date = datetime.strptime(date, "%B %d, %Y").date().isoformat()
    return date


def extract_fiscal_year(text):
    """ 1. The date of the fiscal year-end (ensure it is formatted in ISO-format)

    Currently, relevant information is in the beginning of the document, e.g.:
     'For the fiscal year ended January 31, 2024, or'
     Search for the fiscal year and return it in ISO-format.

     """
    match = re.search(r"for the fiscal year ended (\w+\s\d{1,2}\s*,\s\d{4})", text, re.IGNORECASE)
    if match:
        fiscal_year_str = match.group(1)
        fiscal_year_iso = convert_date(fiscal_year_str)
        return fiscal_year_iso

    return 'N/A'


def extract_legal_proceedings(text):
    """ 2. The content of “Item 3. LEGAL PROCEEDINGS”."""

    match = re.search(r"Item 3\. Legal Proceedings(.*?)(Item 4\.|$)", text, re.IGNORECASE|re.DOTALL)
    return match.group(1).strip() if match else 'N/A'


def extract_signature_date(text):
    """ 3. The date of signature(s) (ensure it is formatted in ISO-format).

    Find the SIGNATURES section

    """
    signature_pattern = r"SIGNATURES.*?Pursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"
    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if signature_section_match:
        signature_section = signature_section_match.group(0)
        signature_date_match = re.search(r"(?:Date:|as of)\s*(\w+\s\d{1,2}\s*,\s*\d{4})", signature_section)
        if signature_date_match:
            signature_date = convert_date(signature_date_match.group(1))
            return signature_date
    return 'N/A'


def extract_signers(text):
    """ 4. Who signed the report?
    If there are multiple signatures, all of them have to be listed (comma separated).
    Do not include the audit firm – in case it is given in the report."""

    # find the signature section
    signature_pattern = r"SIGNATURES.*?Pursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)"
    signature_section_match = re.search(signature_pattern, text, re.DOTALL | re.IGNORECASE)

    if signature_section_match:
        signature_section = signature_section_match.group(0)
        # find all the signers, put them in a list, join them with a comma, and exclude the audit firm
        # remember to eliminate duplicates
        signers = re.findall(r"By\s*(.*?)(?:\n|$)", signature_section)
        signers = list(set(signers))
        return ", ".join(signers)
    return 'N/A'


def process_file(file):
    clean_content = clean_html(file)

    fiscal_year = extract_fiscal_year(clean_content)
    legal_proceedings = extract_legal_proceedings(clean_content)
    signature_date = extract_signature_date(clean_content)
    signers = extract_signers(clean_content)
    return {
        "fiscal_year": fiscal_year,
        "legal_proceedings": legal_proceedings,
        "signature_date": signature_date,
        "signers": signers
    }

current_directory = Path.cwd()
results = []
for html_file in current_directory.glob('*.html'):
    with html_file.open('r', encoding='utf-8') as file:
        print(f"Processing {html_file}")
        raw_content = file.read()
        result = process_file(raw_content)
        if result:
            results.append(result)

df = pd.DataFrame(results)
df.to_csv('10k_results.csv', index=False)
df.head(10)

Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/WMT_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/GOOG_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/JNJ_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/META_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/AMZN_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/KO_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/PG_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/MSFT_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/AAPL_10K.html
Processing /Users/yiningxiang/PycharmProjects/RSM317/A1/10-k html files/TSLA_10K.html


Unnamed: 0,fiscal_year,legal_proceedings,signature_date,signers
0,2024-01-31,I. SUPPLEMENTAL INFORMATION: The Company is in...,2024-01-31,"leading on price, we earn the trust of our cus..."
1,2024-12-31,26,2017-06-27,"policy, we limit the amount of credit exposure..."
2,2023-12-31,The information called for by this item is inc...,2023-08-23,"-Laws, the written charters of the Audit Commi..."
3,2024-12-31,51,2024-12-31,"making our Llama models openly available, we a..."
4,2023-12-31,18,2024-01-24,"laws of Amazon.com, Inc. (incorporated by refe..."
5,2023-12-31,28,2023-12-31,"-Laws of the Company, as amended and restated ..."
6,2024-06-30,10,2024-06-30,U.S. GAAP In accordance with the SEC's Regulat...
7,2024-06-30,36,2024-07-30,laws of Microsoft Corporation 8-K 3.2 7/3/2023...
8,2024-09-28,18,2024-11-01,: /s/ Luca Maestri Luca Maestri Senior Vice Pr...
9,2024-12-31,29,2024-12-31,taking a modular approach to the design of bat...
