# In [None]:
"""
Step 1: Import necessary libraries
"""
import re
from pathlib import Path
import pandas as pd
from datetime import datetime

"""
Step 2: Prepare a function to clean the raw content, removing html tags and entities
"""
def clean_html(raw_html, remove_tags=True):
    """Convert raw 10-K HTML into a single line of plain text.

    Args:
        raw_html: The HTML document as a string.
        remove_tags: If True (default), every HTML tag and every named
            entity is replaced by a space. If False, ``<br>`` and a few
            block-level closing tags become whitespace, all opening tags are
            dropped, and the remaining closing tags (e.g. ``</td>``) are left
            in the text.

    Returns:
        The cleaned text: URLs removed, whitespace collapsed to single
        spaces, a few known extraction glitches repaired, and leading and
        trailing whitespace stripped.
    """
    if remove_tags:
        # [^>]* (unlike .*?) also matches newlines, so tags whose attributes
        # are wrapped over several lines are removed as well.
        cleaned_html = re.sub(r'<[^>]*>', ' ', raw_html)
        # remove named HTML entities such as &amp; or &nbsp;
        cleaned_html = re.sub(r'&\w+;', ' ', cleaned_html)

    else:
        # turn <br> and selected block-level closing tags into whitespace ...
        cleaned_html = re.sub(r'(<br\s*/?>|</div>|</p>|</span>|</li>|</table>)', '\n', raw_html, flags=re.IGNORECASE)
        # ... then drop every opening tag; other closing tags stay in the text
        cleaned_html = re.sub(r'<(?!/)[^>]+>', '', cleaned_html, flags=re.IGNORECASE)

    # Replace numeric entities (e.g. &#160;) and non-breaking-space markers
    # with a space. "&nbsp;" is listed before the bare "nbsp" so that, when
    # tags are kept, the whole entity is replaced instead of leaving "& ;".
    cleaned_html = re.sub(r'&#\d+;|&nbsp;|nbsp', ' ', cleaned_html)
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html)

    # remove URLs, first the parenthesised form, then bare ones
    cleaned_url = re.sub(r"\(http[s]?://\S+\)", "", cleaned_html)
    cleaned_url = re.sub(r"http[s]?://\S+", "", cleaned_url)

    # collapse the gaps left behind by the removed URLs
    cleaned_space = re.sub(r'\s+', ' ', cleaned_url)

    # fix: "/s/" is split into "/ s /" in GOOG_10-K_2021.html
    clean_spe = re.sub(r'/\s*S\s*/', '/s/', cleaned_space, flags=re.IGNORECASE)

    # fix: "ALICE" is split into "A LICE" in MSFT_10-K_2021.html
    # (re-joins a lone capital letter with the all-caps word that follows it)
    clean_spe = re.sub(r'(\b[A-Z])\s([A-Z]{2,}\b)', r'\1\2', clean_spe)

    # fix: the signature heading is split ("SIGNAT URES") in MSFT_10-K_2021.html
    cleaned = re.sub(r'SIGNAT\s*URES', 'SIGNATURES', clean_spe, flags=re.IGNORECASE)

    return cleaned.strip()


"""
Step 3: Extract all necessary information from the cleaned text
"""
def convert_date(date_str):
    """Convert a date such as 'December 31, 2023' to ISO format (YYYY-MM-DD).

    Whitespace around the comma is normalised first, so variants such as
    'December 31 , 2023' or 'December 31,2023' are accepted, with or without
    surrounding whitespace.

    Args:
        date_str: Date text of the form '<full month name> <day>, <year>'.

    Returns:
        The date as an ISO-8601 string.

    Raises:
        ValueError: If the text is not a valid date in that form.
    """
    # strptime needs exactly ", " between day and year; rewrite whatever
    # spacing surrounds the comma into that canonical form.
    normalized = re.sub(r"\s*,\s*", ", ", date_str.strip())
    return datetime.strptime(normalized, "%B %d, %Y").date().isoformat()


def find_signature_pattern(text):
    """Return the SIGNATURES section of a filing, or '' if none is found.

    The section starts at a 'SIGNATURES' heading that is followed by
    'Pursuant to the requirements of Section' and runs up to, but not
    including, 'EXHIBIT INDEX' (or to the end of the text). Matching is
    case-insensitive and spans line breaks.
    """
    section_regex = re.compile(
        r"SIGNATURES\s*Pursuant to the requirements of Section.*?(?=EXHIBIT INDEX|$)",
        re.DOTALL | re.IGNORECASE,
    )
    found = section_regex.search(text)
    return found.group(0) if found else ''


def extract_fiscal_year(text):
    """Return the fiscal year-end date of the filing in ISO format.

    Looks for the first phrase of the form
    'For the fiscal year ended January 31, 2024' (case-insensitive) and
    converts the date that follows it with ``convert_date``.

    Returns:
        The ISO-formatted date, or 'N/A' when the phrase is not present.
    """
    fiscal_year_pattern = r"for the fiscal year ended (\w+\s\d{1,2}\s*,\s\d{4})"
    hit = re.search(fiscal_year_pattern, text, re.IGNORECASE)
    if hit is None:
        return 'N/A'

    return convert_date(hit.group(1))


def extract_legal_proceedings(text):
    """Return the content of 'Item 3. Legal Proceedings'.

    Every occurrence of the heading is matched (case-insensitively) together
    with the text that follows it up to the next 'Item 4' or the end of the
    document. The LAST occurrence is returned, so an earlier one, such as a
    table-of-contents entry, does not shadow the actual section.

    Returns:
        The section text with surrounding whitespace removed, or 'N/A' when
        the heading does not occur.
    """
    # The optional period sits AFTER the word boundary so that a heading
    # written as "Legal Proceedings." has its period consumed here instead of
    # leaking into the start of the extracted content.
    matches = re.findall(
        r"\bItem\s3\.\s*Legal Proceedings\b\.?(.*?)(?=\bItem\s4\b|$)",
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if not matches:
        return "N/A"

    return matches[-1].strip()


def extract_signature_date(text):
    """Return the date of the signature(s) in ISO format.

    The SIGNATURES section is located with ``find_signature_pattern`` and
    scanned for text shaped like '<word> <day>, <year>'. The first candidate
    that ``convert_date`` accepts is returned.

    Returns:
        The ISO-formatted date, or 'N/A' when there is no signature section
        or no candidate in it is a valid date.
    """
    signature_section = find_signature_pattern(text)
    if not signature_section:
        return 'N/A'

    for candidate in re.finditer(r"\w+\s\d{1,2}\s*,\s*\d{4}", signature_section):
        try:
            return convert_date(candidate.group(0))
        except ValueError:
            # The pattern accepts any word before the day number, so a hit
            # need not be a real date; skip it instead of aborting the run.
            continue

    return 'N/A'


def extract_signers(text):
    """Return the names of the people who signed the report.

    The SIGNATURES section is located with ``find_signature_pattern``; every
    '/s/' marker followed by two or three capitalised words (letters, '.'
    and '-' only) yields one name. Names are de-duplicated
    case-insensitively, keeping the first spelling and the order of
    appearance.

    Returns:
        A list of signer names; empty when there is no signature section or
        no '/s/' marker is followed by a name.
    """
    signature_section = find_signature_pattern(text)
    if not signature_section:
        return []

    raw_signers = re.findall(
        r"/s/\s*([A-Z][a-zA-Z.\-]+\s[A-Z][a-zA-Z.\-]+(?:\s[A-Z][a-zA-Z.\-]+)?)",
        signature_section,
    )

    # The capture group only admits letters, '.', '-' and whitespace, so a
    # captured name can never contain a leftover HTML tag and needs no tag
    # stripping.
    cleaned_signers = []
    seen = set()  # lower-cased names already emitted
    for signer in raw_signers:
        signer = signer.strip()
        key = signer.lower()
        if key not in seen:
            seen.add(key)
            cleaned_signers.append(signer)

    return cleaned_signers


def save_text_to_file(text, file_name):
    """Write ``text`` to ``file_name`` as UTF-8, replacing any existing file."""
    with open(file_name, 'w', encoding='utf-8') as handle:
        handle.write(text)


def process_file(file, file_name=None):
    """Clean one filing and extract the required fields from it.

    Args:
        file: Raw HTML content of the filing (a string, not a path).
        file_name: Name used for the cleaned-text dump, which is written to
            '<file_name>_cleaned.txt'. When omitted, the module-level
            ``filename`` is used if it exists; if there is none, the dump is
            skipped.

    Returns:
        A dict with the keys 'fiscal_year', 'legal_proceedings',
        'signature_date' and 'signers'.
    """
    clean_content_one = clean_html(file)
    clean_content_two = clean_html(file, remove_tags=False)

    if file_name is None:
        # Backward compatibility: older callers publish the current file
        # name as a module-level variable instead of passing it in.
        file_name = globals().get("filename")

    # Save cleaned text to a .txt file
    if file_name is not None:
        save_text_to_file(clean_content_one, f"{file_name}_cleaned.txt")

    fiscal_year = extract_fiscal_year(clean_content_one)
    legal_proceedings = extract_legal_proceedings(clean_content_one)
    signature_date = extract_signature_date(clean_content_one)

    # Try the text that still contains closing tags first, then fall back to
    # the fully stripped text if that yields no signer.
    signers = extract_signers(clean_content_two)
    if not signers:
        signers = extract_signers(clean_content_one)

    return {
        "fiscal_year": fiscal_year,
        "legal_proceedings": legal_proceedings,
        "signature_date": signature_date,
        "signers": signers
    }

"""
Step 4: Iterate over all files in the directory and process them
"""
current_directory = Path.cwd()
results = []
# glob() yields files in no guaranteed order; sort so the CSV rows come out
# in the same order on every run.
for html_file in sorted(current_directory.glob('*.html')):
    # `filename` has to be a module-level name set before the call below:
    # process_file() reads it to name the cleaned-text dump.
    filename = html_file.name
    with html_file.open('r', encoding='utf-8') as f:
        raw_content = f.read()

    # The file is already closed here; processing only needs the text.
    result = process_file(raw_content)
    result['file_name'] = filename
    results.append(result)

df = pd.DataFrame(results)
df.to_csv('10k_results.csv', index=False)
df.head(10)