# In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import unicodedata

# Filing year handled by this run
year = 2023

# Directory holding the per-year link CSVs, and directory receiving the output
input_dir = "Data/sp500_10k_links"
output_dir = "Data/sp500_10k_items"
os.makedirs(output_dir, exist_ok=True)

# HTTP headers sent with every document request
HEADERS = {
    "User-Agent": "Safa Berber berbersafa123@gmail.com",
}

# One entry per section to extract:
# (header that opens the section, header that closes it, output column name)
item_sections = [
    (
        "ITEM 1. BUSINESS",
        "ITEM 1A. RISK FACTORS",
        "Item_1",
    ),
    (
        "ITEM 1A. RISK FACTORS",
        "ITEM 1B. UNRESOLVED STAFF COMMENTS",
        "Item_1A",
    ),
    (
        "ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION "
        "AND RESULTS OF OPERATIONS",
        "ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK",
        "Item_7",
    ),
    (
        "ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA",
        "ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNTANTS "
        "ON ACCOUNTING AND FINANCIAL DISCLOSURE",
        "Item_8",
    ),
]

def normalize_title(title):
    """Reduce a 10-K item header to a canonical, comparable key.

    The same function is applied both to the expected titles and to the
    headers found in a filing, so two headers are considered equal when
    their normalized forms are equal.

    Steps, in order: Unicode NFKD normalization, upper-casing, removal of
    parenthetical text, dashes turned into spaces, removal of remaining
    punctuation (apostrophes are kept), whitespace collapsing, removal of
    trailing "NONE" / "NOT APPLICABLE" / "TABLE OF CONTENTS", plural ->
    singular substitutions, and finally a few header-specific rewrites
    (Item 7, "Item 1 and 2", Item 1 with a trailing sub-heading).

    Args:
        title: Raw header text.

    Returns:
        The normalized, upper-case header string.
    """
    # Normalize Unicode, replace special characters, and collapse whitespace
    title = unicodedata.normalize("NFKD", title)
    title = title.replace("\xa0", " ").replace("’", "'")
    title = re.sub(r'\s+', ' ', title).strip().upper()

    # Remove parenthetical text (e.g. details about tabular amounts). This
    # must happen before punctuation is stripped, otherwise the parentheses
    # are already gone and their content stays in the title.
    title = re.sub(r'\(.*?\)', '', title)

    # Dashes (ASCII hyphen and Unicode variants) separate tokens, as in
    # "ITEM 1—BUSINESS", so they become spaces rather than being deleted.
    title = re.sub(r"[-‐‑‒–—−]", " ", title)

    # Remove unwanted punctuation (keeping word characters, spaces, and apostrophes)
    title = re.sub(r"[^\w\s']", '', title)

    # Collapse again: removing dashes/punctuation can leave runs of spaces
    # that would defeat the exact-match and startswith checks below.
    title = re.sub(r'\s+', ' ', title).strip()

    # Remove trailing extraneous words like "NONE", "NOT APPLICABLE", or "TABLE OF CONTENTS"
    title = re.sub(r'\s+NONE$', '', title)
    title = re.sub(r'\s+NOT APPLICABLE$', '', title)
    title = re.sub(r'\s+TABLE OF CONTENTS$', '', title)

    # Replace plurals with singular forms for consistency
    replacements = {
        "DISCLOSURES": "DISCLOSURE", "STATEMENTS": "STATEMENT", "MATTERS": "MATTER",
        "OPERATIONS": "OPERATION", "ESTIMATES": "ESTIMATE", "RESULTS": "RESULT",
        "PROCEEDINGS": "PROCEEDING", "CONDITIONS": "CONDITION", "RISKS": "RISK",
        "FACTORS": "FACTOR", "CONTROLS": "CONTROL", "PROPERTIES": "PROPERTY",
        "RELATIONSHIPS": "RELATIONSHIP", "TRANSACTIONS": "TRANSACTION",
        "SERVICES": "SERVICE", "DISCUSSIONS": "DISCUSSION", "EXHIBITS": "EXHIBIT",
        "SCHEDULES": "SCHEDULE", "COMMENTS": "COMMENT", "ITEMS": "ITEM",
    }
    for plural, singular in replacements.items():
        title = re.sub(rf'\b{plural}\b', singular, title)

    # Item 7: every wording variant of the MD&A header maps to one fixed key
    if title.startswith("ITEM 7") and "DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION" in title:
        title = "ITEM 7 MANAGEMENTS DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS"

    # For headers like "ITEM 1 AND 2. BUSINESS AND PROPERTIES"
    if title.startswith("ITEM 1 AND 2") and "BUSINESS" in title:
        title = "ITEM 1 BUSINESS"

    # For headers like "ITEM 1. BUSINESS GENERAL" or "ITEM 1. BUSINESS OVERVIEW"
    if title.startswith("ITEM 1 BUSINESS"):
        title = "ITEM 1 BUSINESS"

    return title

def extract_sections(html_text, item_pairs):
    """Extract the text of selected item sections from a filing's HTML.

    The HTML is flattened to plain text, item headers ("Item 1. Business",
    ...) are located with a regular expression and indexed by their
    normalized title, and each requested section is sliced from its start
    header up to its end header.

    Args:
        html_text: HTML markup of the filing document.
        item_pairs: Iterable of ``(start_title, end_title, col_name)``
            tuples; titles are normalized with ``normalize_title`` before
            being looked up.

    Returns:
        A dict mapping every ``col_name`` to the extracted section text, or
        to a string starting with ``"[NOT FOUND]"`` when the start header
        could not be located.
    """
    soup = BeautifulSoup(html_text, "lxml")
    # Unwrap inline formatting tags that contain a single string.
    # NOTE(review): unwrapping appears to leave adjacent text nodes separate,
    # so get_text(separator="\n") may still split a header across lines; the
    # line-joining regex further down compensates. Confirm before relying on
    # this loop alone (soup.smooth() may be what was intended).
    for tag in soup.find_all(['u', 'b', 'strong', 'span', 'i', 'em']):
        if tag.string and tag.parent:
            tag.unwrap()
    text = soup.get_text(separator="\n")
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("’", "'").replace("\xa0", " ")
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    # Drop lines that are solely digits (page numbers) or exactly "TABLE OF CONTENTS"
    filtered_lines = []
    for line in text.splitlines():
        stripped = line.strip()
        if re.fullmatch(r'\d+', stripped):
            continue
        if stripped.upper() == "TABLE OF CONTENTS":
            continue
        filtered_lines.append(line)

    # Skip the first 4% of the remaining lines before searching for headers
    # (presumably the cover page / table of contents — TODO confirm).
    cutoff = int(len(filtered_lines) * 0.04)
    main_text = "\n".join(filtered_lines[cutoff:])

    # Fix headers broken across lines (e.g., "ITEM 1.\nBusiness")
    main_text = re.sub(
        r'(ITEMS?\s+\d{1,2}[A-Z]?[.\:\-]?)\s*\n\s*([^\n]{2,200})',
        r'\1 \2',
        main_text,
        flags=re.IGNORECASE
    )

    # A header is a whole line starting with "Item <number>", optionally
    # "and 2", an optional letter, an optional separator and a short title.
    section_pattern = re.compile(
        r'^\s*(items?\s*\d{1,2}(?:\s*and\s*2)?[a-z]?\s*[.\:\-—]?\s*[^\n]{2,150})\s*$',
        re.IGNORECASE | re.MULTILINE
    )

    # normalized header -> character offset in main_text. A header that
    # occurs several times keeps the offset of its last occurrence.
    section_map = {}
    for match in section_pattern.finditer(main_text):
        cleaned = normalize_title(match.group(1).strip())
        section_start = match.start()

        # Look at the line right after the header: a section whose body is
        # just "None" / "Not applicable" is still registered as a header.
        peek_lines = main_text[section_start:section_start + 200].strip().splitlines()
        if len(peek_lines) >= 2:
            # Periods are removed so that "None." is recognized as well
            second_line = re.sub(r'[.]', '', peek_lines[1].strip().upper()).strip()
            if second_line in {"NONE", "NOT APPLICABLE"}:
                section_map[cleaned] = section_start

        if "TABLE OF CONTENTS" in cleaned:
            continue
        if 10 < len(cleaned) < 200:
            section_map[cleaned] = section_start

    # Headers ordered by their position in the document
    sorted_sections = sorted(section_map.items(), key=lambda x: x[1])

    extracted = {}
    for start_title, end_title, col_name in item_pairs:
        start_key = normalize_title(start_title)
        end_key = normalize_title(end_title)

        # For Item_1A: if the expected header is not found, fall back to any
        # header starting with "ITEM 1A RISK FACTOR"
        if col_name == "Item_1A" and start_key not in section_map:
            for key in section_map:
                if key.startswith("ITEM 1A RISK FACTOR"):
                    start_key = key
                    break

        start_idx = section_map.get(start_key)
        if start_idx is None:
            extracted[col_name] = f"[NOT FOUND] {start_title} → {end_title}"
            continue

        end_idx = section_map.get(end_key)

        # Fallback for the end header: the nearest header after the start
        # whose normalized title begins with end_key. Iterating in position
        # order guarantees "nearest".
        if end_idx is None:
            for key, pos in sorted_sections:
                if pos > start_idx and key.startswith(end_key):
                    end_idx = pos
                    break

        # Still no usable end (missing, or located before the start): stop
        # at the next header of any kind, or at the end of the document.
        if end_idx is None or end_idx <= start_idx:
            end_idx = next(
                (pos for _, pos in sorted_sections if pos > start_idx),
                len(main_text),
            )

        extracted[col_name] = main_text[start_idx:end_idx].strip()
    return extracted

# Download every filing listed in the per-year CSV, extract the configured
# item sections and write them next to the original columns.
# One HTTP session is shared by all requests so connections are reused
# instead of being re-established for every document.
with requests.Session() as session:
    session.headers.update(HEADERS)

    # A separate loop variable keeps the configured `year` from being rebound
    for filing_year in range(year, year + 1):
        input_csv = os.path.join(input_dir, f"filtered_10K_filings_{filing_year}.csv")
        output_csv = os.path.join(output_dir, f"items_filtered_10K_filings_{filing_year}.csv")

        if not os.path.exists(input_csv):
            print(f"⚠️ Skipping {filing_year}: File not found.")
            continue

        print(f"\n📂 Processing year: {filing_year}")
        df = pd.read_csv(input_csv)
        extracted_items = []

        # `position` is the 1-based row count for the progress display; `idx`
        # is the DataFrame index label, used only in the fallback company name.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            url = row['document_url']
            company = row.get('company', f"[Row {idx}]")
            print(f"[{position}/{len(df)}] Fetching: {url}")
            try:
                resp = session.get(url, timeout=30)
                if resp.status_code == 200:
                    result = extract_sections(resp.text, item_sections)
                else:
                    result = {sec[2]: f"HTTP ERROR {resp.status_code}" for sec in item_sections}
            except Exception as e:
                # Deliberately broad: a single failing filing (network or
                # parsing) is recorded in its row instead of aborting the run.
                result = {sec[2]: f"ERROR: {e}" for sec in item_sections}

            for col, val in result.items():
                if isinstance(val, str) and val.startswith("[NOT FOUND]"):
                    print(f"  ❌ {company} - {col} missing: {val}")

            extracted_items.append(result)

        # reset_index so the input rows line up positionally with the results
        items_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(extracted_items)], axis=1)
        items_df.to_csv(output_csv, index=False)
        print(f"✅ Done! Saved: {output_csv}")