In [4]:
import re
import json
import pandas as pd
from bs4 import BeautifulSoup

def is_date_or_number(val: str) -> bool:
    """Return True when *val* contains a date-like or numeric token.

    The pattern accepts a four-digit year followed by a separator
    (``.``, ``/``, ``-``, ``년`` or whitespace), the period words
    ``전기`` / ``당기``, the character ``월``, or a run of one to three
    digits optionally followed by ``,ddd`` groups. Because the last
    alternative matches a single digit, any string containing a digit
    yields True.
    """
    pattern = r'(\d{4}([./\-년\s])|전기|당기|월|\d{1,3}(,\d{3})*)'
    found = re.search(pattern, val)
    return found is not None

def clean_table(df):
    """Drop blank rows/columns from *df* and return every cell as a string.

    Empty strings are treated as missing values, so any row or column that
    holds nothing but empty/missing cells is removed. The original row and
    column labels of the surviving cells are kept.

    Args:
        df: Raw table contents, one cell text per element.

    Returns:
        A new DataFrame whose cells are all ``str``; cells that were empty
        or missing are returned as ``''``.
    """
    trimmed = (
        df.replace('', pd.NA)
        .dropna(how='all')
        .dropna(axis=1, how='all')
    )
    # Gaps that survive the drop must become '' again before the cast;
    # casting a missing value directly does not give an empty string (it
    # shows up as placeholder text such as '<NA>' or 'None' in the output).
    return trimmed.fillna('').astype(str)

def process_equity_table(df):
    """Promote the header row of *df* and label its first column '과목'.

    The table is transposed first when only its first column (and not its
    first row) contains date-like or numeric text according to
    ``is_date_or_number``. The first row of the (possibly transposed)
    table then becomes the column header and is removed from the data.

    Args:
        df: Cleaned table with string cells and at least one row and one
            column. It is not modified.

    Returns:
        A new DataFrame with a fresh 0-based row index, whose first column
        is named '과목' and whose remaining columns keep the header texts.
    """
    first_row = df.iloc[0].tolist()
    first_col = df.iloc[:, 0].tolist()
    row_has_numeric = any(is_date_or_number(x) for x in first_row)
    col_has_numeric = any(is_date_or_number(x) for x in first_col)

    if col_has_numeric and not row_has_numeric:
        df = df.transpose()

    header = df.iloc[0].tolist()
    # reset_index returns a new frame, so assigning columns below never
    # touches the caller's DataFrame.
    body = df.iloc[1:].reset_index(drop=True)
    # Rename the first column by position. Inserting a '과목' column and
    # then dropping the old one by label fails when the header already
    # reads '과목', and removes every column that shares the first
    # header's label when header texts repeat.
    body.columns = ['과목'] + header[1:]
    return body

def parse_equity_table_with_meta(html_path: str, output_excel: str, output_json: str,
                                 encoding: str = 'cp949') -> None:
    """Extract the '자 본 변 동 표' table from an HTML file and export it.

    Tables are scanned in document order. A table containing the title text
    '자 본 변 동 표' is kept as metadata; the first later table that still
    has content after ``clean_table`` is passed to ``process_equity_table``
    and used as the data table. When both are found they are written to an
    Excel workbook (sheets '표정보' and '자본변동표') and to a JSON file with
    the keys ``meta``, ``headers`` and ``data``. A status line is printed in
    either case.

    Args:
        html_path: Path of the HTML file to read.
        output_excel: Path of the Excel workbook to write.
        output_json: Path of the JSON file to write.
        encoding: Text encoding used to read ``html_path``.
    """
    with open(html_path, 'r', encoding=encoding) as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    title = '자 본 변 동 표'
    meta_df = None
    data_df = None

    for table in soup.find_all('table'):
        table_data = [
            [cell.get_text(strip=True).replace('\xa0', '') for cell in row.find_all(['td', 'th'])]
            for row in table.find_all('tr')
        ]

        df = pd.DataFrame(table_data)
        if df.empty:
            continue

        # Look for the title in the cell texts themselves instead of
        # rendering the whole frame to a string first.
        if any(title in text for cells in table_data for text in cells):
            meta_df = df
            continue

        if meta_df is None:
            continue

        candidate = clean_table(df)
        if candidate.empty:
            # Every cell was blank, so there is no header row to promote;
            # keep looking for the real data table.
            continue

        data_df = process_equity_table(candidate)
        break

    if meta_df is not None and data_df is not None:
        with pd.ExcelWriter(output_excel) as writer:
            meta_df.to_excel(writer, sheet_name="표정보", index=False, header=False)
            data_df.to_excel(writer, sheet_name="자본변동표", index=False)

        json_result = {
            "meta": meta_df.values.tolist(),
            "headers": data_df.columns.tolist(),
            "data": data_df.values.tolist()
        }

        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(json_result, f, ensure_ascii=False, indent=2)

        print(f"✅ 저장 완료: {output_excel}, {output_json}")
    else:
        print("❌ 자본변동표를 찾지 못했습니다.")

# ✅ Usage example
parse_equity_table_with_meta(
    html_path='samsung/감사보고서_4496517.htm',
    output_excel='자본변동표_삼성전자_2014.xlsx',
    output_json='자본변동표_삼성전자_2014.json',
)

✅ 저장 완료: 자본변동표_삼성전자_2014.xlsx, 자본변동표_삼성전자_2014.json
