In [15]:
import gdown
import os
import json
import pandas as pd
import mysql.connector
import pymysql 
import re

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredHTMLLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pydantic import BaseModel, Field, create_model
from typing import List, Dict
from openai import OpenAI

## Load Data

In [16]:
file_path = "크래프톤_2023.htm"

loader = UnstructuredHTMLLoader(file_path)
docs = loader.load()

print(docs)



In [17]:
# 출력 구조 정의
class FieldWithReason(BaseModel):
    reason: str = Field(description="value를 도출한 근거")
    value: str = Field(description="value 값")
    
# 출력 파서
parser = PydanticOutputParser(pydantic_object=FieldWithReason)

## 전체 보고서에서 정보 추출

In [18]:
with open("report/krafton_11.json", encoding="utf-8") as f:
    data = json.load(f)

print("🔍 전체 길이:", len(data))
print(data)

🔍 전체 길이: 201
{'114000': {'label': '자산총계|비유동자산(계)'}, '113200': {'label': '자산총계|비유동자산(계)|유형자산(계)'}, '113203': {'label': '(유형자산정부보조금계)'}, '113201': {'label': '(유형자산감가상각누계액계)'}, '113205': {'label': '(유형자산손상차손누계액계)'}, '113310': {'label': '자산총계|비유동자산(계)|유형자산(계)|리스자산'}, '113311': {'label': '(유형자산감가상각누계액계)|(리스자산감가상각누계액)'}, '112690': {'label': '자산총계|비유동자산(계)|유형자산(계)|리스자산|기타리스자산'}, '112699': {'label': '(유형자산감가상각누계액계)|(리스자산감가상각누계액)|(기타리스자산감가상각누계액)'}, '113110': {'label': '자산총계|비유동자산(계)|유형자산(계)|토지'}, '113146': {'label': '자산총계|비유동자산(계)|유형자산(계)|시설장치'}, '113148': {'label': '(유형자산정부보조금계)|(시설장치정부보조금)'}, '113149': {'label': '(유형자산감가상각누계액계)|(시설장치감가상각누계액)'}, '113180': {'label': '자산총계|비유동자산(계)|유형자산(계)|비품'}, '113181': {'label': '(유형자산감가상각누계액계)|(비품감가상각누계액)'}, '113199': {'label': '자산총계|비유동자산(계)|유형자산(계)|건설중인자산'}, '113196': {'label': '(유형자산손상차손누계액계)|(건설중인자산손상차손누계액)'}, '112150': {'label': '자산총계|비유동자산(계)|투자부동산'}, '112175': {'label': '(투자부동산감가상각누계액)'}, '112143': {'label': '자산총계|비유동자산(계)|투자부동산|토지'}, '112144': {'labe

In [19]:
# 전체 출력
for label in data:
    print(label)
    
labels = data.values()
label_list = [item['label'] for item in labels if isinstance(item, dict) and 'label' in item]

114000
113200
113203
113201
113205
113310
113311
112690
112699
113110
113146
113148
113149
113180
113181
113199
113196
112150
112175
112143
112144
112178
113400
113355
113356
113299
113007
113008
113009
113010
113220
113221
113240
113241
113290
113291
113297
113346
113347
113348
113300
113349
113383
112108
112109
112113
112111
112106
112114
112121
112147
112117
112118
112119
112631
112180
112241
112242
112245
112210
112215
112211
112212
112213
112261
113604
112610
112270
112636
112251
112252
112000
111156
111249
111150
111159
111152
111147
111180
111189
111183
111190
111199
111192
111197
111170
111179
111171
111174
111591
111594
111233
111230
111232
111220
111225
111484
111162
111421
111401
111410
111419
111415
111420
111422
111430
111200
111100
111135
111130
111134
111137
115000
118060
118100
118110
118120
118211
118300
118330
118630
118600
118470
118530
118531
118533
118540
118541
118621
118090
118214
118236
118238
118430
118570
118599
118900
117000
117112
116400
116409
116805
116800

In [20]:
label_list

['자산총계|비유동자산(계)',
 '자산총계|비유동자산(계)|유형자산(계)',
 '(유형자산정부보조금계)',
 '(유형자산감가상각누계액계)',
 '(유형자산손상차손누계액계)',
 '자산총계|비유동자산(계)|유형자산(계)|리스자산',
 '(유형자산감가상각누계액계)|(리스자산감가상각누계액)',
 '자산총계|비유동자산(계)|유형자산(계)|리스자산|기타리스자산',
 '(유형자산감가상각누계액계)|(리스자산감가상각누계액)|(기타리스자산감가상각누계액)',
 '자산총계|비유동자산(계)|유형자산(계)|토지',
 '자산총계|비유동자산(계)|유형자산(계)|시설장치',
 '(유형자산정부보조금계)|(시설장치정부보조금)',
 '(유형자산감가상각누계액계)|(시설장치감가상각누계액)',
 '자산총계|비유동자산(계)|유형자산(계)|비품',
 '(유형자산감가상각누계액계)|(비품감가상각누계액)',
 '자산총계|비유동자산(계)|유형자산(계)|건설중인자산',
 '(유형자산손상차손누계액계)|(건설중인자산손상차손누계액)',
 '자산총계|비유동자산(계)|투자부동산',
 '(투자부동산감가상각누계액)',
 '자산총계|비유동자산(계)|투자부동산|토지',
 '자산총계|비유동자산(계)|투자부동산|건물',
 '(투자부동산감가상각누계액)|(건물감가상각누계액)',
 '자산총계|비유동자산(계)|무형자산(계)',
 '(상각누계액)',
 '(손상차손누계액)',
 '(정부보조금)',
 '자산총계|비유동자산(계)|무형자산(계)|기타의무형자산(계)',
 '(상각누계액)|(기타의무형자산상각누계액)',
 '(손상차손누계액)|(기타의무형자산손상차손누계액)',
 '(정부보조금)|(기타의무형자산정부보조금)',
 '자산총계|비유동자산(계)|무형자산(계)|기타의무형자산(계)|산업재산권',
 '(상각누계액)|(기타의무형자산상각누계액)|(산업재산권상각누계액)',
 '자산총계|비유동자산(계)|무형자산(계)|기타의무형자산(계)|저작권',
 '(상각누계액)|(기타의무형자산상각누계액)|(저작권상각누계액)',
 '자산총계|비유동자산(계)|무형자산(계)|

In [21]:
class FieldWithReason(BaseModel):
    reason: str
    value: str

def make_report_model(labels: list[str]) -> type[BaseModel]:
    fields = {lbl: FieldWithReason for lbl in labels}
    return create_model('ReportLabels', **fields)

ReportLabels = make_report_model(label_list)


### 프롬프팅 개선 0610

In [None]:
def chunk_labels(labels, chunk_size=100):
    for i in range(0, len(labels), chunk_size):
        yield labels[i:i + chunk_size]

all_results = []

for i, chunk in enumerate(chunk_labels(label_list)):
    print(f"{i+1}\uBC88\uC9F8 \uCCB4\uD06C \uCC98\uB9AC \uC911...")
    ReportLabelsPartial = make_report_model(chunk)

    client = OpenAI(api_key='')
    completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"""
You are a financial document analysis assistant. Your task is to extract accurate, structured data from Korean audit reports (\uAC80\uC0AC\uBCF4\uACE0\uC11C).

Use these to guide your extraction for the current year's report, while still relying on current data.
Be especially careful with:
- Unit mismatches
- Label variants
- Ending balance vs beginning balance

# CRITICAL INSTRUCTIONS
- First, check the following structured sections:
  1. \uC7AC\uBB34\uC0C1\uD654\uD45C (Balance Sheet)
  2. \uC790\uBCF8\uBCC0\uB3D9\uD45C (Statement of Changes in Equity)
- These contain **most target values explicitly**. Do not skip tables due to formatting (e.g., merged cells, multi-columns).

- Many missed values such as \"Investment Property\", \"Intangible Assets\", \"Long-term Investments\", \"Trade and Other Receivables\" are found directly in these tables.
- Subcomponents of equity (\"Ending Paid-in Capital\", \"Other Equity Items\", \"Capital Adjustment\", \"Retained Earnings\") are usually in the Statement of Changes in Equity.

# MATCHING RULES
- Do **not round** any values. Use the exact value shown (e.g., 38,942,843,687 → \uC798\uBABB, correct: 38,942,843,687).
- Respect the units (\uCC9C\uC6D0, \uBC31\uB9CC\uC6D0, \uC2ED\uC5B5\uC6D0). Misreading unit = wrong answer.
- Match semantically equivalent terms (e.g., 납입자본금 = Paid-in Capital, 자본조정 = Capital Adjustment).

- Labels may refer to columns like:
  | 자본항목        | 기초금액     | 기말금액     |
  | 납입자본금      | 1,000,000,000 | 1,200,000,000 |
  → "기말납입자본" → 1,200,000,000

- For compound labels (e.g., "기초납입자본|기말납입자본"), extract **both values**.

# CALCULATION RULES
- If a parent label has no value, but all children exist → sum them.
- If parent exists, and all but one child exist → derive missing child = parent - \uC544\uB798 \uAC12.
- Always state the **formula** in the reason. If not possible → return \"\uC54C \uC218 \uC5C6\uC74C\".

# LOOKUP PRIORITY
1. Balance Sheet
2. Statement of Changes in Equity
3. Notes (tables or inline)
4. Anywhere else with valid match (e.g., title-aligned columns)

# OTHER RULES
- If label contains directional words like "기초", "기말", match based on column name.
- Handle multi-header / merged / right-aligned columns carefully.
- Never hallucinate. No assumptions. Match only if value **clearly and explicitly** corresponds.
- Return everything in **Korean**.

# GOAL
Fix previous misses due to:
- Ignoring merged cells / skipping multi-columns
- Overlooking ending balances (기말) in equity tables
- Rounding values
- Not matching semantically equivalent labels

🧮 Unit Handling (VERY IMPORTANT):

- Every value must be returned in **Korean Won (KRW)**, and must be formatted with commas and include all digits.
- If the number in the report is given in thousands (천원), multiply it by 1,000 before returning it as value.
- If it is given in millions (백만원), multiply it by 1,000,000.
- Example:
  - If the source shows "167,899천원", the returned value must be "167,899,000원".
  - If the source shows "1,234백만원", the value must be "1,234,000,000원".

- Your `value` must always reflect the unit-converted number in **KRW**, and formatted like `1,234,567,890`.
- Do not return raw values like `167,899` without converting the unit.
- Pay very close attention to the **unit of measurement** for each table or section in the report. 
- Tables in the "Statement of Financial Position" (재무상태표) and "Statement of Changes in Equity" (자본변동표) usually include a unit indication at the top, such as:
  - 단위: 원 (KRW)
  - 단위: 천원 (thousand KRW)
  - 단위: 백만원 (million KRW)

- However, **footnotes and detailed breakdown tables** may use a **different unit** from the main financial statements. 
  For example:
  - The balance sheet might use "천원", while footnote tables use "백만원".

- Therefore, for **each value you extract**, make sure to:
  1. Identify the unit used in the table or paragraph where the value appears.
  2. Convert the number to **"원 (KRW)"**, if needed, before returning it.
     - 천원 (thousand KRW): Multiply by 1,000.
     - 백만원 (million KRW): Multiply by 1,000,000.

- Do not assume the unit based on earlier tables — **check every section independently**.
- If the unit is not explicitly written, but implied by table structure or context (e.g., very small numbers in large account fields), use reasonable judgment, and clearly state the inferred unit in your "reason".

- ⚠️ Many previous errors were caused by mixing up units. Be precise.
- Your output value must always be in **KRW (원)**, formatted with commas and no rounding.

# 🔍 UNIT DETECTION AND VALUE CONVERSION (MANDATORY)

- You must **not** return values like "167,899" if the unit is "천원".
- You must multiply:
  - "천원" → × 1,000
  - "백만원" → × 1,000,000
- Your final output MUST be:
  - Fully converted to 원 (KRW)
  - Comma-formatted (예: 167,899,000)
  - Without any rounding

⚠️ The unit may appear:
- Above the table (e.g., "단위: 백만원")
- Inside table cells or headers (e.g., "자본금(단위: 천원)")
- In footnotes or leftmost column

🔍 You MUST detect the unit from:
1. The title or header
2. Any cell in the same table
3. Footnotes or side text

❗If the unit appears inside the table (not above), you still MUST apply it.
❗You are not allowed to use or return the raw value. You must multiply based on unit.
❗If you extract "1,234천원" → final value = "1,234,000원"

💡 Return both:
- `reason`: Must state the original number and unit (e.g., "1,234천원" from table)
- `value`: Fully converted KRW string (e.g., "1,234,000원")

Let’s go step by step.

# 감사보고서 테이블 내용
{docs[0].page_content}
"""},
            {"role": "user", "content": "2023년의 값들을 구해줘."}
        ],
        temperature=0.2,
        response_format=ReportLabelsPartial,
    )

    parsed = completion.choices[0].message.parsed
    all_results.append(parsed)

1번째 체크 처리 중...
2번째 체크 처리 중...
3번째 체크 처리 중...


In [111]:
merged = {}
for result in all_results:
    merged.update(result)
    
# DataFrame으로 변환
df = pd.DataFrame.from_dict(merged, orient="index")
df.reset_index(inplace = True)
df.rename(columns={"index": "계정명"}, inplace=True)
df.columns = ["계정명", "reason", "value"]

df.to_csv("2023/parsed_11.csv", index=False, encoding = "utf-8-sig")

report 번호 + 계정코드 추가

In [112]:
df = pd.read_csv("2023/parsed_11.csv", encoding="utf-8-sig")

# 열 이름 확인 및 필요시 재정의
expected_cols = ["계정명", "reason", "value"]
if not all(col in df.columns for col in expected_cols):
    df.columns = expected_cols

# 계정명 → 계정코드 매핑
name_to_code = {}
for code, info in data.items():
    name = info.get("label")
    if isinstance(name, str):
        name_to_code[name] = code

# 계정코드 열 생성
def find_account_code(name):
    code = name_to_code.get(name)
    if code:
        return code
    return ""

df["계정코드"] = df["계정명"].map(find_account_code)

# 열 순서 재정렬
df = df[["계정코드", "계정명", "reason", "value"]]

# 저장
df.to_csv("2023/parsed_11_code.csv", index=False, encoding="utf-8-sig")

### 채점

정답

In [113]:
# 파일 경로
csv_path = "2023/parsed_11_code.csv"
excel_path = "answer/krafton_11_2023.xlsx"

# 데이터 불러오기
test_df = pd.read_csv(csv_path)
answer_df = pd.read_excel(excel_path)

def extract_number(val):
    if isinstance(val,str) and "'" in val:
        try:
            val = eval(val)
            val = val[1] if isinstance(val, tuple) and len(val) > 1 else val
        except:
            pass
    return str(val)

test_df["value"] = test_df["value"].apply(extract_number)

# 숫자 전처리 함수
def clean_number(series):
    return pd.to_numeric(
        series.astype(str)
              .str.replace("원", "")
              .str.replace(",", "")
              .str.replace("(", "-")
              .str.replace(")", ""),
        errors="coerce"
    )

# 전처리된 숫자 열 생성
test_df["clean_value"] = clean_number(test_df["value"])
answer_df["clean_answer"] = clean_number(answer_df["20231231 K 원"])

# 문자열 정제
test_df["계정명"] = test_df["계정명"].astype(str).str.strip()
test_df["계정코드"] = test_df["계정코드"].astype(str).str.strip()
answer_df["계정명"] = answer_df["계정명"].astype(str).str.strip()
answer_df["계정코드"] = answer_df["계정코드"].astype(str).str.strip()

# 매칭 수행
matched_rows = []

for _, row in test_df.iterrows():
    test_val = row["clean_value"]
    test_label = str(row["계정명"])
    test_code = str(row["계정코드"])
    
    # 숫자와 계정명 모두 일치하는 행만 필터링
    matched = answer_df[
        (answer_df["clean_answer"] == test_val) &
        (answer_df["계정명"] == test_label) &
        (answer_df["계정코드"] == test_code)
    ]
    if not matched.empty:
        for _, ans_row in matched.iterrows():
            matched_rows.append({
                "계정코드": test_code,
                "계정명": test_label,
                "예측값": int(test_val)
            })

# 결과 출력
matched_df = pd.DataFrame(matched_rows)

def count_not_null(series_or_df):
    return series_or_df.notnull().sum()

answer_df_i = count_not_null(answer_df["clean_answer"])

# 매칭률
print('*****************************')
print(f"📋 정답 계정명 개수:" , answer_df_i)
print(f"📍 매칭 계정명 개수: ", len(matched_df))
match_rate = len(matched_df) / answer_df_i
print(f"\n 🎯 매칭률: {match_rate:.2%}")
print('*****************************')

matched_df.to_csv("2023/matched_11.csv")

*****************************
📋 정답 계정명 개수: 140
📍 매칭 계정명 개수:  43

 🎯 매칭률: 30.71%
*****************************


### MySQL DB

DB 연결

In [114]:
# MySQL 연결
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='ljm03',
    database='NICE'
)
cursor = conn.cursor(dictionary=True)

# DB에서 계산식 기반 계정명 불러오기
def fetch_formula_info():
    conn = pymysql.connect(
        host = 'localhost',
        user = 'root',
        password = 'ljm03',
        db = 'NICE',
        charset = 'utf8'
    )
    sql = "SELECT account_name, account_code, formula_text, formula_code FROM krafton"
    df = pd.read_sql(sql, conn)
    conn.close()
    return df

# DB 확인
df_formula = fetch_formula_info()
print(df_formula.head())

  account_name account_code  \
0         자산총계       115000   
1     비유동자산(계)       114000   
2      유형자산(계)       113200   
3         리스자산       113310   
4         시설장치       113146   

                                        formula_text  \
0                          자산총계 = 비유동자산(계) + 유동자산(계)   
1  비유동자산(계) = 유형자산(계) + 투자부동산 + 무형자산(계) + 장기투자자산 ...   
2           유형자산(계) = 리스자산 + 토지 + 시설장치 + 비품 + 건설중인자산   
3                                      리스자산 = 기타리스자산   
4              시설장치 = 시설장치 - 시설장치감가상각누계액 - 시설장치정부보조금   

                                        formula_code  
0                           115000 = 114000 + 112000  
1  114000 = 113200 + 112150 + 113400 + 113383 + 1...  
2  113200 = 113310 + 113110 + 113146 + 113180 + 1...  
3                                    113310 = 112690  
4                  113146 = 113146 - 113149 - 113148  


  df = pd.read_sql(sql, conn)


unmatched = report 11 - matched

In [115]:
# 매칭된 계정코드 리스트
matched_codes = {
    str(code).strip()[2:] if str(code).strip().startswith("11") else str(code).strip()
    for code in matched_df["계정코드"]
}

# report 11에서 매칭되지 않은 것만 추출
unmatched_data = [
    {"계정코드": "11" + str(code).strip(), **data[code]}
    for code in data
    if str(code).strip() not in matched_codes
]

unmatched_df = pd.DataFrame(unmatched_data)
print(unmatched_df.head())

unmatched_df.to_csv("2023/unmatched_11.csv", index=False, encoding="utf-8-sig")

       계정코드                  label
0  11114000          자산총계|비유동자산(계)
1  11113200  자산총계|비유동자산(계)|유형자산(계)
2  11113203           (유형자산정부보조금계)
3  11113201         (유형자산감가상각누계액계)
4  11113205         (유형자산손상차손누계액계)


DB 반영으로 unmatched에서 계산 가능한 것 추출

In [116]:
def interpolate_values(unmatched_df, matched_df, df_formula):
    import re
    import itertools

    # 데이터 정제
    matched_df["계정코드"] = matched_df["계정코드"].astype(str).str.strip()
    unmatched_df["계정코드"] = unmatched_df["계정코드"].astype(str).str.strip()
    df_formula["account_code"] = df_formula["account_code"].astype(str).str.strip()
    df_formula["formula_code"] = df_formula["formula_code"].astype(str)

    # 계정코드 → 예측값 매핑
    matched_value_map = dict(zip(matched_df["계정코드"], matched_df["예측값"]))

    # 계정코드 → 계정명 매핑 
    name_map = dict(zip(unmatched_df["계정코드"], unmatched_df["label"]))

    # 계산식 딕셔너리 구성
    formula_dict = {}
    reverse_formula_dict = {}

    for _, row in df_formula.iterrows():
        lhs_match = re.match(r"^\s*(\d{6})\s*=", row["formula_code"])
        rhs_matches = re.findall(r"\d{6}", row["formula_code"])

        if lhs_match and len(rhs_matches) > 1:
            lhs = lhs_match.group(1)
            rhs = [code for code in rhs_matches if code != lhs]
            formula_dict[lhs] = rhs

            for missing_code in rhs:
                others = [c for c in rhs if c != missing_code]
                reverse_formula_dict.setdefault(missing_code, []).append((lhs, others))

    # 보간 수행
    interpolated_rows = []

    # 정방향 보간
    for _, row in unmatched_df.iterrows():
        target = row["계정코드"]
        if target in formula_dict:
            rhs_codes = formula_dict[target]
            available = [code for code in rhs_codes if code in matched_value_map]

            found = False
            for r in range(len(available), 0, -1):
                for combo in itertools.combinations(available, r):
                    values = [matched_value_map[c] for c in combo]
                    estimated = sum(values)
                    interpolated_rows.append({
                        "계정코드": target,
                        "계정명": name_map.get(target, ""), 
                        "보간값": estimated,
                        "보간방향": f"정방향({' + '.join(combo)})"
                    })
                    found = True
                    break
                if found:
                    break

    # 기존 역방향 보간 
    for _, row in unmatched_df.iterrows():
        target = row["계정코드"]
        if target in reverse_formula_dict:
            for lhs_code, siblings in reverse_formula_dict[target]:
                if lhs_code in matched_value_map and all(code in matched_value_map for code in siblings):
                    estimated = matched_value_map[lhs_code] - sum(matched_value_map[code] for code in siblings)
                    interpolated_rows.append({
                        "계정코드": target,
                        "계정명": name_map.get(target, ""),  
                        "보간값": estimated,
                        "보간방향": "역방향"
                    })
                    break

    return pd.DataFrame(interpolated_rows)

# 최초 입력 준비
all_matched_df = matched_df.copy()
remaining_unmatched_df = unmatched_df.copy()

# 최대 3회 반복 보간
for round_num in range(3):
    print(f"🔁 {round_num + 1}차 보간 중...")

    interpolated_df = interpolate_values(remaining_unmatched_df, all_matched_df, df_formula)

    if interpolated_df.empty:
        print("더 이상 보간할 항목이 없습니다.")
        break

    interpolated_df = interpolated_df.rename(columns={"보간값": "예측값"})
    interpolated_df = interpolated_df[["계정코드", "계정명", "예측값"]]
    interpolated_df = interpolated_df[~interpolated_df["계정코드"].isin(all_matched_df["계정코드"])]

    all_matched_df = pd.concat([all_matched_df, interpolated_df], ignore_index=True)

    remaining_unmatched_df = remaining_unmatched_df[~remaining_unmatched_df["계정코드"].isin(all_matched_df["계정코드"])]

# 결과 저장
all_matched_df.to_csv("2023/interpolated_11.csv", index=False, encoding="utf-8-sig")
print("최종 보간 결과 저장 완료.")

🔁 1차 보간 중...
더 이상 보간할 항목이 없습니다.
최종 보간 결과 저장 완료.


matched + interpolated

In [117]:
# 파일 로드
matched_df = pd.read_csv("2023/matched_11.csv", dtype={"계정코드": str})
interpolated_df = pd.read_csv("2023/interpolated_11.csv", dtype={"계정코드": str})

with open("report/krafton_11.json", encoding="utf-8") as f:
    parsed_data = json.load(f)

# nterpolated 결과를 matched에 맞춰 변환
interpolated_df = interpolated_df.rename(columns={"보간값": "예측값"})
interpolated_df = interpolated_df[["계정코드", "계정명", "예측값"]]

# 중복 제거: interpolated에서 matched와 겹치는 계정코드는 제외
interpolated_df = interpolated_df[~interpolated_df["계정코드"].isin(matched_df["계정코드"])]

# 병합
combined_df = pd.concat([matched_df, interpolated_df], ignore_index=True)

# 정렬을 위한 기준 계정코드 리스트 생성 (parsed_11의 key에 "11" 붙이기)
sorted_codes = ["11" + str(code).zfill(4) for code in parsed_data.keys()]

# 순서대로 정렬
combined_df["정렬기준"] = combined_df["계정코드"].apply(lambda x: sorted_codes.index(x) if x in sorted_codes else float("inf"))
combined_df = combined_df.sort_values("정렬기준").drop(columns=["정렬기준"])

combined_df = combined_df.loc[:, ~combined_df.columns.str.contains("^Unnamed")]

# 결과 저장 또는 출력
combined_df.to_csv("2023/results_11.csv", index=False, encoding="utf-8-sig")
print(combined_df.head())

      계정코드                                     계정명            예측값
0   114000                           자산총계|비유동자산(계)  2637208558662
23  118900                            부채와자본총계|자본총계  5592036287486
24  117000                   부채와자본총계|부채총계|비유동부채(계)   253912763868
25  116400  부채와자본총계|부채총계|비유동부채(계)|매입채무및기타채무|장기미지급금      184709000
26  116805     부채와자본총계|부채총계|비유동부채(계)|매입채무및기타채무|보증금     8816039000


다시 채점 ~

In [118]:
# 파일 경로
csv_path = "2023/results_11.csv"
excel_path = "answer/krafton_11_2023.xlsx"

# 데이터 불러오기
test_df = pd.read_csv(csv_path)
answer_df = pd.read_excel(excel_path)

def extract_number(val):
    if isinstance(val,str) and "'" in val:
        try:
            val = eval(val)
            val = val[1] if isinstance(val, tuple) and len(val) > 1 else val
        except:
            pass
    return str(val)

test_df["예측값"] = test_df["예측값"].apply(extract_number)

# 숫자 전처리 함수
def clean_number(series):
    return pd.to_numeric(
        series.astype(str)
              .str.replace("원", "")
              .str.replace(",", "")
              .str.replace("(", "-")
              .str.replace(")", ""),
        errors="coerce"
    )

# 전처리된 숫자 열 생성
test_df["clean_value"] = clean_number(test_df["예측값"]) / 1000
answer_df["clean_answer"] = clean_number(answer_df["20231231 K 원"]) / 1000

# 문자열 정제
test_df["계정명"] = test_df["계정명"].astype(str).str.strip()
test_df["계정코드"] = test_df["계정코드"].astype(str).str.strip()
answer_df["계정명"] = answer_df["계정명"].astype(str).str.strip()
answer_df["계정코드"] = answer_df["계정코드"].astype(str).str.strip()

# 매칭 수행
matched_rows = []

for _, row in test_df.iterrows():
    test_val = row["clean_value"]
    test_label = str(row["계정명"])
    test_code = str(row["계정코드"])
    
    # 숫자와 계정명 모두 일치하는 행만 필터링
    matched = answer_df[
        (answer_df["계정명"] == test_label) &
        (answer_df["계정코드"] == test_code) &
        (answer_df["clean_answer"].sub(test_val).abs() <= 10)
    ]
    if not matched.empty:
        for _, ans_row in matched.iterrows():
            matched_rows.append({
                "계정코드": test_code,
                "계정명": test_label,
                "예측값": round(test_val, 0),
                "정답값": round(ans_row["clean_answer"], 0)
            })
            
matched_df["match_key"] = (
    matched_df["계정코드"] + "|" +
    matched_df["계정명"] + "|" +
    matched_df["예측값"].astype(int).astype(str)
)

answer_df["match_key"] = (
    answer_df["계정코드"] + "|" +
    answer_df["계정명"] + "|" +
    answer_df["clean_answer"].round().astype(int).astype(str)
)

# 결과 출력
matched_df = pd.DataFrame(matched_rows)

def count_not_null(series_or_df):
    return series_or_df.notnull().sum()

answer_df_i = count_not_null(answer_df["clean_answer"])

# 매칭률
print('*****************************')
print(f"📋 정답 계정명 개수:" , answer_df_i)
print(f"📍 매칭 계정명 개수: ", len(matched_df))
match_rate = len(matched_df) / answer_df_i
print(f"\n 🎯 매칭률: {match_rate:.2%}")
print('*****************************')

*****************************
📋 정답 계정명 개수: 140
📍 매칭 계정명 개수:  43

 🎯 매칭률: 30.71%
*****************************


In [119]:
# 먼저 key 컬럼을 만들어 두 DataFrame 간 비교가 용이하게 구성
test_df["match_key"] = test_df["계정코드"] + "|" + test_df["계정명"] + "|" + test_df["clean_value"].round().astype(int).astype(str)
answer_df["match_key"] = answer_df["계정코드"] + "|" + answer_df["계정명"] + "|" + answer_df["clean_answer"].round().astype(int).astype(str)

# matched_df도 동일한 key 생성
matched_df["match_key"] = matched_df["계정코드"] + "|" + matched_df["계정명"] + "|" + matched_df["예측값"].astype(int).astype(str)

# 정답 중에서 매칭되지 않은 key를 필터링
unmatched_answer_df = answer_df[~answer_df["match_key"].isin(matched_df["match_key"])].copy()

# 결과 확인
print("❌ 매칭되지 않은 계정 항목 수:", len(unmatched_answer_df))
print(unmatched_answer_df[["계정코드", "계정명", "20231231 K 원"]])
unmatched_answer_df.to_csv("2023/missed_11.csv", index=False, encoding="utf-8-sig")

❌ 매칭되지 않은 계정 항목 수: 97
       계정코드                                      계정명     20231231 K 원
2    113203                             (유형자산정부보조금계)      258,000,000
3    113201                           (유형자산감가상각누계액계)  184,205,160,000
4    113205                           (유형자산손상차손누계액계)    2,344,920,000
5    113310               자산총계|비유동자산(계)|유형자산(계)|리스자산  114,539,309,000
6    113311             (유형자산감가상각누계액계)|(리스자산감가상각누계액)  131,590,009,000
..      ...                                      ...              ...
132  115123             부채와자본총계|부채총계|유동부채(계)|기타비금융부채   78,790,003,544
134  115153   부채와자본총계|부채총계|유동부채(계)|기타비금융부채|선수금|기타선수금    2,089,064,000
135  115210        부채와자본총계|부채총계|유동부채(계)|기타비금융부채|선수수익   76,700,939,544
136  115211              부채와자본총계|부채총계|유동부채(계)|단기충당부채        4,131,000
137  115219  부채와자본총계|부채총계|유동부채(계)|단기충당부채|기타단기부채성충당부채        4,131,000

[97 rows x 3 columns]


------------------------------------------------------------------------

원본계정명-정답계정명 수동 매핑

In [82]:
for result in all_results:
    if isinstance(result, BaseModel):
        result = result.model_dump()

    for label, v in result.items():
        val = v.get("value") if isinstance(v, dict) else getattr(v, "value", "")
        if isinstance(val, str) and val.replace(",", "") == "4246054":
            print(f"✅ GPT가 '{label}' 항목에서 195,585을 추출했습니다.")