In [None]:
# list_field_items_to_excel_dual.py
# pip install requests openpyxl

import os
import time
import requests
from urllib.parse import quote
from openpyxl import Workbook
from concurrent.futures import ThreadPoolExecutor, as_completed

# -----------------------------
# Settings
# -----------------------------
# API key sent in the X-API-Key header; fill it in before running.
API_KEY = ""
# Base URL of the OpenAPI v1 endpoints.
BASE = "https://devin.aks.ac.kr:8080/v1"
# Fields to collect (literature, history).
FIELDS = ["문학", "역사"]
# Pause in seconds used by the fetch code as a light rate limit.
SLEEP = 0.2
# Number of page requests running concurrently.
MAX_WORKERS = 5
# Era filter: an article is kept when its era text contains one of these
# tags (modern, contemporary).
ERA_ALLOW = ("근대", "현대")

# Output directory (created if missing) and the two workbook paths inside it.
OUT_DIR = "data"
os.makedirs(OUT_DIR, exist_ok=True)
OUT_XLSX_MAIN = os.path.join(OUT_DIR, "field_modern_contemporary.xlsx")
OUT_XLSX_DICT = os.path.join(OUT_DIR, "field_modern_contemporary_dict.xlsx")

# HTTP headers attached to every API request.
HEADERS = {"X-API-Key": API_KEY}

# -----------------------------
# Workbook for the base table
# -----------------------------
# Column names of the base table; the same names are used as the keys read
# from each article when a row is built.
BASE_COLS = ["eid", "headword", "origin", "field", "contentsType", "era", "url"]

wb_main = Workbook(write_only=True)
ws_main = wb_main.create_sheet(title="modern_contemporary")
ws_main.append(BASE_COLS)  # header row

# -----------------------------
# Buffer for dictionary-style rows
# -----------------------------
# Each element is [Key, Value, notebook, priority]; the rows are written to
# their own workbook after collection finishes.
dict_rows = []

# -----------------------------
# 단일 페이지 요청
# -----------------------------
def fetch_page(field: str, page: int):
    """Fetch one result page for *field* and convert it into output rows.

    An article is kept only when its ``era`` text contains one of the
    ``ERA_ALLOW`` tags and its ``origin`` is non-empty after stripping.

    Args:
        field: Field name placed in the ``/Articles/Field/{field}`` path; it
            is percent-encoded before being inserted into the URL.
        page: Page number, sent as the ``pageNo`` query parameter.

    Returns:
        A ``(page, main_rows, dict_rows)`` tuple. ``main_rows`` has one list
        per kept article holding the values of ``BASE_COLS``; ``dict_rows``
        has one ``[origin, headword, notebook, len(origin)]`` list per kept
        article. Both lists are empty when the request fails, the status is
        not 200, or the body is not valid JSON.
    """
    enc = quote(field, safe="")
    url = f"{BASE}/Articles/Field/{enc}"

    # Throttle once per HTTP request. The pause used to run once per kept
    # article, which stalled a worker for N * SLEEP seconds per page without
    # actually pacing the requests.
    time.sleep(SLEEP)

    try:
        r = requests.get(url, headers=HEADERS, params={"pageNo": page}, timeout=30)
    except requests.RequestException as exc:
        # Keep the (page, [], []) contract so one bad page does not abort
        # the whole collection.
        print(f"[WARN] {field}: page {page} request failed: {exc}")
        return page, [], []

    if r.status_code != 200:
        print(f"[WARN] {field}: page {page} returned HTTP {r.status_code}")
        return page, [], []

    try:
        data = r.json() or {}
    except ValueError as exc:  # body was not valid JSON
        print(f"[WARN] {field}: page {page} returned invalid JSON: {exc}")
        return page, [], []

    articles = data.get("articles") or []

    results_main = []
    results_dict = []

    for a in articles:
        era = (a.get("era") or "").strip()
        if not any(tag in era for tag in ERA_ALLOW):
            continue

        origin = (a.get("origin") or "").strip()
        if not origin:  # skip articles without an origin
            continue

        # Base-table row: raw values in BASE_COLS order.
        results_main.append([a.get(c, "") for c in BASE_COLS])

        # Dictionary-style row: origin is the key, headword the value, and
        # the priority is the key's character count.
        headword = (a.get("headword") or "").strip()
        results_dict.append([origin, headword, "한국민족문화대백과사전", len(origin)])

    return page, results_main, results_dict

# -----------------------------
# 분야별 전체 수집
# -----------------------------
def fetch_field(field: str):
    """Collect every page of *field* and append the rows to the shared outputs.

    Page 1 is requested first to read ``totalPage``; then all pages are
    fetched through ``fetch_page`` on a thread pool of ``MAX_WORKERS``
    workers. Rows are appended to ``ws_main`` and ``dict_rows`` in this
    (calling) thread as pages complete, so row order follows completion
    order rather than page order.

    Args:
        field: Field name to collect.

    Returns:
        None. If the page count cannot be read (request error, non-200
        status, invalid JSON) a warning is printed and the field is skipped.
        A page whose worker raises a request or JSON error is reported and
        skipped, so rows gathered from other pages are kept.
    """
    enc = quote(field, safe="")
    url = f"{BASE}/Articles/Field/{enc}"

    try:
        r = requests.get(url, headers=HEADERS, params={"pageNo": 1}, timeout=30)
    except requests.RequestException as exc:
        print(f"[WARN] {field}: page-count request failed, skipping field: {exc}")
        return

    if r.status_code != 200:
        print(f"[WARN] {field}: page-count request returned HTTP {r.status_code}, skipping field")
        return

    try:
        data = r.json() or {}
    except ValueError as exc:  # body was not valid JSON
        print(f"[WARN] {field}: page-count response is not valid JSON, skipping field: {exc}")
        return

    total_page = int(data.get("totalPage", 0) or 0)
    print(f"[INFO] {field} total pages = {total_page}")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future to its page number so a failure can be reported
        # even though the worker never returned its own page value.
        futures = {
            executor.submit(fetch_page, field, p): p
            for p in range(1, total_page + 1)
        }
        for fut in as_completed(futures):
            try:
                page, rows_main, rows_dict = fut.result()
            except (requests.RequestException, ValueError) as exc:
                # result() re-raises the worker's exception; skip this page
                # instead of discarding everything collected so far.
                print(f"[WARN] {field}: page {futures[fut]} failed: {exc}")
                continue

            for row in rows_main:
                ws_main.append(row)
            dict_rows.extend(rows_dict)
            if page % 10 == 0 or page == total_page:
                print(f"[OK] {field}: page {page}/{total_page} done")

# -----------------------------
# Run
# -----------------------------
try:
    for fld in FIELDS:
        print(f"=== FIELD: {fld} ===")
        fetch_field(fld)
finally:
    # Save inside `finally` so rows collected before an error or interrupt
    # are still written; any exception propagates after the files are saved.

    # File 1) base table
    wb_main.save(OUT_XLSX_MAIN)

    # File 2) dictionary-style table (fixed column order). Uses the Workbook
    # class already imported at the top of the file instead of re-importing
    # it under an alias.
    wb_dict = Workbook(write_only=True)
    ws_dict = wb_dict.create_sheet(title="dict")
    ws_dict.append(["Key", "Value", "notebook", "priority"])
    for row in dict_rows:
        ws_dict.append(row)
    wb_dict.save(OUT_XLSX_DICT)
