In [None]:
from bs4 import BeautifulSoup
from lxml import html as LH
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Dict
import re, os, json
import math
import time
import requests

In [None]:
# Endpoint that fetch_first_list() POSTs the first-letter search form to.
BASE_URL = "https://terms.tta.or.kr/dictionary/searchFirstList.do"
# Absolute XPath of the <ul> that extract_first_seq_values() searches for first_seq inputs.
XPATH_UL = "/html/body/div[2]/div[2]/div/form/div/div[1]/ul"
# Endpoint that fetch_detail() POSTs a word_seq to; the response is read as JSON.
DETAIL_URL = "https://terms.tta.or.kr/dictionary/dictionarySearchFirstListAction.do"

# Category name -> numeric code sent as the "div_big_cd_in" form field.
CATEGORY_CODE = {
    "용어사전": 51,
    "시사상식": 107,
    "TTA표준": 50,
    "기타참고": 202,
}

print("카테고리 코드:", CATEGORY_CODE)

# Korean initial consonants (ㄱ–ㅎ). Only the last four are active here; the
# full list is preserved in the commented-out line below.
# KOREAN_INITIALS = ['ㄱ','ㄴ','ㄷ','ㄹ','ㅁ','ㅂ','ㅅ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
KOREAN_INITIALS = ['ㅋ','ㅌ','ㅍ','ㅎ']

# Uppercase Latin letters "A".."Z".
ALPHABETS = list(map(chr, range(ord("A"), ord("Z") + 1)))

# Decimal digits "0".."9" as strings.
DIGITS = list(map(str, range(10)))

# First letters the crawl iterates over; letters and digits are currently left out.
FIRST_LETTERS = KOREAN_INITIALS # + ALPHABETS + DIGITS

In [None]:
def _with_timeout(request_fn, timeout_tuple):
    """Return a wrapper around ``request_fn`` that supplies a default timeout.

    Args:
        request_fn: Callable invoked as ``request_fn(method, url, **kwargs)``.
        timeout_tuple: Value passed as ``timeout`` when the caller gives none.

    Returns:
        A function with the same ``(method, url, **kwargs)`` call shape that
        forwards to ``request_fn`` and returns its result.
    """
    def wrapped(method, url, **kwargs):
        # A caller-supplied ``timeout`` key (even ``None``) is left untouched.
        kwargs.setdefault("timeout", timeout_tuple)
        return request_fn(method, url, **kwargs)
    return wrapped

def build_session(timeout_connect=5, timeout_read=10, total_retries=3):
    """Create a ``requests`` session with retrying adapters and a default timeout.

    Args:
        timeout_connect: First element of the default ``timeout`` tuple.
        timeout_read: Second element of the default ``timeout`` tuple.
        total_retries: Passed to ``Retry`` as ``total``.

    Returns:
        A ``requests.Session`` whose ``http://`` and ``https://`` prefixes share
        one retrying adapter and whose ``request`` attribute is replaced by a
        wrapper that fills in ``timeout`` when the caller omits it.
    """
    retry_policy = Retry(
        total=total_retries,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        # POST is listed explicitly so the form submissions are retried as well.
        allowed_methods=frozenset(["HEAD", "GET", "OPTIONS", "POST"]),
        raise_on_status=False,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for prefix in ("http://", "https://"):
        http.mount(prefix, retrying_adapter)

    # Replace the bound ``request`` on this instance only, so every call made
    # through it picks up the default timeout.
    default_timeout = (timeout_connect, timeout_read)
    http.request = _with_timeout(http.request, default_timeout)
    return http

# Notebook-wide HTTP session built with build_session()'s default arguments;
# fetch_first_list() sends its POST requests through it.
session = build_session()

In [None]:
def build_payload(
    category: str,
    initial_consonant: str = "ㄱ",
    page: int = 1,
    list_count: int = 10,
    search_content: str = "conts01",
    orderby: str = "kor_subject",
    orderby_option: str = "TRUE",
) -> Dict[str, str]:
    """Assemble the form fields for one first-letter list request.

    Args:
        category: Key of ``CATEGORY_CODE``; its code is sent as ``div_big_cd_in``.
        initial_consonant: Sent as ``firstWordVal``.
        page: Sent (stringified) as ``listPage``.
        list_count: Sent (stringified) as ``listCount``.
        search_content: Sent as ``searchContent``.
        orderby: Sent as ``orderby``.
        orderby_option: Sent as ``orderbyOption``.

    Returns:
        The form fields as a dict, in the order they are listed below.

    Raises:
        ValueError: If ``category`` is not a key of ``CATEGORY_CODE``.
    """
    if category in CATEGORY_CODE:
        category_code = str(CATEGORY_CODE[category])
    else:
        raise ValueError(f"알 수 없는 카테고리: {category} (가능: {list(CATEGORY_CODE.keys())})")

    # Field order is kept explicit; the remaining fields are fixed values.
    fields = [
        ("searchContent", search_content),
        ("searchRange", "all"),
        ("listCount", str(list_count)),
        ("listPage", str(page)),
        ("orderby", orderby),
        ("reFlag", "N"),
        ("orderbyOption", orderby_option),
        ("conts01WhereSet", ""),
        ("firstWordVal", initial_consonant),
        ("firstWord", "Y"),
        ("word_seq", ""),
        ("div_big_cd_in", category_code),
        ("div_big_cd", ""),
        ("searchTerm", ""),
        ("searchCate", "bigram"),
    ]
    return dict(fields)

def fetch_first_list(
    category: str,
    initial_consonant: str = "ㄱ",
    page: int = 1,
    list_count: int = 10,
    search_content: str = "conts01",
    orderby: str = "kor_subject",
    orderby_option: str = "TRUE",
) -> str:
    """POST one first-letter list request and return the response body as text.

    All arguments are forwarded unchanged to ``build_payload``; the resulting
    form is sent to ``BASE_URL`` through the shared ``session``.

    Returns:
        ``resp.text`` of the response.

    Raises:
        ValueError: Propagated from ``build_payload`` for an unknown category.
        Whatever ``raise_for_status()`` raises for an error status.
    """
    form = build_payload(
        category=category,
        initial_consonant=initial_consonant,
        page=page,
        list_count=list_count,
        search_content=search_content,
        orderby=orderby,
        orderby_option=orderby_option,
    )
    response = session.post(
        BASE_URL,
        data=form,
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; TTA-Scraper/1.0)",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "https://terms.tta.or.kr",
            "Referer": "https://terms.tta.or.kr/dictionary/searchFirstList.do",
        },
    )
    response.raise_for_status()
    return response.text


def fetch_detail(word_seq: str) -> dict:
    """Fetch one term's detail record and return its subject and plain-text body.

    The request goes through the shared ``session`` (as ``fetch_first_list``
    does) rather than a bare ``requests.post``, so detail requests use the same
    retrying adapter and default timeout as the list requests.

    Args:
        word_seq: Identifier sent as the ``word_seq`` form field.

    Returns:
        A dict with ``word_seq`` (as passed in), ``kor_subject`` (stripped) and
        ``contents`` (HTML reduced to text with whitespace runs collapsed to a
        single space). Missing or ``null`` fields yield empty strings.

    Raises:
        Whatever ``raise_for_status()`` raises for an error status, and
        whatever ``resp.json()`` raises when the body is not valid JSON.
    """
    payload = {"word_seq": word_seq}
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; TTA-Scraper/1.0)",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://terms.tta.or.kr",
        "Referer": "https://terms.tta.or.kr/dictionary/searchFirstList.do",
    }

    resp = session.post(DETAIL_URL, data=payload, headers=headers)
    resp.raise_for_status()

    js = resp.json()
    # ``.get(key, default)`` only covers a missing key; ``or`` also covers an
    # explicit JSON null, which would otherwise break ``.get`` / ``.strip``.
    data = js.get("data") or {}
    kor_subject = (data.get("kor_subject") or "").strip()

    contents_html = (data.get("contents") or "").strip()
    soup = BeautifulSoup(contents_html, "lxml")
    # Join text nodes with a space, then collapse any whitespace run to one space.
    contents = re.sub(r"\s+", " ", soup.get_text(" ", strip=True))

    return {
        "word_seq": word_seq,
        "kor_subject": kor_subject,
        "contents": contents,
    }

def extract_total_count(html: str) -> int:
    """Return the number N from the first ``<div>`` whose text reads "총 N 건".

    The same pattern is used both to locate the ``<div>`` and to extract the
    number. Previously the locating pattern accepted digits only while the
    extracting pattern also accepted commas, so a total written with thousands
    separators (e.g. "총 1,234 건") was never located and 0 was returned.

    Args:
        html: HTML text of a list page.

    Returns:
        The count with commas removed, or 0 when no matching ``<div>`` exists.
    """
    # Must start with a digit so a lone "," can never reach int().
    count_pattern = re.compile(r"총\s*([0-9][0-9,]*)\s*건")

    soup = BeautifulSoup(html, "lxml")
    target = soup.find("div", string=count_pattern)
    if target is None:
        return 0

    match = count_pattern.search(target.get_text())
    if match is None:
        return 0
    return int(match.group(1).replace(",", ""))

def extract_first_seq_values(html_text: str) -> list:
    """Collect the non-blank ``first_seq`` input values found under ``XPATH_UL``.

    Args:
        html_text: HTML text of a list page.

    Returns:
        The ``value`` attributes of ``<input name="first_seq">`` elements below
        ``XPATH_UL``, stripped, in document order, with blank values dropped.
    """
    document = LH.fromstring(html_text)
    raw_values = document.xpath(f"{XPATH_UL}//input[@name='first_seq']/@value")

    cleaned = []
    for raw in raw_values:
        value = raw.strip()
        if value:
            cleaned.append(value)
    return cleaned

def fetch_all_pages(
    category: str,
    initial: str,
    list_count: int = 10,
    delay: float = 0.5,
    detail_delay: float = 0.3,
    out_dir: str = "tta_results",
) -> int:
    """Crawl every list page for one category/first-letter pair and save the details as JSON.

    The first list page is downloaded once and reused both for the total count
    and as page 1 of the crawl (it used to be requested twice). For every
    ``first_seq`` value on every page the detail record is fetched; each
    successful record is stored as ``{kor_subject: contents}``. The collected
    list is written to ``<out_dir>/<category>_<initial>.json``.

    Args:
        category: Category name passed to ``fetch_first_list``.
        initial: First letter passed to ``fetch_first_list``.
        list_count: Items requested per page; also used to compute the page count.
        delay: Seconds to sleep after each list page.
        detail_delay: Seconds to sleep after each successful detail request.
        out_dir: Directory the JSON file is written to (created if missing).

    Returns:
        Always 0; the caller in this notebook ignores the value.

    Raises:
        ValueError: If ``list_count`` is not positive.
        Any exception raised while fetching the first page. Failures on later
        pages or on individual detail requests are printed and skipped.
    """
    if list_count <= 0:
        # Fail before any request: 0 would divide by zero below, and a negative
        # value would produce an empty page range and an empty output file.
        raise ValueError(f"list_count는 1 이상이어야 합니다: {list_count}")

    first_page_html = fetch_first_list(category=category, initial_consonant=initial, page=1, list_count=list_count)
    total = extract_total_count(first_page_html)
    if total == 0:
        print(f"{category}/{initial}: 결과 없음")
        return 0

    total_pages = math.ceil(total / list_count)
    print(f"{category}/{initial}: 총 {total}건, {total_pages}페이지 탐색 중...")

    all_values = []
    results = []
    for page in range(1, total_pages + 1):
        try:
            if page == 1:
                # Already downloaded above; do not request it a second time.
                page_html = first_page_html
            else:
                page_html = fetch_first_list(
                    category=category, initial_consonant=initial, page=page, list_count=list_count
                )
            values = extract_first_seq_values(page_html)
            print(f"  - p{page}: {values}")

            for value in values:
                # Best effort: one failing term must not abort the rest of the page.
                try:
                    data = fetch_detail(value)
                    print(f"   {data['word_seq']} | {data['kor_subject']}")
                    print(f"   {data['contents']}")
                    results.append({data['kor_subject']: data['contents']})
                    time.sleep(detail_delay)
                except Exception as e:
                    print(f"{value} 요청 실패: {e}")

            all_values.extend(values)
            time.sleep(delay)
        except Exception as e:
            # Best effort: report the failed page and move on to the next one.
            print(f"p{page} 실패: {e}")
            continue

    print(f"✅ {category}/{initial}: 총 {len(all_values)}개 수집 완료")

    os.makedirs(out_dir, exist_ok=True)
    # Path separators in the name parts would otherwise create sub-directories.
    safe_cat = category.replace("/", "_").replace("\\", "_").strip()
    safe_init = initial.replace("/", "_").replace("\\", "_").strip()
    out_path = os.path.join(out_dir, f"{safe_cat}_{safe_init}.json")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"저장 완료 → {out_path}\n")
    return 0

In [None]:
# Categories to crawl, in run order; each must be a key of CATEGORY_CODE.
categories = [
    "용어사전",
    "시사상식",
    "TTA표준",
    "기타참고",
]

In [None]:
# Crawl every (category, first letter) pair. A failure in one pair is printed
# and the run continues with the next pair.
for cat in categories:
    print(f"\n=== {cat} ===")
    for initial in FIRST_LETTERS:
        try:
            fetch_all_pages(cat, initial, list_count=10)
        except Exception as err:
            print(f"[{cat}-{initial}] 실패:", err)