## 크롤링

In [6]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6


In [19]:
import requests
from bs4 import BeautifulSoup
import json

# Endpoints of the crawled site: the per-era list page and the detail page
base_url = "http://contents.history.go.kr/front/kc"
list_url = base_url + "/setItemsKCList.do"
detail_url = base_url + "/viewAjax.do"

# Era IDs sent to the list endpoint
# (e.g. 2=Three Kingdoms, 3=Unified Silla, 4=Goryeo, 5=Joseon, 6=modern era)
age_ids = list(range(2, 7))

# Accumulators for the crawled records
people_data = list()  # kc_n entries
events_data = list()  # kc_i entries

# Collect data era by era: read the era's list page, then fetch and parse the
# detail page of every person (kc_n) / event (kc_i) entry linked from it.

# Seconds to wait for the server; requests applies no timeout unless one is
# given, so a stalled connection would otherwise block the cell forever.
REQUEST_TIMEOUT = 30

# Detail pages that could not be fetched, kept so the gaps are visible afterwards.
failed_urls = []


def _split_level_id(onclick_attr):
    """Extract the record kind and ID from an ``onclick`` attribute value.

    Args:
        onclick_attr: Text of an anchor's ``onclick`` attribute.

    Returns:
        ``(prefix, level_id)`` where ``prefix`` is ``"kc_n"`` or ``"kc_i"`` and
        ``level_id`` is the text between the last occurrence of that prefix and
        the next single quote; ``None`` when neither prefix is present.
    """
    # "kc_n" is checked first, so it wins if both prefixes appear.
    for prefix in ("kc_n", "kc_i"):
        if prefix in onclick_attr:
            return prefix, onclick_attr.split(prefix)[-1].split("'")[0]
    return None


def _parse_detail(detail_soup, fallback_title):
    """Build one record dict from a parsed detail page.

    Args:
        detail_soup: ``BeautifulSoup`` document of the detail page.
        fallback_title: Title to use when the page has no ``div.md_tit``.

    Returns:
        Dict with keys ``title`` (str), ``sections`` (list of str, one per
        ``div.tx``), ``related_links`` (list of ``{"title", "href"}`` dicts
        taken from the first ``div.hm_area``) and ``toc`` (list of str, one per
        ``li`` inside the first ``div.smenu_area``).
    """
    title_tag = detail_soup.find('div', class_='md_tit')
    data = {
        "title": title_tag.get_text(strip=True) if title_tag else fallback_title,
        "sections": [
            section.get_text(strip=True)
            for section in detail_soup.find_all('div', class_='tx')
        ],
        "related_links": [],
        "toc": []
    }

    # Related links
    link_area = detail_soup.find('div', class_='hm_area')
    if link_area:
        for link in link_area.find_all('a', href=True):
            data["related_links"].append({
                "title": link.get('title', '').strip(),
                "href": link.get('href', '').strip()
            })

    # Table of contents
    toc = detail_soup.find('div', class_='smenu_area')
    if toc:
        data["toc"] = [entry.get_text(strip=True) for entry in toc.find_all('li')]

    return data


# A single session reuses the underlying connection across the many requests.
with requests.Session() as session:
    for age_id in age_ids:
        print(f"Fetching data for age_id={age_id}...")
        params = {"age_id": age_id}
        # A failing list page still aborts the run: a whole era would be missing.
        response = session.get(list_url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # Parse the list page and look at every anchor that has an onclick handler
        soup = BeautifulSoup(response.text, 'html.parser')
        for anchor in soup.find_all('a', onclick=True):
            onclick_attr = anchor['onclick']
            if "fnViewDetail" not in onclick_attr:
                continue

            parsed = _split_level_id(onclick_attr)
            if parsed is None:
                # fnViewDetail link that is neither a person nor an event: there
                # is no URL to build for it, so skip it.
                continue
            prefix, level_id = parsed
            is_person = prefix == "kc_n"

            title = anchor.text.strip()  # link text, used as the fallback title
            full_url = f"{detail_url}?levelId={prefix}{level_id}&whereStr=&searchYn="
            if is_person:
                print(f"Fetching person data: {full_url}")
            else:
                print(f"Fetching event data: {full_url}")

            # Request the detail page; one bad page must not discard the records
            # already collected, so log it, remember it and move on.
            try:
                detail_response = session.get(full_url, timeout=REQUEST_TIMEOUT)
                detail_response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Failed to fetch {full_url}: {exc}")
                failed_urls.append(full_url)
                continue

            detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
            data = _parse_detail(detail_soup, title)

            # Store the record with the collection matching its kind
            if is_person:
                people_data.append(data)
            else:
                events_data.append(data)

if failed_urls:
    print(f"Skipped {len(failed_urls)} detail pages that could not be fetched.")

# Report how many records of each kind were collected
print(
    f"Found {len(people_data)} people records.",
    f"Found {len(events_data)} events records.",
    sep="\n",
)

# Write each collection to its own JSON file (UTF-8, non-ASCII text kept as-is)
for out_name, records in (
    ("people_data.json", people_data),
    ("events_data.json", events_data),
):
    with open(out_name, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=4)


Fetching data for age_id=2...
Fetching person data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_n500700&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i502600&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i501700&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i503000&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i503100&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i502610&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i502800&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=kc_i502100&whereStr=&searchYn=
Fetching event data: http://contents.history.go.kr/front/kc/viewAjax.do?levelId=k