In [3]:
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor

import pandas as pd
import requests
from newspaper import Article
from tqdm import tqdm

# Prefix of every article URL; the crawler appends "<date><5-digit id>" to it.
BASE_URL = "https://www.hankyung.com/article/"

# Desktop-Chrome-on-macOS User-Agent string, kept separate so the header
# table below stays readable.
_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/127.0.0.0 Safari/537.36"
)

# Request headers sent with every fetch.
HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8",
    "Referer": "https://www.hankyung.com/",
}

def crawl_url(url):
    """Fetch a single article and parse it with newspaper3k.

    The AMP variant (``<url>/amp``) is requested first; if that answers 404
    the canonical ``url`` is requested instead.

    Args:
        url: Canonical article URL. A trailing slash is stripped before
            ``/amp`` is appended.

    Returns:
        A dict with the keys ``url``, ``title``, ``text`` and
        ``publish_date``, or ``None`` when the final response is 403/404,
        when the request or the parse raises, or when the parsed article has
        no title or no body text.

    This function is best-effort and never raises. Unexpected failures are
    reported through the ``logging`` module at WARNING level (expected skips
    at DEBUG) so that a run which collects nothing can still be diagnosed.
    """
    log = logging.getLogger(__name__)
    try:
        # 1. Try the AMP page first.
        amp_url = url.rstrip("/") + "/amp"
        resp = requests.get(amp_url, headers=HEADERS, timeout=12)

        # 2. No AMP page: fall back to the canonical page.
        if resp.status_code == 404:
            resp = requests.get(url, headers=HEADERS, timeout=12)

        # 3. Still missing or forbidden: nothing to collect for this URL.
        if resp.status_code in (403, 404):
            log.debug("skipped %s: HTTP %s", url, resp.status_code)
            return None
        resp.raise_for_status()

        # 4. Parse the already-downloaded HTML with newspaper3k.
        art = Article(url, language='ko')
        art.set_html(resp.text)
        art.parse()
        if not art.title or not art.text:
            log.debug("skipped %s: parsed article has no title or no text", url)
            return None

        return {
            "url": url,
            "title": art.title,
            "text": art.text,
            "publish_date": art.publish_date
        }
    except Exception as exc:
        # Keep the crawl going, but leave a trace instead of failing silently.
        log.warning("crawl failed for %s: %s: %s", url, type(exc).__name__, exc)
        return None

def crawl_gen_articles_range(date_str, start_id, end_id, *,  # end_id is exclusive (Python range convention)
                             max_workers=36, batch_size=1000, sleep_between=0.05,
                             save_path=None):
    """Crawl the general articles whose id lies in [start_id, end_id).

    Article URLs are built as ``BASE_URL + date_str + <id zero-padded to 5
    digits>`` and each one is handed to ``crawl_url`` on a thread pool.
    URLs are submitted in batches of ``batch_size``; the next batch is only
    submitted once every future of the current batch has been consumed.

    Args:
        date_str: Date part of the article URL (e.g. ``"20250822"``).
        start_id: First article id to crawl (inclusive).
        end_id: Article id at which to stop (exclusive).
        max_workers: Size of the thread pool.
        batch_size: Number of URLs submitted to the pool at once.
        sleep_between: Seconds slept in the consuming loop after each
            finished future is handled. The whole batch is already submitted
            at that point, so this paces how quickly the next batch starts
            rather than spacing individual requests.
        save_path: If given, the resulting frame is written there as CSV
            (without the index).

    Returns:
        pandas.DataFrame with one row per successfully parsed article,
        de-duplicated on ``url``. When nothing was collected the frame is
        empty but still carries the columns ``url``, ``title``, ``text`` and
        ``publish_date`` so that the returned frame and the saved CSV keep a
        stable schema.
    """
    # Schema used only for the empty result; rows otherwise carry their own keys.
    empty_columns = ["url", "title", "text", "publish_date"]

    results, collected = [], 0
    total = end_id - start_id

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        with tqdm(total=total, desc=f"{date_str} [{start_id:05}-{end_id-1:05}]") as pbar:
            for start in range(start_id, end_id, batch_size):
                end = min(start + batch_size, end_id)
                urls = [f"{BASE_URL}{date_str}{num:05}" for num in range(start, end)]
                futures = [ex.submit(crawl_url, u) for u in urls]

                for fut in as_completed(futures):
                    res = fut.result()
                    if res:
                        results.append(res)
                        collected += 1
                    pbar.set_postfix_str(f"수집 {collected}")
                    pbar.update(1)
                    time.sleep(sleep_between)

    if results:
        df = pd.DataFrame(results).drop_duplicates(subset=["url"])
    else:
        # pd.DataFrame([]) would have no columns at all; keep the schema.
        df = pd.DataFrame(columns=empty_columns)
    if save_path:
        df.to_csv(save_path, index=False)
    tqdm.write(f"[{start_id:05}-{end_id-1:05}] 최종 수집: {len(df)}")
    return df

In [4]:
# Crawl window handled by this notebook copy; end_id is exclusive, which is
# why the output file name below uses end_id-1 as its last id.
day_str = "20250822"
start_id = 80000    # change only start_id / end_id in each notebook copy before running
end_id = 100000
out_path = f"/Users/leesangwon/Documents/ThemeStock_file/Hankyung_news/hankyung_gen_{day_str}_{start_id:05}-{end_id-1:05}.csv"

# Keyword options forwarded unchanged to the crawler.
crawl_options = {
    "max_workers": 36,
    "batch_size": 1000,
    "sleep_between": 0.05,
    "save_path": out_path,
}
df_part = crawl_gen_articles_range(day_str, start_id, end_id, **crawl_options)

df_part

20250822 [80000-99999]: 100%|██████████| 20000/20000 [18:21<00:00, 18.16it/s, 수집 0]

[80000-99999] 최종 수집: 0



