# 한국경제 기사 크롤링

### import 및 기본 정의

In [1]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from newspaper import Article
from tqdm import tqdm

# Common URL prefix for article pages; the crawl functions in this notebook
# append a date string and an article number (plus an optional suffix) to it.
BASE_URL = "https://www.hankyung.com/article/"

def crawl_url(url):
    """Download and parse a single article URL.

    Args:
        url: Full URL of the page to fetch.

    Returns:
        A dict with the keys ``url``, ``title``, ``text`` and
        ``publish_date`` taken from the parsed article, or ``None`` when
        downloading/parsing raises an error or the parsed title is empty.
    """
    try:
        article = Article(url, language='ko')
        article.download()
        article.parse()
        # NOTE(review): the df_general preview shown later in this notebook
        # contains rows whose title is just the site name and whose
        # publish_date is missing; those look like non-article pages that
        # this emptiness check does not filter out. Confirm and tighten.
        if not article.title:
            return None
        return {
            "url": url,
            "title": article.title,
            "text": article.text,
            "publish_date": article.publish_date
        }
    except Exception:
        # Best-effort: callers probe enumerated URLs, so a failed download or
        # parse simply means "no article here". Catch Exception rather than
        # using a bare except so KeyboardInterrupt/SystemExit still propagate.
        return None

In [2]:
# Target date passed to the crawl functions below, where it becomes part of
# both the article URLs and the output CSV file names.
# NOTE(review): looks like YYYYMMDD — confirm.
day_str = '20250818'

### i 기사

In [3]:
def crawl_i_articles(date_str, max_workers=40,
                     output_dir="/Users/leesangwon/Documents/ThemeStock_file"):
    """Crawl the "i"-suffixed articles for one date and save them as CSV.

    Probes ``{BASE_URL}{date_str}{num}i`` for ``num`` in 0..9999 using a
    thread pool, keeps every non-empty ``crawl_url`` result, and writes the
    collected rows to ``hankyung_i_{date_str}.csv`` inside ``output_dir``.

    Args:
        date_str: Date string embedded in the URLs and the CSV file name.
        max_workers: Number of worker threads used for the crawl.
        output_dir: Directory that receives the CSV. Defaults to the path
            that was previously hard-coded; it is created if missing.

    Returns:
        ``df.head()`` of the collected DataFrame, i.e. only a preview. The
        full result set is available from the CSV file.

    Raises:
        OSError: If ``output_dir`` cannot be created or the CSV cannot be
            written.
    """
    # Create the output directory up front so an unusable path fails before
    # the long crawl instead of after it, when the results would be lost.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"hankyung_i_{date_str}.csv")

    # NOTE(review): num is not zero-padded, so small numbers give shorter
    # IDs — confirm whether the site expects fixed-width article numbers.
    urls = [f"{BASE_URL}{date_str}{num}i" for num in range(10000)]
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(crawl_url, url) for url in urls]

        # Collect in completion order; crawl_url returns None for misses.
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"I {date_str}"):
            res = future.result()
            if res:
                results.append(res)

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    return df.head()

# Usage: crawl the "i" articles for day_str. The function writes the CSV
# itself and returns only df.head(), so df_i is a preview, not the full set.
df_i = crawl_i_articles(day_str)

I 20250818: 100%|██████████| 10000/10000 [01:43<00:00, 96.48it/s]


### g 기사

In [4]:
def crawl_g_articles(date_str, max_workers=40,
                     output_dir="/Users/leesangwon/Documents/ThemeStock_file"):
    """Crawl the "g"-suffixed articles for one date and save them as CSV.

    Probes ``{BASE_URL}{date_str}{num}g`` for ``num`` in 0..9999 using a
    thread pool, keeps every non-empty ``crawl_url`` result, and writes the
    collected rows to ``hankyung_g_{date_str}.csv`` inside ``output_dir``.

    Args:
        date_str: Date string embedded in the URLs and the CSV file name.
        max_workers: Number of worker threads used for the crawl.
        output_dir: Directory that receives the CSV. Defaults to the path
            that was previously hard-coded; it is created if missing.

    Returns:
        ``df.head()`` of the collected DataFrame, i.e. only a preview. The
        full result set is available from the CSV file.

    Raises:
        OSError: If ``output_dir`` cannot be created or the CSV cannot be
            written.
    """
    # Create the output directory up front so an unusable path fails before
    # the long crawl instead of after it, when the results would be lost.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"hankyung_g_{date_str}.csv")

    # NOTE(review): num is not zero-padded, so small numbers give shorter
    # IDs — confirm whether the site expects fixed-width article numbers.
    urls = [f"{BASE_URL}{date_str}{num}g" for num in range(10000)]
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(crawl_url, url) for url in urls]

        # Collect in completion order; crawl_url returns None for misses.
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"G {date_str}"):
            res = future.result()
            if res:
                results.append(res)

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    return df.head()

# Usage: crawl the "g" articles for day_str. The function writes the CSV
# itself and returns only df.head(), so df_g is a preview, not the full set.
df_g = crawl_g_articles(day_str)

G 20250818: 100%|██████████| 10000/10000 [01:44<00:00, 96.10it/s]


### 일반 기사

In [5]:
def crawl_general_articles(date_str, max_workers=40,
                           output_dir="/Users/leesangwon/Documents/ThemeStock_file"):
    """Crawl the un-suffixed (general) articles for one date and save as CSV.

    Probes ``{BASE_URL}{date_str}{num}`` for ``num`` in 0..99999 using a
    thread pool, keeps every non-empty ``crawl_url`` result, and writes the
    collected rows to ``hankyung_general_{date_str}.csv`` inside
    ``output_dir``.

    Args:
        date_str: Date string embedded in the URLs and the CSV file name.
        max_workers: Number of worker threads used for the crawl.
        output_dir: Directory that receives the CSV. Defaults to the path
            that was previously hard-coded; it is created if missing.

    Returns:
        ``df.head()`` of the collected DataFrame, i.e. only a preview. The
        full result set is available from the CSV file.

    Raises:
        OSError: If ``output_dir`` cannot be created or the CSV cannot be
            written.
    """
    # Create the output directory up front so an unusable path fails before
    # the long crawl instead of after it, when the results would be lost.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"hankyung_general_{date_str}.csv")

    # NOTE(review): num is not zero-padded, so small numbers give shorter
    # IDs — confirm whether the site expects fixed-width article numbers.
    urls = [f"{BASE_URL}{date_str}{num}" for num in range(100000)]
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(crawl_url, url) for url in urls]

        # Collect in completion order; crawl_url returns None for misses.
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"General {date_str}"):
            res = future.result()
            if res:
                results.append(res)

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    return df.head()

# Usage: crawl the general articles for day_str. The function writes the CSV
# itself and returns only df.head(), so df_general is a preview, not the full set.
df_general = crawl_general_articles(day_str)

General 20250818: 100%|██████████| 100000/100000 [13:49<00:00, 120.53it/s]


---

In [6]:
# Show the preview frame assigned from crawl_general_articles above.
df_general

Unnamed: 0,url,title,text,publish_date
0,https://www.hankyung.com/article/2025081847244,한국경제,트럼프 대통령이 푸틴 러시아 대통령과 만났습니다. 회담은 뉴욕 증시가 마감할 무렵 ...,NaT
1,https://www.hankyung.com/article/2025081847254,한국경제,트럼프 대통령이 푸틴 러시아 대통령과 만났습니다. 회담은 뉴욕 증시가 마감할 무렵 ...,NaT
2,https://www.hankyung.com/article/2025081847727,"美 트럼프 무역장벽에 '도미노', 韓 직격탄…규제 철강 집중 [영상]",영상 모듈 닫기\n\n/사진=뉴스1\n\n미국이 수입 철강 제품에 50%의 품목 관...,2025-08-18 06:46:02+09:00
3,https://www.hankyung.com/article/2025081847767,"""10년 전 격전 재연한다""…'애플왕국' 맹공하는 삼성전자","갤럭시 폴드7 등 선보이며\n\n미국 내 점유율 크게 올려\n\n\n\nCNBC ""...",2025-08-18 06:41:11+09:00
4,https://www.hankyung.com/article/2025081847734,한국경제,트럼프 대통령이 푸틴 러시아 대통령과 만났습니다. 회담은 뉴욕 증시가 마감할 무렵 ...,NaT


In [4]:
from newspaper import Article

In [5]:
# Single CNBC article URL used to try out newspaper's Article by hand.
base = "https://www.cnbc.com/2025/08/10/ai-agents-drafted-into-cybersecurity-defense-forces-of-companies.html"

In [6]:
# Download and parse the single URL in `base`, then keep the title, body text
# and publish date in module-level names for the next cell to print.
# NOTE(review): language='ko' is passed although the page is on cnbc.com and
# the printed title is English — confirm whether this should be 'en'.
article = Article(base, language='ko')

article.download()
article.parse()
title = article.title
text = article.text
date = article.publish_date

In [None]:
# Emit the parsed title, body text and publish date, one per line.
print(title, text, date, sep="\n")

'AI agents are being drafted into the cyber defense forces of corporations'