In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
from datetime import datetime # 날짜 관련 연산
from tqdm.notebook import tqdm
import time
import re # 정규 표현식

In [2]:
def ex_tag(sid, page, date):
    ### 뉴스 분야(sid)와 페이지(page)를 입력하면 그에 대한 링크들을 리스트로 추출하는 함수 ###

    ## 1.
    url = f"https://news.naver.com/main/list.naver?mode=LS2D&mid=shm&sid2={sid}&sid1=105"\
    f"&date={date}&page={page}"
    html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"\
    "(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "\
    "Chrome/110.0.0.0 Safari/537.36"})
    soup = BeautifulSoup(html.text, "lxml")
    a_tag = soup.find_all("a", {'class':'nclicks(itn.2ndcont)'})
    
    ## 2.
    url_list = []
    for a in a_tag:
        url_list.append(a["href"])
                
    return url_list

In [3]:
def re_tag(sid, date):
    ### 특정 분야의 5페이지까지의 뉴스의 링크를 수집하여 중복 제거한 리스트로 변환하는 함수 ###
    re_lst = []
    for i in tqdm(range(5)):
        lst = ex_tag(sid, i+1, date)
        re_lst.extend(lst)

    # 중복 제거
    re_set = set(re_lst)
    re_lst = list(re_set)
    
    return re_lst

In [4]:
all_hrefs = {}
sids = [731, 226, 227, 230, 732, 283, 228]  # 분야 리스트
dates = [20231027, 2031026, 20231025]

# 각 분야별로 링크 수집해서 딕셔너리에 저장
for sid in sids:
    for date in dates:
        sid_data = re_tag(sid, date)
        all_hrefs[sid] = sid_data

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
def art_crawl(all_hrefs, sid, index):

    art_dic = {}
    
    ## 1.
    title_selector = "#title_area > span"
    date_selector = "#ct > div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span"
    main_selector = "#dic_area"
    press_selector = "#contents > div.media_end_linked_more > div > a > em"
  
    url = all_hrefs[sid][index]
    html = requests.get(url, headers = {"User-Agent": "Mozilla/5.0 "\
    "(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"\
    "Chrome/110.0.0.0 Safari/537.36"})
    soup = BeautifulSoup(html.text, "lxml")
    
    ## 2.
    # 제목 수집
    title = soup.select(title_selector)
    title_lst = [t.text for t in title]
    title_str = "".join(title_lst)
    
    # 날짜 수집
    date = soup.select(date_selector)
    date_lst = [d.text for d in date]
    date_str = "".join(date_lst)
    
    # 본문 수집
    main = soup.select(main_selector)
    main_lst = []
    for m in main:
        m_text = m.text
        m_text = m_text.strip()
        main_lst.append(m_text)
    main_str = "".join(main_lst)
    
    # 출판사 수집
    press = soup.select(press_selector)
    press_lst = [p.text for p in press]
    press_str = "".join(press_lst)
    
    ## 3.
    art_dic["date"] = date_str
    art_dic["press"] = press_str
    art_dic["title"] = title_str
    art_dic["document"] = main_str
    
    return art_dic

In [6]:
# 모든 섹션의 데이터 수집 (제목, 날짜, 본문, section, url)
section_lst = [731, 226, 227, 230, 732, 283, 228]
artdic_lst = []

for section in tqdm(section_lst):
    for i in tqdm(range(len(all_hrefs[section]))):
        art_dic = art_crawl(all_hrefs, section, i)
        art_dic["category"] = section
        art_dic["link"] = all_hrefs[section][i]
        artdic_lst.append(art_dic)

df = pd.DataFrame(artdic_lst)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [7]:
def category(x):
    if x == 731:
        return '모바일'
    elif x == 226:
        return '인터넷/SNS'
    elif x == 227:
        return '통신/뉴미디어'
    elif x == 230:
        return 'IT 일반'
    elif x == 732:
        return '보안/해킹'
    elif x == 283:
        return '컴퓨터'
    else:
        return '과학 일반'

df['category'] = df['category'].apply(category)
df['date'] = df['date'].apply(lambda x:x[:19])

In [8]:
df.to_excel('자연어처리_과제_최재윤.xlsx', index=False, encoding='utf-8')