In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm

In [2]:
# News outlets whose articles are kept; list entries from any other outlet are skipped.
press_list = [
    '경향신문', '국민일보', '뉴스1', '뉴시스', '동아일보', '문화일보', '서울신문',
    '세계일보', '연합뉴스', '조선일보', '중앙일보', '한겨레', '한국일보',
]

# A body line containing any of these markers is dropped from the article text.
stop_content = [
    '무단전재',
    '재배포금지',
    '저작권자 ⓒ 서울신문사',
    '무단복제 및 전재',
    '무단 전재 및 재배포',
    '제보는 카톡',
    '☞',
    '무단 전재-재배포',
    '▶연합뉴스 앱 지금 바로 다운받기~',
]

# Commented-out alternative: a list of special characters (currently not used).
# rep_list = [u'\xa0', u'\u2027', u'\u30fb', u'\u2024', u'\uff65', u'\u2014', u'\u22c5', u'\u207a', u'\u2219', u'\u2e31', u'\u200d']
# Substrings deleted from the body lines that are kept.
rep_list = ['기사내용 요약']

# An article whose title contains any of these tags is skipped.
stop_title = ['[부고]', '[인사]', '[표]', '[모멘트]', '[궂긴소식]']

# 함수 정리

In [3]:
def create_soup(url, max_retries=10, timeout=10):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is attempted up to ``max_retries`` times. Any exception raised
    while fetching or parsing (connection error, HTTP error status, ...)
    counts as one failed attempt.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection fails the attempt instead of blocking forever.

    Returns:
        The BeautifulSoup object built from the response body with the
        'html.parser' backend and 'cp949' passed as ``from_encoding``.

    Raises:
        RuntimeError: If every attempt failed. The error of the last attempt
            is chained as ``__cause__``.
    """
    last_error = None
    for _ in range(max_retries):
        try:
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()
            return BeautifulSoup(res.content, 'html.parser', from_encoding='cp949')
        except Exception as exc:
            # Only ``Exception`` is retried: KeyboardInterrupt / SystemExit must
            # still be able to stop a long-running scrape.
            last_error = exc

    # Report the real cause instead of failing on an unbound local variable.
    raise RuntimeError(f'failed to fetch {url} after {max_retries} attempts') from last_error

In [4]:
def content_scraper(link):
    """Download one article page and extract its cleaned body and byline.

    Args:
        link: URL of the article page.

    Returns:
        ``(content, journalist)`` where ``content`` is the body text joined
        with single spaces and ``journalist`` is the byline ending in ' 기자'.
        ``False`` is returned when the article should be skipped: the page has
        no byline span, the byline is empty or contains '입력', the page has
        fewer than two ``<section>`` elements, or the cleaned body is shorter
        than 200 characters.
    """
    article_soup = create_soup(link)

    # Skip articles without a journalist byline.
    # NOTE(review): assumes the first 'div>span' match is the byline slot - confirm against the page markup.
    byline_spans = article_soup.select('div>span')
    if not byline_spans:
        # Unexpected layout: skip this article rather than raise IndexError.
        return False
    journalist = byline_spans[0].text.split('기자')[0].strip() + ' 기자'
    if '입력' in journalist or journalist == ' 기자':
        return False

    # Collect the body paragraphs from the second <section> of the page.
    sections = article_soup.find_all('section')
    if len(sections) < 2:
        # Unexpected layout: skip this article rather than raise IndexError.
        return False
    paragraphs = sections[1].find_all(True, attrs={'dmcf-ptype': 'general'})

    # Split every paragraph into non-empty, stripped lines.
    lines = [
        line.strip()
        for para in paragraphs
        for line in para.text.split('\n')
        if line.strip() != ''
    ]

    kept = []
    for line in lines:
        # Drop lines carrying any boilerplate marker from stop_content.
        if any(marker in line for marker in stop_content):
            continue
        for rep in rep_list:
            line = line.replace(rep, '')
        kept.append(line)

    content = ' '.join(kept)
    if len(content) < 200:
        return False
    return content, journalist

In [5]:
def get_articles(date, category):
    """Scrape every listed article of one day and category into a DataFrame.

    The list pages at ``https://news.daum.net/breakingnews/{category}`` are
    walked from page 1 to the last page reported by the site. Each entry is
    kept only if its outlet is in ``press_list``, its title contains no
    ``stop_title`` tag and ``content_scraper`` accepts the article page.

    Args:
        date: Day to scrape, formatted as 'YYYYMMDD' (sent as ``regDate``).
        category: Category path segment of the list URL, e.g. 'society'.

    Returns:
        DataFrame with the columns 'arcid', 'press', 'title', 'content',
        'journalist', 'date', 'link' and 'category', one row per kept article.
        'date' and 'arcid' are the first 8 and the remaining characters of the
        last path segment of the article link.

    Raises:
        ValueError: If the page counter element is missing from the list page
            or contains no digits.
        RuntimeError or other errors raised by ``create_soup`` /
        ``content_scraper`` are propagated unchanged.
    """
    # Number of trailing 'tit_thumb' entries ignored on every list page.
    # NOTE(review): presumably these are non-article widgets - confirm against the page markup.
    trailing_skip = 4

    # Find the number of the last list page.
    # NOTE(review): relies on the site answering page=999 with its real last page - confirm.
    url = f'https://news.daum.net/breakingnews/{category}?page=999&regDate={date}'
    soup = create_soup(url)
    last_page = soup.find("em", attrs={"class": "num_page"})
    if last_page is None:
        raise ValueError(f'no page counter found for category {category} on {date}')
    last_page_num = int(re.sub(r'[^0-9]', '', last_page.text))

    articles = []
    for page in tqdm(range(1, last_page_num + 1), desc=date):
        url = f'https://news.daum.net/breakingnews/{category}?page={page}&regDate={date}'
        soup = create_soup(url)
        news_list = soup.find_all('strong', attrs={'class': 'tit_thumb'})
        # Label stored in the 'category' column: last path segment of the list URL.
        category_label = url.split('/')[-1].split('?')[0]

        # Handle the listed articles one by one.
        for item in news_list[:max(len(news_list) - trailing_skip, 0)]:
            source = item.find('span')
            if source is None:
                continue
            press = source.text.split('·')[0].strip()
            if press not in press_list:
                continue

            anchor = item.find('a')
            if anchor is None:
                continue
            title = anchor.text
            if any(tag in title for tag in stop_title):
                continue

            link = anchor['href']
            article_key = link.split('/')[-1]
            # Kept in separate names: the ``date`` / ``category`` arguments
            # must stay intact because the next page URL is built from them.
            article_date = article_key[:8]
            arcid = article_key[8:]

            scraped = content_scraper(link)
            if not scraped:
                continue
            content, journalist = scraped

            articles.append([arcid, press, title, content, journalist, article_date, link, category_label])

    return pd.DataFrame(
        articles,
        columns=['arcid', 'press', 'title', 'content', 'journalist', 'date', 'link', 'category'],
    )

# 기간 설정 스크래핑

In [6]:
from datetime import date
from dateutil.rrule import rrule, DAILY

In [7]:
# First and last day to scrape; the range is inclusive, so equal values mean a single day.
start_date = date(year=2021, month=12, day=6)
end_date = date(year=2021, month=12, day=6)

In [8]:
# Category scraped by this run; also used in the output file names.
CATEGORY = 'society'

# Days whose scrape failed, formatted as 'YYYYMMDD'.
log = []
for date_dt in rrule(DAILY, dtstart=start_date, until=end_date):
    # Formatted outside the try block so the handler always sees this iteration's date.
    date_s = date_dt.strftime('%Y%m%d')
    try:
        df_day = get_articles(date_s, CATEGORY).reset_index(drop=True)
        df_day.to_csv(f'../data/{date_s}_{CATEGORY}.csv', index=False)
    except Exception as exc:
        # The day failed for good: record it and keep going with the next day.
        # ``Exception`` (not a bare except) keeps Ctrl-C able to stop the run.
        log.append(date_s)
        print(f'{date_s}: {exc!r}')

if len(log) > 0:
    logdf = pd.DataFrame(log)
    logdf.to_csv(f'../data/faillog_{CATEGORY}.csv', index=False)

20211206: 100%|██████████████████████████████████████████████████████████████████████| 471/471 [08:17<00:00,  1.06s/it]
