## 주가 관련 기사 크롤링

In [196]:
from ssl import SSLError
import pandas as pd
import datetime
from urllib import parse
from urllib.error import URLError
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import socket
import random
import argparse
import datetime
import pandas as pd
import requests
import nltk
import warnings
import random
# Suppress UserWarning messages that originate from the bs4 module.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [232]:
def crawl(query, save_as, begin, end, sort=0, field=1, delay=0.5, timeout=30, page_limit=50):
    '''
    Fetch the first NAVER news search result page and return its top hit.

    Only the first result page is requested and only the first hit is kept;
    callers read the result with ``df.iat[0, 0..2]``.

    :param query: search term for NAVER's news tab
    :param save_as: path for saving results (accepted for compatibility; nothing is written by this function)
    :param begin: start of the search period, forwarded to make_url
    :param end: end of the search period, forwarded to make_url
    :param sort: result ordering -> 0 (relevance) 1 (newest) 2 (oldest)
    :param field: search area -> 0 (everything) 1 (title)
    :param delay: (optional) pause between search requests (seconds)
    :param timeout: (optional) time to wait on a timeout (seconds)
    :param page_limit: (optional) number of result pages that may be visited;
        a value below 1 means no request is made
    :return: a one-row DataFrame with the columns naver_news_title,
        naver_news_articles and naver_news_img_url. Every cell is ' ' when the
        page could not be fetched, when page_limit is below 1, or when the page
        lacks a title, a summary or a thumbnail.
    '''
    columns = ['naver_news_title', 'naver_news_articles', 'naver_news_img_url']

    def one_row(title=" ", article=" ", img_url=" "):
        # Callers index row 0 unconditionally, so a row is always produced.
        return pd.DataFrame([(title, article, img_url)], columns=columns)

    if page_limit < 1:
        return one_row()

    # Result pages are addressed by the index of their first hit; page one starts at 1.
    url = make_url(query, sort, field, begin, end, 1)
    bsobj = make_bsobj(url, delay, timeout, trial=10)
    if bsobj is None:
        # make_bsobj already retried; asking again for the same page would never end.
        return one_row()

    titles = get_naver_news_title(bsobj)
    summaries = get_naver_news_article_one(bsobj)
    try:
        img_urls = get_article_img_url(bsobj)
    except KeyError:
        # A thumbnail tag without a src attribute: keep the hit, leave the image blank.
        img_urls = [" "]

    # A hit is reported only when all three pieces are present on the page.
    if not (titles and summaries and img_urls):
        return one_row()
    return one_row(titles[0], summaries[0], img_urls[0])

In [174]:
def get_max_index(bsobj):
    '''
    Report whether the parsed result page contains a "next" paging link.

    :param bsobj: parsed page exposing find_all
    :return: True when at least one <a> tag with class "btn_next" is found;
        otherwise a warning is printed and None is returned
    '''
    next_links = bsobj.find_all('a', 'btn_next')
    if next_links:
        return True
    print('(WARNING!) no results found')
    return None

In [175]:
def make_naver_news_urls(bsobj):
    '''
    Collect the href of every <a> tag found with the "news_tit" class filter.

    :param bsobj: parsed search result page exposing find_all
    :return: list of href values, in the order find_all yields the tags
    '''
    hrefs = []
    for anchor in bsobj.find_all('a', 'news_tit'):
        hrefs.append(anchor['href'])
    return hrefs

def get_naver_news_title(bsobj):
    '''
    Collect the text of every <a> tag found with the "news_tit" class filter.

    :param bsobj: parsed search result page exposing find_all
    :return: list of the tags' .text values, in the order find_all yields them
    '''
    titles = []
    for anchor in bsobj.find_all('a', 'news_tit'):
        titles.append(anchor.text)
    return titles

def get_naver_news_article_one(bsobj):
    '''
    Collect the text of every <a> tag found with the
    "api_txt_lines dsc_txt_wrap" class filter.

    :param bsobj: parsed search result page exposing find_all
    :return: list of the tags' .text values, in the order find_all yields them
    '''
    snippets = []
    for anchor in bsobj.find_all('a', 'api_txt_lines dsc_txt_wrap'):
        snippets.append(anchor.text)
    return snippets

def get_article_img_url(bsobj):
    '''
    Collect the src of every <img> tag found with the "thumb api_get" class filter.

    :param bsobj: parsed search result page exposing find_all
    :return: list of src values, in the order find_all yields the tags
    '''
    sources = []
    for image in bsobj.find_all('img', 'thumb api_get'):
        sources.append(image['src'])
    return sources

def get_naver_news_article(url, news_bsobj, timeout=30):
    '''
    Download a page and return its visible text as a single string.

    <script> and <style> elements are removed, every line is stripped, lines
    are further split on double spaces, and the non-empty pieces are joined
    with single spaces.

    :param url: address of the page to download
    :param news_bsobj: unused; kept so existing call sites keep working
    :param timeout: (optional) seconds before the download is abandoned
    :return: the normalised text, or None when downloading or parsing fails
    '''
    try:
        # The context manager closes the HTTP response even when read() fails.
        with urlopen(url, timeout=timeout) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style"]):
            tag.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return ' '.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        # Best effort: one unreadable article must not stop a crawl, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('(Error)', e)
        return None

In [176]:
def make_url(query, sort, field, begin, end, page):
    '''
    Build the NAVER news search URL for one result page.

    :param query: search term; percent-encoded with urllib.parse.quote
    :param sort: integer written to the "sort" parameter
    :param field: integer written to the "field" parameter
    :param begin: period start; used as-is for "ds" and with dots removed inside "nso"
    :param end: period end; used as-is for "de" and with dots removed inside "nso"
    :param page: value written to the "start" parameter
    :return: the assembled URL string
    '''
    pieces = [
        "https://search.naver.com/search.naver?&where=news&query=",
        parse.quote(query),
        "&sort=%i" % sort,
        "&field=%i" % field,
        "&ds=", begin,
        "&de=", end,
        "&nso=so:r,p:",
        "from", begin.replace(".", ""),
        "to", end.replace(".", ""),
        "&start=", str(page),
    ]
    return "".join(pieces)

In [177]:
def make_bsobj(url, delay=0.5, timeout=30, trial=10):
    '''
    Download a page with a random User-Agent header and parse it with lxml.

    Before each attempt the function sleeps ``delay`` plus a random fraction of
    a second. When the request fails with URLError, SSLError or socket.timeout
    the error is printed, the function sleeps ``timeout`` seconds and tries
    again, up to ``trial`` failed attempts.

    :param url: address to download
    :param delay: (optional) base pause before each request (seconds)
    :param timeout: (optional) request timeout, also the pause after a failure (seconds)
    :param trial: (optional) number of failed attempts tolerated
    :return: the parsed page, or None once every attempt has failed
    '''
    ua = UserAgent(verify_ssl=False)
    failures = 0

    while failures < trial:
        try:
            time.sleep(delay + random.random())
            request = Request(url=url, headers={'User-Agent': ua.random})
            # Close the HTTP response as soon as it is parsed so that retries
            # and repeated calls do not accumulate open connections.
            with urlopen(request, timeout=timeout) as response:
                return BeautifulSoup(response, 'lxml')
        except (URLError, SSLError, socket.timeout) as e:
            print('(Error)', e)
            print('reloading...')
            failures += 1
            time.sleep(timeout)
    return None

In [178]:
# Scratch check: fetch one hard-coded NAVER news search URL and look for a "next" paging link.
url = 'https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&photo=0&field=1&pd=3&ds=2015.01.01&de=2015.01.30&cluster_rank=13&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20150101to20150130,a:t&start=11'
delay = 0.5
timeout = 30
bsobj = make_bsobj(url, delay, timeout, trial=10)
# make_bsobj returns None once its retries are exhausted; calling
# get_max_index on None would raise AttributeError, so treat that as "no link".
max_index = get_max_index(bsobj) if bsobj is not None else None
if max_index is None:
    print(1)

In [179]:
def get_arguments():
    '''
    Parse the crawler's command-line options from sys.argv.

    --query, --begin and --end are required strings; --sort and --field are
    optional integers defaulting to 0 and 1.

    :return: the argparse namespace holding query, begin, end, sort and field
    '''
    required_text_options = (
        ('--query', 'query to search on NAVER'),
        ('--begin', 'crawling begin point (%%Y.%%m.%%d format)'),
        ('--end', 'crawling end point (%%Y.%%m.%%d format)'),
    )
    optional_int_options = (
        ('--sort', 0, 'search result sorting: 0(relevant), 1(newest), 2(oldest)'),
        ('--field', 1, 'search field: 0(all), 1(title)'),
    )

    cli = argparse.ArgumentParser()
    for flag, help_text in required_text_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    for flag, fallback, help_text in optional_int_options:
        cli.add_argument(flag, type=int, default=fallback, help=help_text)
    return cli.parse_args()

In [180]:
def ran_num(n):
    '''
    Return a random integer between 1 and n, both ends included.

    The previous retry loop tested ``n in ls`` against a list that was always
    empty, so it never ran; the function has always returned the first draw.

    :param n: upper bound of the draw (inclusive)
    :return: a single value from random.randint(1, n)
    '''
    return random.randint(1, n)

In [206]:
# Load the input table, using the first CSV column as the index; other cells
# read its 'stock', 'start_date' and 'end_date' columns.
df = pd.read_csv('news_data.csv', index_col=0)

In [188]:
# Try crawl() on a single query.
# NOTE(review): the begin date (2021.10.12) is later than the end date
# (2021.06.06) — confirm the range is intended.
df_e = crawl('엔씨소프트', './test.xlsx', '2021.10.12', '2021.06.06')

making url https://search.naver.com/search.naver?&where=news&query=%EC%97%94%EC%94%A8%EC%86%8C%ED%94%84%ED%8A%B8&sort=0&field=1&ds=2021.10.12&de=2021.06.06&nso=so:r,p:from20211012to20210606&start=1
['https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F009%2F2021%2F10%2F12%2F4863710.jpg&type=ff264_180&expire=2&refresh=true', 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F5492%2F2021%2F10%2F12%2F189019.jpg&type=ofullfill264_180_gray&expire=2&refresh=true', 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F5325%2F2021%2F10%2F12%2F245690.jpg&type=ofullfill264_180_gray&expire=2&refresh=true', 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F5268%2F2021%2F10%2F12%2F1543368.jpg&type=ff264_180&expire=2&refresh=true', 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F055%2F2021%2F10

In [207]:
# Rewrite both date columns from 'YYYY-MM-DD' to the dotted 'YYYY.MM.DD' form
# that is passed to crawl() as begin/end.
df['start_date'] = [
    datetime.datetime.strptime(day, '%Y-%m-%d').strftime('%Y.%m.%d')
    for day in df['start_date']
]
df['end_date'] = [
    datetime.datetime.strptime(day, '%Y-%m-%d').strftime('%Y.%m.%d')
    for day in df['end_date']
]

In [233]:
# One crawl per input row; keep the first row of each result
# (columns 0/1/2 are title, summary text and thumbnail URL).
news_title, news_article, news_img_url = [], [], []
for index, row in df.iterrows():
    df_e = crawl(row['stock'], './test.xlsx', row['start_date'], row['end_date'])
    news_title.append(df_e.iloc[0, 0])
    news_article.append(df_e.iloc[0, 1])
    news_img_url.append(df_e.iloc[0, 2])



In [243]:
# Load the problem table and attach the crawled values as three new columns;
# the lists must have one entry per row of the table.
problem_data = pd.read_csv('./problem_data.csv', index_col=0).assign(
    news_title=news_title,
    news_article=news_article,
    news_img_url=news_img_url,
)

In [253]:
# Remove the rows whose title is the single-space placeholder (no article found).
drop_index = problem_data.index[problem_data['news_title'] == ' ']
problem_data.drop(index=drop_index, inplace=True)

In [258]:
# Produces a renumbered copy only: the result is not assigned, so problem_data
# itself keeps the index left over from the drop.
# NOTE(review): if result.csv should carry a fresh 0..n-1 index, assign the
# result back to problem_data — confirm intent.
problem_data.reset_index(drop=True)

Unnamed: 0,code,stock,image,answer,explanation,news_title,news_article,news_img_url
0,036570,엔씨소프트,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"엔씨소프트 '리니지2M', 내달 2일 해외 29개국 출시",국내 출시 2주년 기념 업데이트도 실시 엔씨소프트는 모바일 다중접속역할수행게임(MM...,https://search.pstatic.net/common/?src=https%3...
1,011785,금호석유우,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"금호석유·금호석유우, 희비 엇갈려…금호석유 1% 하락 반해 금호석유우 1% 상...",금호석유와 금호석유우의 희비가 엇갈리고 있다. 금호석유는 4일 코스피 시장에서 오후...,https://search.pstatic.net/common/?src=https%3...
2,302440,SK바사,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"與 ""모더나 어디까지 믿을지 자신 없어…SK바사 백신 3상 곧 승인""","게 SK바이오사이언스로, 곧 식품의약품안전처로부터 3상 승인을 받을 것""이라며 ""임...",https://search.pstatic.net/common/?src=https%3...
3,000725,현대건설우,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,[투자심리과열 종목] 상승 TOP5 웹케시·삼양홀딩스우·SUN&L·코스모신소재·...,"또 코스모신소재 주가는 전일 대비 5.09% 오른 2만3750원, 현대건설우는 전일...",https://search.pstatic.net/common/?src=https%3...
4,010145,삼성중공우,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,[특징주] 삼성중공우 29% 급등세 '초대형 컨테이너선 5척 수주',삼성중공우 29% 급등세 '초대형 컨테이너선 5척 수주' 삼성중공우가 강세다. 12...,https://search.pstatic.net/common/?src=https%3...
5,010145,삼성중공우,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"[특징주] 삼성중공우, 3거래일 연속 상승세",삼성중공우 주가가 연일 상승세다. 16일 한국거래소에 따르면 삼성중공우는 11시 현...,https://search.pstatic.net/common/?src=https%3...
6,008775,호텔신라우,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"유통주 상승 우세, 호텔신라우 엔에스쇼핑 이마트 오르고 신세계 내려",8일 호텔신라 우선주 주가는 전날보다 1.52%(1300원) 상승한 8만7천 원에 ...,https://search.pstatic.net/common/?src=https%3...
7,293490,카카오게임즈,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"카카오게임즈 ‘이터널 리턴’, 다음게임 서비스 사전예약 돌입","카카오게임즈(각자대표 남궁훈, 조계현)는 PC온라인 전략생존게임 ‘이터널 리턴’의 ...",https://search.pstatic.net/common/?src=https%3...
8,28513K,SK케미칼우,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,[특징주] SK케미칼·SK케미칼우 8% 강세 '친환경 포장재 사업',SK케미칼·SK케미칼우 주가가 강세다. 2일 오후 2시 56분 기준 SK케미칼전일...,https://search.pstatic.net/common/?src=https%3...
9,263750,펄어비스,b'iVBORw0KGgoAAAANSUhEUgAABLAAAAMgCAYAAAAz4JsC...,high,,"앱코, 펄어비스와 협업…‘검은사막’ IP 활용 게이밍 기어 출시",국내 1위 게이밍 기어 기업 앱코(129890)가 펄어비스의 대표 게임 ‘검은사막’...,https://search.pstatic.net/common/?src=https%3...


In [263]:
# Write the filtered table to result.csv (relative to the working directory).
problem_data.to_csv('result.csv')