# 2020-04-04

1. 네이버 기사 제목과 주요뉴스의 내용을 중심으로 키워드 추출 -> 코로나에 대한 관심도의 정도 (의식의 정도)
2. 그렇게 추출한 키워드를 가지고 타임라인을 만들어 냄.
3. 코로나 관련 키워드와 아닌 키워드를 군집화.

# 네이버 뉴스 크롤링

2020-04-03 랭킹뉴스 url 살펴보기 

각 랭킹뉴스에는 총 30개의 기사가 있다.
 - 정치 : https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=100&date=20200403
 - 경제 : https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=101&date=20200403
 - 사회 : https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=102&date=20200403
 - 생활/문화 : https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=103&date=20200403
 - 세계 : https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=104&date=20200403
 - IT/과학 https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=105&date=20200403


기본 : https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&
 - 정치 : sectionId=100
 - 경제 : sectionId=101
 - 사회 : sectionId=102
 - 생활/문화 : sectionId=103
 - 세계 : sectionId=104
 - IT/과학 : sectionId=105


날짜 : 20200403(년월일)

# Crawling ranking news and refining the text
# Extracting keyword and multiply weight

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import time
import datetime
from konlpy.tag import Komoran

In [2]:
# 주요뉴스 헤드라인, 조회수, 내용, 링크 크롤링
def get_news(sectionid, date):
    url = "https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=" + str(sectionid) + "&date=" + str(date)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    ranking_text = soup.find_all(class_ = 'ranking_text')
    
    l = []
    for item in ranking_text:
        d = {}
        d['LinkSrc'] = item.find('a')['href']
        d['Title'] = item.find('a')['title']
        d['Views'] = item.find(class_ = "ranking_view").get_text()
        l.append(d)
    
    for link in l:
        resp = requests.get("http://news.naver.com" + link['LinkSrc'])
        soup = BeautifulSoup(resp.text, "html.parser")
        content = soup.find(id="articleBodyContents")
        link['Content'] = clean_text(content)
        
    df = pd.DataFrame(l)
    return df

In [3]:
# text 정제하기
def clean_text(text):
    content = text.get_text()
    cleaned_text = re.sub('[a-zA-Z]', '', content)
    cleaned_text = re.sub('[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace("🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("동영상 뉴스 오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    return cleaned_text

In [4]:
komoran = Komoran()
stop_words = ['에서', '으로', '도록', '기자', '다는', '지만', '라고', '남은', '습니다', '헤럴드', '의원', '대표', '뉴스룸', '뉴스', '서울경제', '뉴시스', '이다']

In [11]:
def extract_keyword(df):
    total_dict = {}
    weights = 30
    for content in df['Content']:
        # 각 기사의 키워드 counting순으로 5개 추출
        tokens = komoran.morphs(content)#리스트형태로 반환
        tokens = [token for token in tokens if len(token) > 1 and token not in stop_words]
        d = {token : content.count(token) for token in tokens}
        d = dict(sorted(d.items(), key = operator.itemgetter(1), reverse = True)[:5])
        
        # 랭킹별로 각 기사의 키워드에 가중치 곱하고 total_dict에 합치기
        for key in d.keys():
            d[key] = d[key] * weights
            total_dict[key] = total_dict.get(key, 0) + d[key]
            
        for key in d.keys():
            l = list()
            l.append(total_dict[key])
            total_dict[key] = l
            
        weights = weights -1
        
    total_dict = dict(sorted(total_dict.items(), key = operator.itemgetter(1), reverse = True)[:10])
    return total_dict

# observe the ranking news (0120~0406)

In [12]:
def politic_keywords():
    politic_dataframe = pd.DataFrame()
    date_index = pd.date_range(start = '20200120', end = '20200406')
    date_list = date_index.strftime("%Y%m%d").tolist()
    for i in date_list:
        start = time.time()
        politic_news = get_news(100, i)
        politic_keyword = extract_keyword(politic_news)
        politic_dataframe[i] = politic_keyword
        #print(i, time.time()-start)
    return politic_dataframe

In [13]:
politic_keywords().to_csv('./politics_0406.csv', sep=',', encoding = "utf-8")

TypeError: can only concatenate list (not "int") to list

In [None]:
def economic_keywords():
    economic_dataframe = pd.DataFrame()
    date_index = pd.date_range(start = '20200120', end = '20200406')
    date_list = date_index.strftime("%Y%m%d").tolist()
    for i in date_list:
        start = time.time()
        economic_news = get_news(101, i)
        economic_keyword = extract_keyword(economic_news)
        economic_dataframe[i] = economic_keyword
        print(i, time.time()-start)
    return economic_dataframe

In [None]:
economic_keywords().to_csv('./economic_0406.csv', sep=',', encoding = "utf-8")

In [None]:
def society_keywords():
    society_dataframe = pd.DataFrame()
    date_index = pd.date_range(start = '20200120', end = '20200406')
    date_list = date_index.strftime("%Y%m%d").tolist()
    for i in date_list:
        start = time.time()
        society_news = get_news(102,i)
        society_keyword = extract_keyword(society_news)
        society_dataframe[i] = society_keyword
        print(i, time.time()-start)
    return society_dataframe

In [None]:
society_keywords().to_csv('./society_0406.csv', sep=',', encoding = "utf-8")

In [None]:
def culture_keywords():
    culture_dataframe = pd.DataFrame()
    date_index = pd.date_range(start = '20200120', end = '20200406')
    date_list = date_index.strftime("%Y%m%d").tolist()
    for i in date_list:
        start = time.time()
        culture_news = get_news(103,i)
        culture_keyword = extract_keyword(culture_news)
        culture_dataframe[i] = culture_keyword
        print(i, time.time()-start)
    return culture_dataframe

In [None]:
culture_keywords().to_csv('./culture_0406.csv', sep=',', encoding = "utf-8-sig")

In [None]:
def global_keywords():
    global_dataframe = pd.DataFrame()
    date_index = pd.date_range(start = '20200120', end = '20200406')
    date_list = date_index.strftime("%Y%m%d").tolist()
    for i in date_list:
        start = time.time()
        global_news = get_news(104,i)
        global_keyword = extract_keyword(global_news)
        global_dataframe[i] = global_keyword
        print(i, time.time()-start)
    return global_dataframe

In [None]:
global_keywords().to_csv('./global_0406.csv', sep=',', encoding = "utf-8")

In [None]:
def science_keywords():
    science_dataframe = pd.DataFrame()
    date_index = pd.date_range(start = '20200120', end = '20200406')
    date_list = date_index.strftime("%Y%m%d").tolist()
    for i in date_list:
        start = time.time()
        science_news = get_news(105,i)
        science_keyword = extract_keyword(science_news)
        science_dataframe[i] = science_keyword
        print(i, time.time()-start)
    return science_dataframe

In [None]:
science_keywords().to_csv('./science_0406.csv', sep=',', encoding = "utf-8")

# 2) 가중치 range(30,0,-1)에서 views/10000으로 바꿨을 때

In [None]:
import operator
def extract_keyword_(df):
    total_dict = {}
    index = 0
    for content in df['Content']:
        # 각 기사의 키워드 counting순으로 5개 추출
        tokens = komoran.morphs(content)
        tokens = [token for token in tokens if len(token) > 1 and token not in stop_words]
        d = {token : content.count(token) for token in tokens}
        d = dict(sorted(d.items(), key = operator.itemgetter(1), reverse = True)[:5])
        
        # 랭킹별로 각 기사의 키워드에 가중치 곱하고 total_dict에 합치기
        for key in d.keys():
            d[key] = d[key] * int((df.loc[index, 'Views']/10000))
            total_dict[key] = total_dict.get(key, 0) + d[key]
            
        index = index + 1
        
    total_dict = sorted(total_dict.items(), key = operator.itemgetter(1), reverse = True)[:10]
    return total_dict

# 3) 조회수 표준편차 구하기

# Reading csv File (0120 ~ 0406)

In [None]:
politics = pd.read_csv("./politics_0406.csv")
economic = pd.read_csv("./economic_0406.csv")
society = pd.read_csv("./society_0406.csv")
culture = pd.read_csv("./culture_0406.csv")
global_ = pd.read_csv("./global_0406.csv")
science = pd.read_csv("./science_0406.csv")

# Word Embedding 실험해보기

In [None]:
politics = get_news(100, 20200218)

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
word_list = []
for content in politics['Content']:
    # 각 기사의 키워드 counting순으로 5개 추출
    tokens = komoran.morphs(content)
    tokens = [token for token in tokens if len(token) > 1 and token not in stop_words]
    word_list.append(tokens)

In [None]:
embedding_model = Word2Vec(word_list, size=100, window = 5, min_count=2, workers=3, iter=1000, sg=1, sample=1e-3)

In [None]:
embedding_model.wv.most_similar('19')

# '코로나' keyword 그래프 그리기

In [None]:
culture_0123 = get_news(103, 20200123)
culture_0123

In [None]:
extract_keyword(culture_0123)

In [None]:
culture_0123.loc[4, 'Content']

In [None]:
culture_0123.loc[4, 'LinkSrc']

# Tf-Idf

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer() # TF-IDF 객체선언

# 판다스 라이브러리 임포트
import pandas as pd
# plotly express에 내장되어있는 gapminder 데이터프레임을 사용하겠습니다.
import plotly.express as px

In [None]:
# text 정제하기
def clean_text(text):
    content = text.get_text()
    cleaned_text = re.sub('[a-zA-Z]', '', content)
    cleaned_text = re.sub('[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace("🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("동영상 뉴스 오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    return cleaned_text

In [15]:
# 주요뉴스 헤드라인, 조회수, 내용, 링크 크롤링
def get_news(sectionid, date):
    url = "https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=" + str(sectionid) + "&date=" + str(date)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    ranking_text = soup.find_all(class_ = 'ranking_text')
    
    l = []
    for item in ranking_text:
        d = {}
        d['LinkSrc'] = item.find('a')['href']
        d['Title'] = item.find('a')['title']
        d['Views'] = item.find(class_ = "ranking_view").get_text()
        l.append(d)
    
    for link in l:
        resp = requests.get("http://news.naver.com" + link['LinkSrc'])
        soup = BeautifulSoup(resp.text, "html.parser")
        content = soup.find(id="articleBodyContents")
        link['Content'] = clean_text(content)
        
    df = pd.DataFrame(l)
    return df

In [14]:
politic = get_news('100', '20200403')
economic = get_news('101', '20200403')
society = get_news('102', '20200220')

In [None]:
komoran = Komoran()
stop_words = ['에서', '으로', '도록', '기자', '다는', '지만', '라고', '남은', '습니다', '헤럴드', '의원', '대표', '뉴스룸', '뉴스', '서울경제', '뉴시스', '이다']

In [17]:
politic_content = politic['Content']

In [30]:
def textizing(series):
    l = list()
    for content in series:
        tokens = komoran.morphs(content)#리스트형태로 반환
        tokens = [token for token in tokens if len(token) > 1 and token not in stop_words]
        gisa = ''
        for i in tokens:
            gisa= gisa + i + ' '
        l.append(gisa)
    return l

In [None]:
def tf_idf(series):
    l = list()
    for content in series:
        tokens = komoran.morphs(content)#리스트형태로 반환
        tokens = [token for token in tokens if len(token) > 1 and token not in stop_words]
        gisa = ''
        for i in tokens:
            gisa= gisa + i + ' '
        l.append(gisa)
    tfidf_vectorizer.fit(politic_text) # 단어를 학습시킴 
    tfidf_vectorizer.vocabulary_ # 단어사전을 출력 
    tfidf = sorted(tfidf_vectorizer.vocabulary_.items(), reverse = True)
    return tfidf

In [31]:
politic_text = textizing(politic_content)

In [33]:
tfidf_vectorizer.fit(politic_text) # 단어를 학습시킴 
tfidf_vectorizer.vocabulary_ # 단어사전을 출력 
sorted(tfidf_vectorizer.vocabulary_.items(), reverse = True) # 단어사전 정렬

[('힘쓰', 2380),
 ('힘들', 2379),
 ('힐난', 2378),
 ('희한', 2377),
 ('희망', 2376),
 ('희롱', 2375),
 ('흘리자美', 2374),
 ('흘리', 2373),
 ('흔히', 2372),
 ('흔들리', 2371),
 ('흔들', 2370),
 ('흑석동', 2369),
 ('흐르', 2368),
 ('휴직', 2367),
 ('휴대폰', 2366),
 ('휴대', 2365),
 ('휩싸이', 2364),
 ('훈훈', 2363),
 ('훈련', 2362),
 ('후회', 2361),
 ('후보자', 2360),
 ('후보', 2359),
 ('효과', 2358),
 ('횡포', 2357),
 ('횡단보도', 2356),
 ('회항', 2355),
 ('회자', 2354),
 ('회의', 2353),
 ('회원', 2352),
 ('회동', 2351),
 ('회견장', 2350),
 ('황교안', 2349),
 ('활용', 2348),
 ('활동', 2347),
 ('환호', 2346),
 ('환자', 2345),
 ('환승역', 2344),
 ('환상', 2343),
 ('환경', 2342),
 ('확진', 2341),
 ('확정', 2340),
 ('확인', 2339),
 ('확실히', 2338),
 ('확산', 2337),
 ('확보', 2336),
 ('확대', 2335),
 ('화천', 2334),
 ('화질', 2333),
 ('화이팅', 2332),
 ('화영', 2331),
 ('화상', 2330),
 ('화면', 2329),
 ('화력', 2328),
 ('화두', 2327),
 ('홍보', 2326),
 ('홍두', 2325),
 ('홈페이지', 2324),
 ('혼란', 2323),
 ('혼돈', 2322),
 ('호화판', 2321),
 ('호혜', 2320),
 ('호진', 2319),
 ('호응', 2318),
 ('호소', 2317),
 ('호남', 2316),
 ('호기심',