# 네이버 api 사용

In [21]:
import os
import sys
import urllib.request
import json
import datetime
import pandas as pd
from tqdm import tqdm


def naver_api_key():
    naver_keys = json.load(open("./data/secrets.json", "r"))["naver"]
    client_id = naver_keys["client_id"]
    client_secret = naver_keys["client_secret"]
    
    return client_id, client_secret


def naver_search_api(kind, keyword):
    titles, summaries, links = [], [], []
    
    client_id, client_secret = naver_api_key()
    
    search = urllib.parse.quote(keyword)
    
    for i in [1, 100]:
        url = f"https://openapi.naver.com/v1/search/{kind}?query={search}&start={i}&display=100"
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)
        response = urllib.request.urlopen(request)
        rescode = response.getcode()
        
        if rescode == 200:
            content = json.loads(response.read().decode("utf-8"))
            for item in content["items"]:
                titles.append(item["title"])
                summaries.append(item["description"])
                links.append(item["link"])
        else:
            print("Error Code:" + rescode)
    
    print("="*10, kind, "="*10)
    print("titles: ", len(titles))
    print("links: ", len(links))
    print("summaries:", len(summaries))
    
    df = pd.DataFrame({"title": titles, "link": links, "summary": summaries})
    df = df.drop_duplicates(keep="first", ignore_index=True)
    print("중복 제거 후 행 개수: ", len(df))
    
    now = datetime.datetime.now()
    df.to_csv("./data/temp/링크 {}_{}_{}.csv".format(kind, keyword, now.strftime("%Y년%m월%d일_%H시%M분%S초")), 
              encoding="utf-8-sig", index=False)

In [22]:
kind_list = ["blog", "cafearticle", "news"]
keyword = "킥보드 주차"

for kind in kind_list:
    naver_search_api(kind, keyword)

titles:  200
links:  200
summaries: 200
중복 제거 후 행 개수:  199
titles:  200
links:  200
summaries: 200
중복 제거 후 행 개수:  199
titles:  200
links:  200
summaries: 200
중복 제거 후 행 개수:  199


# 링크별 본문 추출

In [23]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re


In [25]:
pd.read_csv("./data/temp/링크 blog_킥보드 주차_2023년01월30일_20시12분44초.csv")["link"]

0          https://blog.naver.com/cys0523/222680596165
1      https://blog.naver.com/dnjstpdms24/222347780173
2        https://blog.naver.com/leehayaon/222358385024
3        https://blog.naver.com/tech-plus/222680723022
4       https://blog.naver.com/arini00101/222851266281
                            ...                       
194       https://blog.naver.com/gbom6600/222965903493
195        https://blog.naver.com/jch4512/222589117443
196       https://blog.naver.com/kkalok00/222950839631
197        https://blog.naver.com/gonygo3/222680473126
198      https://blog.naver.com/mufox0425/222763699475
Name: link, Length: 199, dtype: object

In [39]:
url = "https://blog.naver.com/cys0523/222680596165"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}


raw_html = requests.get(url, headers=headers)
html = BeautifulSoup(raw_html.text, "html.parser")
# html.find("div", "se-main-container")
html


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="ko">
<head>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="-1" http-equiv="Expires"/>
<meta content="noindex,follow" name="robots"/>
<meta content="always" name="referrer"/>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<link href="/favicon.ico?3" rel="shortcut icon" type="image/x-icon"/>
<link href="https://rss.blog.naver.com/cys0523.xml" rel="alternate" title="RSS feed for cys0523 Blog" type="application/rss+xml"/>
<link href="https://blog.naver.com/NBlogWlwLayout.naver?blogId=cys0523" rel="wlwmanifest" type="application/wlwmanifest+xml"/>
<title>경제적 자유를 꿈꾸는 짠테크맘 이야기 : 네이버 블로그</title>
</head>
<script charset="UTF-8" src="https://ssl.pstatic.net/t.static.blog/mylog/versioning/Frameset-347491577_https.js" type="text/javascript"></script>
<script charset