# 웹 스크레이핑을 위한 기본 지식

### 웹 페이지의 HTML 소스 갖고 오기

In [1]:
import requests

In [3]:
url = "https://www.naver.com"
# NOTE: despite its name, `html` holds the Response object returned by
# requests; the page markup itself is available as `html.text`.
# The timeout keeps this cell from hanging forever if the server never answers.
html = requests.get(url, timeout=10)
html

<Response [200]>

In [7]:
# Show only the first 1000 characters of the response text as a preview.
print(html.text[:1000])

<!doctype html>
















<html lang="ko">
<head>
<meta charset="utf-8">
<meta name="Referrer" content="origin">
<meta http-equiv="Content-Script-Type" content="text/javascript">
<meta http-equiv="Content-Style-Type" content="text/css">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=1100">
<meta name="apple-mobile-web-app-title" content="NAVER" />
<meta name="robots" content="index,nofollow"/>
<meta name="description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요"/>
<meta property="og:title" content="네이버">
<meta property="og:url" content="https://www.naver.com/">
<meta property="og:image" content="https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png">
<meta property="og:description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요"/>
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="">
<meta name="twitter:url" content="https://www.naver.com/">
<meta name="twitter:image" content="https://s.ps

In [5]:
# Inspect the class of the object returned by requests.get().
type(html)

requests.models.Response

In [6]:
# Inspect the type of the response's `text` attribute.
type(html.text)

str

### HTML 소스코드를 분석하고 처리하기

#### 데이터 찾고 추출하기

In [9]:
from bs4 import BeautifulSoup

# Small hand-written HTML sample with three links, used to try out BeautifulSoup.
# NOTE: this rebinds `html`, which earlier held the requests Response object.
html = """
<html>
<body>
<div>
<span>
    <a href=http://www.naver.com>네이버</a>
    <a href=http://www.google.com>구글</a>
    <a href=http://www.daum.net>다음</a>
</span>
</div></body></html>
""" 

In [10]:
# Parse the sample markup with the lxml parser, then display the parsed tree.
soup = BeautifulSoup(html, features='lxml')
soup

<html>
<body>
<div>
<span>
<a href="http://www.naver.com">네이버</a>
<a href="http://www.google.com">구글</a>
<a href="http://www.daum.net">다음</a>
</span>
</div></body></html>

In [14]:
# Print the parsed tree with one tag per line, indented by nesting depth.
print(soup.prettify())

<html>
 <body>
  <div>
   <span>
    <a href="http://www.naver.com">
     네이버
    </a>
    <a href="http://www.google.com">
     구글
    </a>
    <a href="http://www.daum.net">
     다음
    </a>
   </span>
  </div>
 </body>
</html>



In [15]:
# find() returns only the first matching <a> tag.
soup.find('a')

<a href="http://www.naver.com">네이버</a>

In [16]:
# get_text() extracts the text inside the first <a> tag, without the markup.
soup.find('a').get_text()


'네이버'

In [17]:
# find_all() returns every matching <a> tag, in document order.
soup.find_all('a')


[<a href="http://www.naver.com">네이버</a>,
 <a href="http://www.google.com">구글</a>,
 <a href="http://www.daum.net">다음</a>]

In [18]:
# Keep all <a> tags so their text can be extracted one by one afterwards.
site_names = soup.find_all('a')
site_names

[<a href="http://www.naver.com">네이버</a>,
 <a href="http://www.google.com">구글</a>,
 <a href="http://www.daum.net">다음</a>]

In [19]:
# Print just the link text of each collected <a> tag, one per line.
for anchor in site_names:
    print(anchor.get_text())


네이버
구글
다음


In [20]:
from bs4 import BeautifulSoup

# HTML sample for testing: book titles use class "booknm" and their authors
# use class "author", as alternating <p> elements.
html2 = """
<html>
 <head>
  <title>작품과 작가 모음</title>
 </head>
 <body>
  <h1>책 정보</h1>
  <p class="booknm">역사의역사</p>
  <p class="author">유시민</p>
  
  <p class="booknm">에디톨로지</p>
  <p class="author">김정운</p>

  <p class="booknm">데이터의 보이지 않는손</p>
  <p class="author">야노가즈오</p>
 </body>
</html>
""" 

In [21]:
# Parse the book sample with the lxml parser and display its <body> element.
soup2 = BeautifulSoup(html2, features='lxml')
soup2.body

<body>
<h1>책 정보</h1>
<p class="booknm">역사의역사</p>
<p class="author">유시민</p>
<p class="booknm">에디톨로지</p>
<p class="author">김정운</p>
<p class="booknm">데이터의 보이지 않는손</p>
<p class="author">야노가즈오</p>
</body>

In [22]:
# Attribute access by tag name returns the document's <title> element.
soup2.title

<title>작품과 작가 모음</title>

In [23]:
# Tag-name attributes can be chained: the <h1> inside <body>.
soup2.body.h1

<h1>책 정보</h1>

In [24]:
# With several <p> tags present, attribute access yields only the first one.
soup2.body.p

<p class="booknm">역사의역사</p>

In [25]:
# find('p') gives the same first <p> element as soup2.body.p.
soup2.find('p')

<p class="booknm">역사의역사</p>

In [26]:
# All <p> elements, titles and authors interleaved in document order.
soup2.find_all('p')

[<p class="booknm">역사의역사</p>,
 <p class="author">유시민</p>,
 <p class="booknm">에디톨로지</p>,
 <p class="author">김정운</p>,
 <p class="booknm">데이터의 보이지 않는손</p>,
 <p class="author">야노가즈오</p>]

In [28]:
# First <p> whose CSS class is "booknm" (the first book title).
soup2.find('p', class_='booknm')

<p class="booknm">역사의역사</p>

In [29]:
# First <p> whose CSS class is "author" (the first author).
soup2.find('p', class_='author')

<p class="author">유시민</p>

In [30]:
# Every <p> whose CSS class is "booknm" (all book titles).
soup2.find_all('p', class_='booknm')

[<p class="booknm">역사의역사</p>,
 <p class="booknm">에디톨로지</p>,
 <p class="booknm">데이터의 보이지 않는손</p>]

In [31]:
# Every <p> whose CSS class is "author" (all authors).
soup2.find_all('p', class_='author')

[<p class="author">유시민</p>,
 <p class="author">김정운</p>,
 <p class="author">야노가즈오</p>]

In [32]:
from bs4 import BeautifulSoup

# Re-parse the book sample so the tree is rebuilt from html2 in this cell.
soup2 = BeautifulSoup(html2, features="lxml")

# Collect title and author paragraphs separately, each in document order.
book_names = soup2.find_all('p', class_='booknm')
book_authors = soup2.find_all('p', class_='author')

# zip() pairs the n-th title with the n-th author; it stops at the shorter
# list, so the pairing is only correct when every title has an author.
line_template = ' ♣ {b_n} / {b_a} '
for title_tag, author_tag in zip(book_names, book_authors):
    title_text = title_tag.get_text()
    author_text = author_tag.get_text()
    print(line_template.format(b_n=title_text, b_a=author_text))

 ♣ 역사의역사 / 유시민 
 ♣ 에디톨로지 / 김정운 
 ♣ 데이터의 보이지 않는손 / 야노가즈오 
