# 3. BeautifulSoup 기초
* HTML과 XML 문서를 파싱하기 위한 파이썬 패키지
* https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# anaconda beautifulsoup으로 검색
# pip install bs4
# conda install beautifulsoup4

In [3]:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [4]:
soup = BeautifulSoup(html_doc, 'html.parser')  # html.parser(default), lxml/html5lib(추가설치 필요)
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [5]:
type(soup)

bs4.BeautifulSoup

In [6]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


### find 함수
* 조건에 만족하는 첫번째 tag만 검색

In [7]:
soup.find('p')

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
type(soup.find('p'))

bs4.element.Tag

In [9]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [10]:
soup.find('a', id='link2')

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [11]:
soup.find('a', class_='sister', id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [12]:
attrs = {'class': 'sister', 'id':'link3'}
soup.find('a', attrs=attrs)

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

### find_all 함수
* 조건에 맞는 모든 tag를 리스트로 반환

In [13]:
soup.find('p')

<p class="title"><b>The Dormouse's story</b></p>

In [14]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [15]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [16]:
for tag in soup.find_all('a'):
    print(tag, type(tag))

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> <class 'bs4.element.Tag'>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> <class 'bs4.element.Tag'>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> <class 'bs4.element.Tag'>


In [17]:
for tag in soup.find_all('a'):
    print(tag['href'])

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


### get_text 함수
* tag안의 value를 추출
* 부모 tag의 경우 모든 자식 tag의 value를 추출

In [18]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [19]:
soup.get_text()

"The Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [20]:
soup.find('p').get_text()

"The Dormouse's story"

In [21]:
for tag in soup.find_all('a'):
    print(tag.get_text())

Elsie
Lacie
Tillie


### attribute값 추출하기
* 검색한 tag에서 attribute값을 추출
* tag['attr명']

In [22]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [23]:
soup.find('p').attrs

{'class': ['title']}

In [24]:
# list로 나오는 이유는 multi-value가 가능하므로
# 예) <p class="title abc def">
soup.find('p')['class']

['title']

In [25]:
for tag in soup.find_all('a'):
    print(tag['href'], tag['id'])

http://example.com/elsie link1
http://example.com/lacie link2
http://example.com/tillie link3


### select 함수
* select는 CSS selector로 tag 찾기
* 자손 태그 찾기 - 자손 관계 (tag tag)
* 자식 태그 찾기 - 다이렉트 자식 관계 (tag > tag)
* 아이디 찾기 #id
* 클래스 찾기 .class
* 속성값 찾기 [name='test']
    * 속성값 prefix 찾기 [name ^='test']
    * 속성값 suffix 찾기 [name $='test']
    * 속성값 substring 찾기 [name *='test]

In [26]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [27]:
soup.select('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [28]:
soup.select_one('p')

<p class="title"><b>The Dormouse's story</b></p>

In [29]:
# 자손 태그
soup.select('html title')

[<title>The Dormouse's story</title>]

In [30]:
# 직계 자식 태그
soup.select('html > title')

[]

In [31]:
# id 선택자
soup.select('#link1')  # soup.select('div#upper')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [32]:
soup.select('a#link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [33]:
# class 선택자
soup.select('.sister')  # soup.select('div.test')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [34]:
soup.select('a.sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [35]:
# 속성값 찾기
soup.select('a[href="http://example.com/lacie"]')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [36]:
soup.select('[href="http://example.com/lacie"]')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [37]:
# 시작문자열
soup.select('a[href^="http"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [38]:
# 종료문자열
soup.select('a[href$="ie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [39]:
# 포함문자열
soup.select('a[href*="example.com"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [40]:
data = """<html><body>
<a href="http://dimigo.in/menu1" class="menu">시리얼</a>
<a href="http://dimigo.in/menu2" class="menu">스파게티</a>
<a href="http://dimigo.in/menu3" class="menu">부대찌개</a>
</body></html>
"""

soup = BeautifulSoup(data, 'html.parser')  # html.parser(default), lxml/html5lib(추가설치 필요)
soup

<html><body>
<a class="menu" href="http://dimigo.in/menu1">시리얼</a>
<a class="menu" href="http://dimigo.in/menu2">스파게티</a>
<a class="menu" href="http://dimigo.in/menu3">부대찌개</a>
</body></html>

In [52]:
for m in soup.find_all('a'):
    print(m.text, m['href'])

시리얼 http://dimigo.in/menu1
스파게티 http://dimigo.in/menu2
부대찌개 http://dimigo.in/menu3


In [58]:
for m in soup.select('a#menu'):
    print(m.text, m['href'])

In [43]:
data = '''
<html><body>
<a href="http://dimigo.in/menu1" class="menu">시리얼</a>
<a href="http://dimigo.in/menu2" class="menu">스파게티</a>
<a href="http://dimigo.in/menu3" class="menu">부대찌개</a>
</body></html>
'''

In [44]:
soup = BeautifulSoup(data)
soup

<html><body>
<a class="menu" href="http://dimigo.in/menu1">시리얼</a>
<a class="menu" href="http://dimigo.in/menu2">스파게티</a>
<a class="menu" href="http://dimigo.in/menu3">부대찌개</a>
</body></html>

In [45]:
for m in soup.find_all('a'):
    print(m.get_text(), m['href'])

시리얼 http://dimigo.in/menu1
스파게티 http://dimigo.in/menu2
부대찌개 http://dimigo.in/menu3


In [46]:
for m in soup.select('a'):
    print(m.get_text(), m['href'])

시리얼 http://dimigo.in/menu1
스파게티 http://dimigo.in/menu2
부대찌개 http://dimigo.in/menu3
