## BeautifulSoup

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Sample HTML used as input for the examples in this notebook: one "title"
# paragraph, two "story" paragraphs, and three <a class="sister"> links
# (two of which carry a custom data01 / data02 attribute).
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2" data01="title_01">Lacie</a>
     and
    <a href="http://example.com/tillie" class="sister" id="link3" data02="title_02">Tillie</a>;
    and they lived at the bottom of a well.
</p>

<p class="story">...</p>
</body>
</html>
"""


In [3]:
# Parse the sample HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(html_doc, 'html.parser')
# A bare expression on the last line displays the parsed document.
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>
     and
    <a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>

In [4]:
# prettify() returns the document as an indented string with each tag and
# text node on its own line; print() it so the line breaks are rendered.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
    and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [5]:
# Fetch the <title> element: attribute-style access returns the first tag with that name.

soup.title

<title>The Dormouse's story</title>

In [6]:
# .parent moves up to the enclosing tag; .name is that tag's name ('head' here).
soup.title.parent.name

'head'

In [7]:
# Attribute-style access returns only the first <p> in the document.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
# Index a tag like a dict to read an attribute. `class` comes back as a list
# because an element can carry several classes.
soup.p['class']

['title']

### find_all(), find()
- 태그를 이용하여 검색

#### find()

In [9]:
# find() returns the first tag that matches -- here the first <p>.
html2 = soup.find('p')
html2

<p class="title"><b>The Dormouse's story</b></p>

In [10]:
# Filter by CSS class with `class_` (trailing underscore because `class`
# is a reserved word in Python).
html2 = soup.find('p',class_='title')
html2

<p class="title"><b>The Dormouse's story</b></p>

In [11]:
# Two <p> tags have class "story"; find() returns only the first of them.
html2 = soup.find('p',class_='story')
html2

<p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>
     and
    <a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.
</p>

In [12]:
# A keyword argument such as `id` filters on the attribute of the same name.
html2 = soup.find('a', id='link1')
html2

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [14]:
# Filter on an arbitrary attribute by passing a dict through `attrs`.
html2 = soup.find('a', attrs={'data01':'title_01'})
html2

<a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>

In [15]:
# Every entry in `attrs` must match: class "sister" AND data02="title_02".
html2 = soup.find('a', attrs={'class':'sister','data02':'title_02'})
html2

<a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">Tillie</a>

#### find_all()

In [16]:
# find_all() returns every matching tag as a list-like result
# (all three <p> tags here).
html2 = soup.find_all('p')
html2

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
     <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
     <a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>
      and
     <a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">Tillie</a>;
     and they lived at the bottom of a well.
 </p>,
 <p class="story">...</p>]

In [17]:
# Walk every <p> tag and print each one followed by a divider line.
html2 = soup.find_all('p')
for tag in html2:
    print(tag, '--------', sep='\n')

<p class="title"><b>The Dormouse's story</b></p>
--------
<p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>
     and
    <a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.
</p>
--------
<p class="story">...</p>
--------


In [18]:
# A string passed as the second positional argument is matched against the
# CSS class, so this selects only the <p class="story"> tags.
html2 = soup.find_all('p', 'story')
for tag in html2:
    print(tag, '--------', sep='\n')

<p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>
     and
    <a class="sister" data02="title_02" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.
</p>
--------
<p class="story">...</p>
--------


In [19]:
# find_all() with an id filter still yields a list-like result, here
# holding the single <a id="link1"> tag.
html2 = soup.find_all('a', id='link1')
for tag in html2:
    print(tag, '--------', sep='\n')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
--------


### select(), select_one()
- CSS 선택자를 이용해 검색

#### select_one()

In [20]:
# select_one() takes a CSS selector and returns the first match.
html2 = soup.select_one('p')
html2

<p class="title"><b>The Dormouse's story</b></p>

In [21]:
# `p.title` -- a <p> whose class is "title".
html2 = soup.select_one('p.title')
html2

<p class="title"><b>The Dormouse's story</b></p>

In [22]:
# `a#link1` -- the <a> whose id is "link1".
html2 = soup.select_one('a#link1')
html2

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [23]:
# `[attr="value"]` -- match on an exact attribute value.
html2 = soup.select_one('a[data01="title_01"]')
html2

<a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>

In [24]:
# `[attr]` -- match any <a> that has the attribute at all, whatever its value.
html2 = soup.select_one('a[data01]')
html2

<a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>

In [25]:
# Space-separated selectors form a descendant chain:
# <title> inside <head> inside <html>.
soup.select_one('html head title')

<title>The Dormouse's story</title>

In [26]:
# `>` is the child combinator: the <a id="link2"> that is a direct child
# of a <p class="story">.
soup.select_one('p.story> a#link2')

<a class="sister" data01="title_01" href="http://example.com/lacie" id="link2">Lacie</a>

#### select()

In [28]:
# select() returns every element matching the CSS selector; print each match.
html2 = soup.select("p.title")
for tag in html2:
    print(tag)

<p class="title"><b>The Dormouse's story</b></p>


In [29]:
# .text gives the tag's text content without the surrounding markup.
html2 = soup.select_one('a#link1')
html2.text

'Elsie'

In [30]:
# `html2` is still the <a id="link1"> tag selected in the previous cell;
# read its `class` attribute (returned as a list).
html2['class']

['sister']