In [1]:
from bs4 import BeautifulSoup as bs

In [2]:
from selenium import webdriver

In [3]:
# Parse deliberately broken HTML to see how the parser repairs it.
# The parser is named explicitly: without it bs4 guesses from whatever is installed
# (and emits GuessedAtParserWarning), so the repaired tree can differ between machines.
# 'lxml' is the parser that produces the <html><body> wrapper shown in the output.
soup = bs("<p>Some<b>bad<i>HTML", "lxml")
soup

<html><body><p>Some<b>bad<i>HTML</i></b></p></body></html>

In [4]:
# prettify() returns the parsed tree as an indented string, one tag or text node per line
print(soup.prettify())

<html>
 <body>
  <p>
   Some
   <b>
    bad
    <i>
     HTML
    </i>
   </b>
  </p>
 </body>
</html>


In [6]:
# Find the text node "bad" inside the parsed HTML.
# `string=` is the current name of this filter; `text=` is only kept as a deprecated alias.

soup.find(string='bad')

# Later we may also look up the items above or below "bad" in the tree.

'bad'

In [7]:
# Fetch the data wrapped in an <i> tag.
# There may be several <i> tags; in that case the one wanted has to be selected explicitly.
soup.i

<i>HTML</i>

In [9]:
# Parse unclosed markup with the 'xml' parser instead of an HTML parser; `soup` is rebound here
soup = bs("<tag1>Some<tag2/>bad<tag3>XML",'xml')
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<tag1>
 Some
 <tag2/>
 bad
 <tag3>
  XML
 </tag3>
</tag1>


# Quick Start

In [10]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [12]:
len(html_doc)

492

In [13]:
type(html_doc)

str

In [11]:
soup = bs(html_doc, 'html.parser')
# The parser argument is optional, but it decides which parser builds the tree.
# When it is omitted bs4 picks the best parser installed (lxml if present), not necessarily html.parser.

print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


# 데이터 구조를 파악하기 위한 간단한 방법

In [14]:
# Look up the data wrapped in the <title> tag

soup.title

<title>The Dormouse's story</title>

In [15]:
# .name gives the tag name of the title element

soup.title.name

'title'

In [16]:
# .string gives the text enclosed by the <title> tag

soup.title.string

"The Dormouse's story"

In [17]:
# .parent gives the enclosing (parent) element of <title>

soup.title.parent.name

'head'

In [18]:
# Fetch the <p> tag.
# Attribute access returns only the first <p> that appears in the document.

soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [19]:
# Read the class attribute of the <p> tag (returned as a list)

soup.p['class']

['title']

In [20]:
# Attribute access returns the first <a> tag only
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [23]:
# Collect every <a> tag in the document
result=soup.find_all('a')

In [24]:
result

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [25]:
len(result)

3

In [26]:
result[0]

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [31]:
# Print each matched tag next to its position in the result list
for position, anchor in enumerate(result):
    print(position, ' : ', anchor)

0  :  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
1  :  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
2  :  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [22]:
# Fetch the element whose id is "link3"

soup.find(id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [32]:
# find() returns the first matching <a> tag

soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

## a 태그 데이터 정보 중 href(링크) 데이터를 호출하기

In [33]:
# Print the href attribute of every anchor, one URL per line
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    print(href)

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [34]:
# Extract only the text contained inside the tags

print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



# 오브젝트 종류

#### 태그

In [55]:
# Build a one-tag document to explore the Tag object.
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
soup = bs('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
tag

<b class="boldest">Extremely bold</b>

#### 이름

In [56]:
tag.name

'b'

In [57]:
# Rename the tag; the change shows up in the rendered markup

tag.name = "blockquote"
tag

<blockquote class="boldest">Extremely bold</blockquote>

#### 태그아래 속성

In [58]:
tag['class']

['boldest']

In [59]:
tag.attrs

{'class': ['boldest']}

In [60]:
# Assigning to an existing key replaces the attribute value
tag['class']='verybold'
tag

<blockquote class="verybold">Extremely bold</blockquote>

In [61]:
# Assigning to a new key adds an attribute; the value 1 is rendered as "1"
tag['another-attribute']=1
tag

<blockquote another-attribute="1" class="verybold">Extremely bold</blockquote>

In [62]:
# del removes the attribute from the tag
del tag['class']
tag

<blockquote another-attribute="1">Extremely bold</blockquote>

In [63]:
del tag['another-attribute']

In [64]:
tag

<blockquote>Extremely bold</blockquote>

#### 다중 속성 값

In [66]:
# class is a multi-valued attribute, so even a single class comes back as a list.
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
css_soup = bs('<p class="body"></p>', 'html.parser')
css_soup.p['class']

['body']

In [67]:
# Two space-separated classes are split into two list items.
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
css_soup = bs('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

In [68]:
# id is not a multi-valued attribute: the value containing a space stays one string.
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
id_soup = bs('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

In [70]:
# rel is another multi-valued attribute, so it is returned as a list.
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
rel_soup = bs('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
rel_soup.a['rel']

['index']

In [71]:
# A list assigned to a multi-valued attribute is serialised as one space-separated value
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


In [72]:
# multi_valued_attributes tells the parser which attributes to split into lists:
# here "class" on any tag ('*'), which yields the two-item list shown in the output.
class_is_multi= { '*' : 'class'}
xml_soup = bs('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
xml_soup.p['class']

['body', 'strikeout']

# Navigating the tree

In [73]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Re-parse the sample document: `soup` was rebound to a one-tag document in the Tag section
soup = bs(html_doc, 'html.parser')

In [74]:
soup.head

<head><title>The Dormouse's story</title></head>

In [75]:
soup.title

<title>The Dormouse's story</title>

In [76]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [79]:
# Attribute access can be chained: the first <b> found beneath <body>
soup.body.b

<b>The Dormouse's story</b>

In [80]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [81]:
# Steps for extracting only the inner content of a tag

head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [82]:
# .contents lists the direct children of the tag
head_tag.contents

[<title>The Dormouse's story</title>]

In [83]:
# The only child of <head> is the <title> tag
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [84]:
title_tag.contents

# The plain text shown below is the data we actually need

["The Dormouse's story"]

In [85]:
# Alternative: iterate over .children instead of indexing .contents

for child in title_tag.children:
    print(child)

The Dormouse's story


In [86]:
# .string returns the text of the <title> tag directly
title_tag.string

"The Dormouse's story"

In [87]:
# .strings iterates over every text node in the document, whitespace-only ones included;
# repr() makes the newlines visible in the output.

for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [88]:
# .stripped_strings yields the same text nodes with surrounding whitespace trimmed
# and whitespace-only strings skipped
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [90]:
link = soup.a

# Walk upward from the first <a> tag through every ancestor.
# .parents stops at the BeautifulSoup object itself (named "[document]") and never
# yields None, so no None check is needed inside the loop.
for parent in link.parents:
    print(parent.name)

p
body
html
[document]


In [91]:
# Find every tag whose name starts with "b" (a compiled regex is matched against the tag name)

import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)


body
b


In [92]:
# Without an anchor the regex matches anywhere: every tag whose name contains "t"
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

html
title


## find_all()

In [93]:
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [94]:
# The second positional argument filters by CSS class: <p> tags with class "title"
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [95]:
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [96]:
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [97]:
# string= filters on text nodes; find() returns the first string containing "sisters"
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

In [100]:
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [105]:
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
data_soup = bs('<div data-foo="value">foo!</div>', 'html.parser')
# data_soup.find_all(data-foo="value") is invalid Python (a keyword name cannot contain "-");
# pass the filter through attrs= instead, as in the next cell.

In [106]:
# attrs= takes a dict, so attribute names that are not valid Python identifiers can be used
data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

In [99]:
# class is a reserved word in Python, so the keyword filter is spelled class_
soup.find_all("a", class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [108]:
# class_ matches when the value is any one of the tag's classes.
# The parser is named explicitly so the cell does not depend on whichever parser bs4 would guess.
css_soup = bs('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout")

[<p class="body strikeout"></p>]

In [109]:
# The other class of the same tag matches as well
css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [110]:
# select() takes a CSS selector; it works differently from find_all() but returns the same match here

css_soup.select("p.strikeout.body")

[<p class="body strikeout"></p>]