# Advanced HTML Parsing
몇 가지 예시를 통해 고급 HTML 분석을 해보자

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# Download the page and parse it. The `with` block closes the HTTP response
# after parsing instead of leaving the connection open.
with urlopen("http://pythonscraping.com/pages/warandpeace.html") as html:
    bs = BeautifulSoup(html, "html.parser")

In [3]:
# Collect every <span> whose class is "green", then print only the text of
# each match (get_text() drops the surrounding markup).
nameList = bs.findAll("span", attrs={"class": "green"})
for greenSpan in nameList:
    print(greenSpan.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


- get_text()는 모든 태그를 제거하고 텍스트만 들어있는 문자열을 반환
- 따라서 태그 구조가 사라지므로, 최종 데이터를 출력, 저장, 조작하기 직전에만 사용

## find(), findAll()
- find(tag, attributes, recursive, text, keywords)
- findAll(tag, attributes, recursive, text, limit, keywords)
- 참고: Beautiful Soup 4.4.0부터 text 인자의 공식 이름은 string이다 (text도 하위 호환으로 동작)

In [4]:
# Find the text nodes whose content is exactly "the prince".
# `string=` is the current name (bs4 >= 4.4.0) of the argument that older
# tutorials call `text=`; the matches returned are the same.
nameList = bs.findAll(string="the prince")
print(nameList)

['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']


- class 속성으로 요소를 검색할 때, class는 파이썬 예약어이기 때문에 findAll(class="green")처럼 키워드 인자 이름으로 쓸 수 없다 (SyntaxError 발생)
- 이런 경우에는 아래와 같이 해결할 것

In [5]:
# Option 1: pass "class" as a quoted string key inside the attributes dict.
# The result is not stored; this line only shows the call form.
bs.findAll("", {"class": "green"})

# Option 2: use the class_ keyword argument (note the trailing underscore).
allText = bs.findAll(class_="green")
firstGreen = allText[0]
print(firstGreen.get_text())

Anna
Pavlovna Scherer


## Tree - children, siblings

In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [7]:
# Download the page and parse it. The `with` block closes the HTTP response
# after parsing instead of leaving the connection open.
with urlopen("http://pythonscraping.com/pages/page3.html") as html:
    bs = BeautifulSoup(html, "html.parser")

In [8]:
# Locate the gift table first, then print each of its direct children
# (rows as well as the whitespace text nodes between them).
giftTable = bs.find("table", attrs={"id": "giftList"})
for node in giftTable.children:
    print(node)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg">
</img></td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg">
</img></td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/im

테이블에 타이틀 행이 있는 경우 next_siblings 속성이 유용하다. 첫 번째 행(tr)의 다음 형제들만 순회하므로 타이틀 행 자체는 건너뛴다.

In [9]:
# Start from the table's first <tr> and print every node that follows it at
# the same level; the first row itself is not part of next_siblings.
firstRow = bs.find("table", attrs={"id": "giftList"}).tr
for node in firstRow.next_siblings:
    print(node)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg">
</img></td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg">
</img></td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg">
</img></td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
Thi

## parent, parents

In [10]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [11]:
# Download the page and parse it. The `with` block closes the HTTP response
# after parsing instead of leaving the connection open.
with urlopen("http://pythonscraping.com/pages/page3.html") as html:
    bs = BeautifulSoup(html, "html.parser")

In [12]:
# Walk from the image to the cell that holds it, step back to the cell just
# before it, and print that cell's text.
giftImage = bs.find("img", attrs={"src": "../img/gifts/img1.jpg"})
imageCell = giftImage.parent
precedingCell = imageCell.previous_sibling
print(precedingCell.get_text())


$15.00



# Regex with BeautifulSoup

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [2]:
# Download the page and parse it. The `with` block closes the HTTP response
# after parsing instead of leaving the connection open.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bs = BeautifulSoup(html, "html.parser")

In [3]:
# Select <img> tags whose src matches the gift-image path pattern.
# The pattern is a raw string so its backslashes reach the regex engine
# unchanged rather than being read as (invalid) Python string escapes,
# which trigger a warning on recent Python versions.
images = bs.findAll("img", {"src": re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})

In [4]:
# Print the src attribute of every matched image tag, one per line.
for imgTag in images:
    source = imgTag["src"]
    print(source)

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg
