In [None]:
!pip install beautifulsoup4

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Small in-memory HTML document used to try out the BeautifulSoup search API.
html = '''
<html>
    <head></head>
    <body>
        <div id = 'result'>
           <p class = 'row'>
               <a class = 'red'>Go to page1</a>
               <a class = 'blue'>Go to page2</a>
            </p>
        </div>
    </body>
</html>
'''

# Parse the markup with the lxml parser; `dom` is the root of the parsed tree.
dom = BeautifulSoup(markup=html, features='lxml')

In [3]:
# Attribute navigation (dom.html.head) is shorthand for find(); show the node types.
type(dom.find('html').find('head')), type(dom.find('html'))

(bs4.element.Tag, bs4.element.Tag)

In [4]:
# Both spellings return the first <a> tag in the document.
(dom.a, dom.find(name='a'))

(<a class="red">Go to page1</a>, <a class="red">Go to page1</a>)

In [5]:
# Print tag name, class list and text for every <a> element.
for tag in dom.find_all(name='a'):
    print(f"{tag.name} {tag['class']} {tag.get_text()}")

a ['red'] Go to page1
a ['blue'] Go to page2


In [6]:
# Only <a> tags whose class is "blue" (class_ avoids the reserved word `class`).
dom.find_all('a', class_='blue')

[<a class="blue">Go to page2</a>]

In [7]:
# A list of names matches any of the listed tags, returned in document order.
dom.find_all(name=['div', 'a'])

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>, <a class="red">Go to page1</a>, <a class="blue">Go to page2</a>]

In [11]:
# <div> tags whose id attribute equals "result".
dom.find_all('div', id='result')

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>]

In [11]:
# No tag name is given, so any tag may match; only the id attribute is filtered.
# (Passing '' as the name relied on undocumented handling of a falsy name.)
dom.find_all(attrs={'id': 'result'})

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>]

# http://pythonscraping.com/pages/page3.html (test url)

In [12]:
from urllib import parse
import requests

In [13]:
# Request headers sent by getDownload(); the user-agent string is a desktop Chrome one.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}

In [14]:
def getDownload(url, params=None, retries=3, timeout=10):
    """Fetch ``url`` with GET, retrying on 5xx server errors.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters. Defaults to
            None (no parameters) instead of a shared mutable ``{}``.
        retries: How many more attempts to make when the server answers
            with a 5xx status.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a stalled server cannot hang the notebook.

    Returns:
        The ``requests.Response`` of the last attempt. When the final
        attempt ended in an HTTP error, that error response is returned
        after its status, reason and headers have been printed.

    Raises:
        requests.exceptions.RequestException: Non-HTTP failures (connection
            errors, timeouts) are not caught here and propagate.
    """
    resp = None

    try:
        resp = requests.get(url, params=params, headers=header, timeout=timeout)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and retries > 0:
            # Server-side error: show the remaining attempts and try again.
            print(retries)
            resp = getDownload(url, params, retries - 1, timeout)
        else:
            # Client error, or retries exhausted: report and fall through.
            print(e.response.status_code)
            print(e.response.reason)
            print(e.response.headers)

    return resp

In [18]:
# Download the sample gift-list page and parse the response body.
url = 'http://pythonscraping.com/pages/page3.html'
html = getDownload(url)
dom = BeautifulSoup(markup=html.text, features='lxml')
# Starting point for the navigation examples: the <div id="footer"> element.
footer = dom.find('div', id='footer')

In [20]:
# Step up one level from the footer and show what the enclosing tag is.
parent = footer.find_parent()
(parent.name, parent.attrs)

('div', {'id': 'wrapper'})

In [24]:
# With the default recursive search, find_all() returns every tag nested under
# `parent` at any depth, not just its direct children.
children = parent.find_all(recursive=True)

In [26]:
# recursive=False restricts the search to the direct children of `parent`.
children = parent.find_all(recursive=False)
for row in children:
    print(f'{row.name} {row.attrs}')

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [27]:
# Every tag below `parent`, at any depth, listed in document order.
descendants = parent.find_all(recursive=True)
for row in descendants:
    print(f'{row.name} {row.attrs}')

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
p {}
br {}
br {}
table {'id': 'giftList'}
tr {}
th {}
th {}
th {}
th {}
tr {'id': 'gift1', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img1.jpg'}
tr {'id': 'gift2', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img2.jpg'}
tr {'id': 'gift3', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img3.jpg'}
tr {'id': 'gift4', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img4.jpg'}
tr {'id': 'gift5', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img6.jpg'}
div {'id': 'footer'}
br {}


In [34]:
# Third direct child of `parent`; the listing printed earlier shows it is <div id="content">.
children[2]

<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>

In [35]:
# Direct child tags of the content div only (recursive=False stops at one level).
children[2].find_all(recursive=False)

[<p>
 We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
 123 Main St.<br/>
 Abuja, Nigeria
 We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p>]

In [28]:
# Name, attributes and text of each direct child of the content div.
divChildren = children[2].find_all(recursive=False)
for row in divChildren:
    print(f'{row.name} {row.attrs} {row.get_text()}')

p {} 
We haven't figured out how to make online shopping carts yet, but you can send us a check to:
123 Main St.
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.


In [36]:
# The tag sibling just before the content div should be the entry before it in `children`.
divTag = children[2]
(children[1].name, divTag.find_previous_sibling().name)

('h1', 'h1')

In [37]:
# All tag siblings that come after the content div.
# Restored from a commented-out line so the cell reproduces its recorded output.
divTag.find_next_siblings()

[<table id="giftList">
 <tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005

In [40]:
# The first following tag sibling should be the next entry in `children`.
divTag.find_next_sibling() == children[3]

True

# Cost 값 가져오기 

In [42]:
# For every table row, print the stripped text of its third cell (the Cost column).
aList = dom.find_all(name='tr')
for row in aList:
    cost_cell = row.find_all(recursive=False)[2]
    print(cost_cell.get_text().strip())

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


# Google 검색결과 실습

In [52]:
# Query Google and print the title of each result block.
url = 'https://www.google.com/search'
params = {'q':'박보영'}
html = getDownload(url, params)

dom = BeautifulSoup(html.text, 'lxml')

# Any tag with class "r" is treated as a result block; no tag name is given.
for tag in dom.find_all(attrs={'class': 'r'}):
    title = tag.find('h3')
    # find() returns None when a block has no <h3>; skip it rather than
    # failing with AttributeError on `.text`.
    if title is not None:
        print(title.text)

박보영 - 위키백과, 우리 모두의 백과사전
박보영의 작품 목록 - 위키백과, 우리 모두의 백과사전
박보영 - 나무위키
종합 박보영이 흔녀 복합장르물 어비스 시청자 설득할까 | 한경닷컴
박보영, tvN 드라마 '어비스' 여주인공 - MSN.com
#박보영 hashtag on Twitter
박보영은 오래 지켜본다. 연애도, 연기 변신도 - 중앙일보 - 조인스
`놀라운 토요일` 놀토 열혈팬 박보영X안효섭, "귀가 좀 안 좋아요" - 스타 ...


# Nate 검색결과 실습

In [69]:
# Query the Nate search page (served from search.daum.net) for the same keyword.
# The query is passed as a dict, as in the Google cell, so requests does the
# percent-encoding instead of a hand-encoded string embedded in the URL.
url = 'https://search.daum.net/nate'
params = {'thr': 'sbma', 'w': 'tot', 'q': '박보영'}
html = getDownload(url, params)

dom = BeautifulSoup(html.text, 'lxml')

In [63]:
# Collect every <ul> whose class is "list_info".
ul = dom.find_all('ul', class_='list_info')

In [89]:
# Print the trimmed text of every <a class="f_link_b"> link.
for tag in dom.find_all('a', class_='f_link_b'):
    print(tag.get_text().strip())

'어비스' 박보영, 심장 찢기는 오열
'어비스' 박보영-안효섭, 귀요미 커플 슈트핏 공개
'어비스' 첫방 D-1, 박보영·안효섭 커플 정장 '케미' 포착
'어비스' 박보영X안효섭, 설렘지수 높이는 키 차이
김영광 박보영 열애
박보영 실제 키는 도대체 몇일까?
박보영 나이 몸매 실제키 드라마 어비스
김영광 박보영 열애 터진 이유
박보영과 역대급 케미뽐낸 상대배우 고르기
드라마 어비스 인물 소개, 예고편(박보영, 안효섭 주연)
박보영이 왜 못 오를 나무냐는 박수홍.jpg
런닝맨 나올 때마다 케미 보여준 송지효X박보영.jpgif
박보영, 제2의 문근영이라는 호칭때문에 악플에 시달렸나요? 아역배우...
박보영, 과속스캔들 OST 중 '아마도그건' 불렀다던데, 이번에 과속...
박보영누나 과속스캔들찍고 또 다른 드라마나 영화 찍나요?> 박보영누나...
박보영 질문요. 박보영 진짜 예쁘더군요... 박보영이 대체 누군...
박보영 갤러리
박보영 공식팬카페 뽀르테
Park Bo Young V LIVE
인천방주교회


![박보영](https://user-images.githubusercontent.com/40786348/57190011-7a36f680-6f50-11e9-96ba-ae40ac767355.PNG)