In [1]:
!pip install beautifulsoup4



In [2]:
from bs4 import BeautifulSoup

In [3]:
html = '''
<html>
    <head>
        <title>예제</title>
    </head>
    <body>
        <div>
            <p>
                <a href='/page' class='d' id=asdf>페이지 이동</a>
            </p>
        </div>
    </body>
</html>
'''

In [4]:
dom = BeautifulSoup(html, 'html.parser')

In [5]:
dom


<html>
<head>
<title>예제</title>
</head>
<body>
<div>
<p>
<a class="d" href="/page" id="asdf">페이지 이동</a>
</p>
</div>
</body>
</html>

In [6]:
dom.text, dom.prettify()

('\n\n\n예제\n\n\n\n\n페이지 이동\n\n\n\n\n',
 '<html>\n <head>\n  <title>\n   예제\n  </title>\n </head>\n <body>\n  <div>\n   <p>\n    <a class="d" href="/page" id="asdf">\n     페이지 이동\n    </a>\n   </p>\n  </div>\n </body>\n</html>\n')

In [7]:
dom.html.head.title.text

'예제'

In [8]:
dom.title.text # title이 하나 밖에 없으니 위와 동일

'예제'

In [9]:
dom.body.text, dom.p, dom.a

('\n\n\n페이지 이동\n\n\n',
 <p>
 <a class="d" href="/page" id="asdf">페이지 이동</a>
 </p>,
 <a class="d" href="/page" id="asdf">페이지 이동</a>)

In [10]:
for _ in list(dom.body.children):
    print(_, type(_))


 <class 'bs4.element.NavigableString'>
<div>
<p>
<a class="d" href="/page" id="asdf">페이지 이동</a>
</p>
</div> <class 'bs4.element.Tag'>

 <class 'bs4.element.NavigableString'>


In [11]:
dom.a.attrs, dom.a['href'] # 속성들을 key-value쌍으로 갖고 있다

({'href': '/page', 'class': ['d'], 'id': 'asdf'}, '/page')

In [12]:
html = '''
<html>
    <head>
        <title>예제</title>
    </head>
    <body>
        <div>
            <p>
                <a href='/page' class='d' id=asdf>페이지 이동1</a>
                <a>페이지 이동2></a>
            </p>
        </div>
        <footer>
            <a class='d'>페이지 이동3</a>
        </footer>
    </body>
</html>
'''

In [13]:
dom = BeautifulSoup(html, 'html.parser')

In [14]:
print(dom.prettify())

<html>
 <head>
  <title>
   예제
  </title>
 </head>
 <body>
  <div>
   <p>
    <a class="d" href="/page" id="asdf">
     페이지 이동1
    </a>
    <a>
     페이지 이동2&gt;
    </a>
   </p>
  </div>
  <footer>
   <a class="d">
    페이지 이동3
   </a>
  </footer>
 </body>
</html>



In [15]:
dom = BeautifulSoup(html, 'lxml')

In [16]:
print(dom.prettify())

<html>
 <head>
  <title>
   예제
  </title>
 </head>
 <body>
  <div>
   <p>
    <a class="d" href="/page" id="asdf">
     페이지 이동1
    </a>
    <a>
     페이지 이동2&gt;
    </a>
   </p>
  </div>
  <footer>
   <a class="d">
    페이지 이동3
   </a>
  </footer>
 </body>
</html>



In [17]:
# 알아서 수정해줌을 알 수 있다

In [18]:
dom.body.div.p.a # 가장 처음 나타난 해당 되는 태그 단 한 개

<a class="d" href="/page" id="asdf">페이지 이동1</a>

In [19]:
list(dom.body.div.p.children)

['\n',
 <a class="d" href="/page" id="asdf">페이지 이동1</a>,
 '\n',
 <a>페이지 이동2&gt;</a>,
 '\n']

In [20]:
# NOTE: .children also yields the whitespace-only NavigableString nodes that sit
# between the <a> tags. Those objects have no has_attr(), so this loop raises
# AttributeError on the very first child instead of printing any class list.
for _ in list(dom.body.div.p.children):
    if _.has_attr('class'):
        print(_['class'])

AttributeError: 'NavigableString' object has no attribute 'has_attr'

In [21]:
# Attribute access for a tag that does not exist yields None. Test for it with
# `is None` (identity) rather than `==`, which would go through Tag.__eq__.
dom.a['href'], dom.a is None, dom.asdasff is None

('/page', False, True)

In [22]:
# 문제점
# 1. 없는 태그(노드) 못 거름 -> NoneType 객체 -> Warning(잠재적 에러 요소를 갖고있음)
# 2. 특정 노드를 원할 때 -> 처음 나온 노드 하나만 반환하는 문제

In [23]:
dom.find('a'), dom.find('a')['href'], dom.find_all('a')[-1]

(<a class="d" href="/page" id="asdf">페이지 이동1</a>,
 '/page',
 <a class="d">페이지 이동3</a>)

In [24]:
dom.find('a').attrs

{'href': '/page', 'class': ['d'], 'id': 'asdf'}

In [25]:
len(dom.find_all({'p', 'a'})), dom.find_all({'p', 'a'})

(4,
 [<p>
  <a class="d" href="/page" id="asdf">페이지 이동1</a>
  <a>페이지 이동2&gt;</a>
  </p>,
  <a class="d" href="/page" id="asdf">페이지 이동1</a>,
  <a>페이지 이동2&gt;</a>,
  <a class="d">페이지 이동3</a>])

In [26]:
dom.find_all('a'), dom.find_all('a', {'id':'asdf'})

([<a class="d" href="/page" id="asdf">페이지 이동1</a>,
  <a>페이지 이동2&gt;</a>,
  <a class="d">페이지 이동3</a>],
 [<a class="d" href="/page" id="asdf">페이지 이동1</a>])

In [27]:
# Match text nodes by content; `string` is the current name of the legacy `text` keyword.
dom.find_all(string='페이지 이동1')

['페이지 이동1']

In [28]:
dom.find_all('a', limit=1) # limit=1 -> 딱 하나 찾음

[<a class="d" href="/page" id="asdf">페이지 이동1</a>]

In [29]:
dom.find_all('a', limit=2) # 수십수백 개 있는 a를 몇 개까지 갖고올 것인지

[<a class="d" href="/page" id="asdf">페이지 이동1</a>, <a>페이지 이동2&gt;</a>]

In [30]:
dom.find_all(attrs={'class':'d'}) # 태그 상관 없이 특정 클래스만 지칭

[<a class="d" href="/page" id="asdf">페이지 이동1</a>, <a class="d">페이지 이동3</a>]

In [31]:
dom.find_all('a', attrs={'class':'d'}) # 태그도 지칭

[<a class="d" href="/page" id="asdf">페이지 이동1</a>, <a class="d">페이지 이동3</a>]

In [32]:
html = '''
<html>
    <head>
        <title>예제</title>
    </head>
    <body>
        <div>
            <p>
                <a href='/page' class='d' id=asdf>페이지 이동1</a>
                <a>페이지 이동2></a>
            </p>
        </div>
        <footer>
            <a class='d'>페이지 이동3</a>
        </footer>
    </body>
</html>
'''

In [33]:
dom = BeautifulSoup(html, 'html.parser')

In [34]:
dom.div

<div>
<p>
<a class="d" href="/page" id="asdf">페이지 이동1</a>
<a>페이지 이동2&gt;</a>
</p>
</div>

In [35]:
dom.find_all('a'), dom.find_all('a', attrs={'class':'d'})

([<a class="d" href="/page" id="asdf">페이지 이동1</a>,
  <a>페이지 이동2&gt;</a>,
  <a class="d">페이지 이동3</a>],
 [<a class="d" href="/page" id="asdf">페이지 이동1</a>, <a class="d">페이지 이동3</a>])

In [36]:
dom.footer.find_all('a')

[<a class="d">페이지 이동3</a>]

In [37]:
dom.div.find_all('a')

[<a class="d" href="/page" id="asdf">페이지 이동1</a>, <a>페이지 이동2&gt;</a>]

In [38]:
node = dom.footer.a # 시작점

In [39]:
type(node) # 태그인지 확인

bs4.element.Tag

In [40]:
node.attrs

{'class': ['d']}

In [41]:
node.find_parent().name

'footer'

In [42]:
for _ in node.find_parents():
    print(_.name)

footer
body
html
[document]


In [43]:
for _ in dom.div.find_all(recursive=False): # direct children of <div> only; deeper descendants are skipped
    print(_.name)

p


In [44]:
node.find_parent().find_parent().find().name

'div'

In [45]:
node.find_parent().find_parent().find().find('a').find_next_sibling()

<a>페이지 이동2&gt;</a>

In [46]:
from urllib import robotparser
from urllib.request import urlopen, Request
from urllib.robotparser import RobotFileParser
import requests
from requests.compat import urlparse, urljoin
from requests.exceptions import HTTPError
import time

In [47]:
def canfetch(url, agent='*', path='/'):
    """Report whether the site's robots.txt lets ``agent`` fetch ``url``.

    Args:
        url: Absolute URL of the page about to be requested.
        agent: User-agent token to look up in robots.txt ('*' matches any crawler).
        path: Unused; kept so that existing calls passing it keep working.

    Returns:
        The boolean returned by ``RobotFileParser.can_fetch`` for the path
        component of ``url``.
    """
    # robots.txt lives at the site root, so the join target must start with '/'.
    # A bare 'robots.txt' would be resolved relative to the page's directory
    # (e.g. /pages/robots.txt) and miss the real file.
    robot = RobotFileParser(urljoin(url, '/robots.txt'))
    robot.read()  # downloads and parses robots.txt
    return robot.can_fetch(agent, urlparse(url).path)

def download(url, params={}, headers={}, method='GET', limit=3):
    """Request ``url`` and return the response, retrying after 5xx errors.

    Args:
        url: Absolute URL to request.
        params: Sent as the query string for GET, or as the form body for POST.
        headers: HTTP request headers.
        method: HTTP method name handed to ``requests.request``.
        limit: Number of retries still allowed after a 5xx response.

    Returns:
        The response object. After an HTTP error that is not retried, this is
        the error response that ``raise_for_status`` rejected.
    """
    allowed = canfetch(url)
    if not allowed:
        # Only reported: the request below is sent regardless.
        print('[Error] ' + url)

    # The same mapping travels in the query string or in the body, by method.
    query = params if method == 'GET' else {}
    body = params if method == 'POST' else {}

    try:
        resp = requests.request(method, url, params=query, data=body, headers=headers)
        resp.raise_for_status()
    except HTTPError as e:
        status = e.response.status_code
        retryable = limit > 0 and status >= 500
        if not retryable:
            # Client error, or retries used up: dump the details and fall through.
            print('[{}] '.format(status) + url)
            print(status)
            print(e.response.reason)
            print(e.response.headers)
        else:
            print(limit)
            time.sleep(1)
            resp = download(url, params, headers, method, limit-1)
    return resp

In [48]:
resp = download('http://pythonscraping.com/pages/page3.html')
dom = BeautifulSoup(resp.content, 'lxml')

In [49]:
node = dom.find('div', {'id':'footer'})

In [50]:
for _ in node.find_parents():
    print(_.name)

div
body
html
[document]


In [51]:
node.find_parents('div')[0].find_all(recursive=False)[0] # 이미지

<img src="../img/gifts/logo.jpg" style="float:left;"/>

In [52]:
# recursive=False: look only at the parent div's direct children; the first one is the logo <img>.
node.find_parents('div')[0].find_all(recursive=False)[0]['src'] # image URL

'../img/gifts/logo.jpg'

In [53]:
for _ in node.find_previous_siblings():
    print(_.name)

table
div
h1
img


In [54]:
node.find_previous_siblings()[-1]['src']

'../img/gifts/logo.jpg'

In [55]:
urljoin(resp.request.url, node.find_previous_siblings()[-1]['src'])

'http://pythonscraping.com/img/gifts/logo.jpg'

In [56]:
node.find_previous_siblings('table')[0].find_all(recursive=False) # 자식들

[<tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img

In [57]:
node.find_previous_siblings('table')[0].find_all(recursive=False)[3].find('img')['src']

'../img/gifts/img3.jpg'

In [58]:
dom.find('table').find_all(recursive=False) # 자식 tr들만

[<tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img

In [59]:
for _ in dom.find('table').find_all(recursive=False)[1:]:
    print(_.find_all('td', recursive=False)[2].text.strip())

$15.00
$10,000.52
$10,005.00
$0.50
$1.50


In [60]:
# 속성을 가져와보자

In [61]:
dom.find_all(attrs={'class':'gift'}) # 태그에 상관없이 어트리뷰트의 class가 gift인 애들

[<tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift4"><td>
 Dead Parrot
 </

In [62]:
for _ in dom.find_all(attrs={'class':'gift'}):
    print([td.text.strip() for td in _.find_all()])

['Vegetable Basket', 'This vegetable basket is the perfect gift for your health conscious (or overweight) friends!\nNow with super-colorful bell peppers!', 'Now with super-colorful bell peppers!', '$15.00', '', '']
['Russian Nesting Dolls', 'Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!', '8 entire dolls per set! Octuple the presents!', '$10,000.52', '', '']
['Fish Painting', "If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!", 'Also hand-painted by trained monkeys!', '$10,005.00', '', '']
['Dead Parrot', "This is an ex-parrot! Or maybe he's only resting?", "Or maybe he's only resting?", '$0.50', '', '']
['Mystery Box', 'If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!', 'Keep your friends guessing!', '$1.50', '', '

### '파이썬' 검색창에서 제목과 링크 가져오기

In [63]:
from requests import request
from requests.compat import urljoin, urlparse
from requests.exceptions import HTTPError
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from time import sleep

In [64]:
def canfetch(url, agent='*', path='/'):
    """Check the host's robots.txt and say whether ``agent`` may fetch ``url``.

    ``path`` is accepted but not used; the path checked is the one parsed
    out of ``url``.
    """
    robots_location = urljoin(url, '/robots.txt')
    rules = RobotFileParser(robots_location)
    rules.read()
    requested_path = urlparse(url)[2]
    return rules.can_fetch(agent, requested_path)
    
def download(url, params=None, headers=None, method='GET', limit=3):
    """Request ``url`` and return the response, retrying after 5xx errors.

    Args:
        url: Absolute URL to request.
        params: Mapping sent as the query string for GET, or as the form body
            for POST. ``None`` means no parameters.
        headers: Mapping of HTTP request headers. ``None`` means no extra headers.
        method: HTTP method name handed to ``requests.request``.
        limit: Number of retries still allowed after a 5xx response.

    Returns:
        The response object. After an HTTP error that is not retried (4xx, or
        5xx with no retries left), this is the error response that
        ``raise_for_status`` rejected, so callers can still inspect it.
    """
    # Fresh dicts per call instead of shared mutable default arguments.
    params = {} if params is None else params
    headers = {} if headers is None else headers

    if not canfetch(url):
        print('[Error] ' + url)
    # No `else:` around the request on purpose: real-world crawling hits many
    # robots.txt restrictions, so the check is only reported here for now.
    try:
        resp = request(method, url,
                       params=params if method == 'GET' else {},
                       data=params if method == 'POST' else {},
                       headers=headers)
        resp.raise_for_status()
    except HTTPError as e:
        if limit > 0 and e.response.status_code >= 500:
            print(limit)
            # Use the `sleep` name imported for this section rather than
            # relying on `import time` from a much earlier cell.
            sleep(1)  # TODO: randomize the delay
            resp = download(url, params, headers, method, limit-1)
        else:
            print('[{}] '.format(e.response.status_code) + url)
            print(e.response.status_code)
            print(e.response.reason)
            print(e.response.headers)
    return resp

In [65]:
url = 'https://www.google.com/search'
params = {
    'q':'',
    'oq':'',
    'aqs':'chrome..69i57j69i59j69i65l3j69i61j69i60j69i61.1205j0j7',
    'sourceid':'chrome',
    'ie':'UTF-8'
}
params['q'] = params['oq'] = '파 이 썬'
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
resp = download(url, params, headers, 'GET')
dom = BeautifulSoup(resp.text, 'html.parser')

[Error] https://www.google.com/search


In [66]:
[_.text.strip() for _ in dom.find_all('h3', {'class':'LC20lb'})]

['파이썬 자습서 — Python 3.8.5 문서',
 '1 파이썬 시작하기 - 왕초보를 위한 Python - WikiDocs',
 'Python - 나무위키',
 '파이썬 - 위키백과, 우리 모두의 백과사전',
 '파이썬의 인기는 언제까지 갈까? - Byline Network',
 '강좌: 파이썬 코딩 도장',
 '모두를 위한 프로그래밍 : 파이썬 강좌소개 : edwith']

In [67]:
for _ in dom.find_all('h3', {'class':'LC20lb'}):
    print(_.find_parents(limit=2)[-1])
    print(_.find_parents(limit=2)[-1].attrs)

<div class="r"><a href="https://docs.python.org/ko/3/tutorial/index.html" ping="/url?sa=t&amp;source=web&amp;rct=j&amp;url=https://docs.python.org/ko/3/tutorial/index.html&amp;ved=2ahUKEwj8hv_Tjf_qAhVzNKYKHWTwAPgQFjAKegQIBBAB"><br/><h3 class="LC20lb DKV0Md">파이썬 자습서 — Python 3.8.5 문서</h3><div class="TbwUpd NJjxre"><cite class="iUh30 bc tjvcx">docs.python.org<span class="eipWBe"> › tutorial</span></cite></div></a><div class="B6fmyf"><div class="TbwUpd"><cite class="iUh30 bc tjvcx">docs.python.org<span class="eipWBe"> › tutorial</span></cite></div><div class="eFM0qc"><span><div class="action-menu"><a aria-expanded="false" aria-haspopup="true" aria-label="검색결과 옵션" class="GHDvEf" data-ved="2ahUKEwj8hv_Tjf_qAhVzNKYKHWTwAPgQ7B0wCnoECAQQBA" href="#" id="am-b10" jsaction="m.tdd;keydown:m.hbke;keypress:m.mskpe" role="button"><span class="mn-dwn-arw"></span></a><ol class="action-menu-panel" data-ved="2ahUKEwj8hv_Tjf_qAhVzNKYKHWTwAPgQqR8wCnoECAQQBQ" jsaction="keydown:m.hdke;mouseover:m.hdhne;mouse

In [68]:
# 제목과 url 같이 가져오기
[(_.text.strip(), _.find_parents('a')[0]['href'])
 for _ in dom.find_all('h3', {'class':'LC20lb'})]

[('파이썬 자습서 — Python 3.8.5 문서',
  'https://docs.python.org/ko/3/tutorial/index.html'),
 ('1 파이썬 시작하기 - 왕초보를 위한 Python - WikiDocs', 'https://wikidocs.net/43'),
 ('Python - 나무위키', 'https://namu.wiki/w/Python'),
 ('파이썬 - 위키백과, 우리 모두의 백과사전',
  'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC'),
 ('파이썬의 인기는 언제까지 갈까? - Byline Network',
  'https://byline.network/2020/03/17-90/'),
 ('강좌: 파이썬 코딩 도장', 'https://dojang.io/course/view.php?id=7'),
 ('모두를 위한 프로그래밍 : 파이썬 강좌소개 : edwith',
  'https://www.edwith.org/pythonforeverybody')]

In [69]:
len(dom.find_all('div', {'class':'rc'}))

7

In [70]:
for _ in dom.find_all('div', {'class':'rc'}):
    print(_.find().find('a')['href'])
    print(_.find('a').find('h3').text.strip())

https://docs.python.org/ko/3/tutorial/index.html
파이썬 자습서 — Python 3.8.5 문서
https://wikidocs.net/43
1 파이썬 시작하기 - 왕초보를 위한 Python - WikiDocs
https://namu.wiki/w/Python
Python - 나무위키
https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC
파이썬 - 위키백과, 우리 모두의 백과사전
https://byline.network/2020/03/17-90/
파이썬의 인기는 언제까지 갈까? - Byline Network
https://dojang.io/course/view.php?id=7
강좌: 파이썬 코딩 도장
https://www.edwith.org/pythonforeverybody
모두를 위한 프로그래밍 : 파이썬 강좌소개 : edwith


In [71]:
url = 'https://www.google.com/search'
params = {
    'q':'',
    'oq':'',
    'aqs':'chrome..69i57j69i59j69i65l3j69i61j69i60j69i61.1205j0j7',
    'sourceid':'chrome',
    'ie':'UTF-8'
}
params['q'] = params['oq'] = '파이썬'
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
resp = download(url, params, headers, 'GET')
dom = BeautifulSoup(resp.text, 'html.parser')
for _ in dom.find_all('h3', {'class':'LC20lb'}):
    print(_.text.strip(), _.find_parents('a')[0]['href'])

[Error] https://www.google.com/search
파이썬 - 위키백과, 우리 모두의 백과사전 https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC
파이썬 자습서 — Python 3.8.5 문서 https://docs.python.org/ko/3/tutorial/index.html
Python - 나무위키 https://namu.wiki/w/Python
1 파이썬 시작하기 - 왕초보를 위한 Python - WikiDocs https://wikidocs.net/43
Python 코딩의 기초 - 예제로 배우는 파이썬 프로그래밍 http://pythonstudy.xyz/python/article/6-Python-%EC%BD%94%EB%94%A9%EC%9D%98-%EA%B8%B0%EC%B4%88
파이썬의 인기는 언제까지 갈까? - Byline Network https://byline.network/2020/03/17-90/
파이썬 코딩 도장: 1.3 파이썬 https://dojang.io/mod/page/view.php?id=2153
파이썬 무료 강의 (기본편) - 6시간 뒤면 나도 개발자 - 인프런 https://www.inflearn.com/course/%EB%82%98%EB%8F%84%EC%BD%94%EB%94%A9-%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EA%B8%B0%EB%B3%B8


In [72]:
url = 'https://search.naver.com/search.naver'
params = {
    'sm':'top_hty',
    'fbm':0,
    'ie':'utf8',
    'query':''
}
params['query'] = '파 이 썬'
resp = download(url, params, headers, 'GET')
dom = BeautifulSoup(resp.text, 'html.parser')

[Error] https://search.naver.com/search.naver


In [73]:
for _ in dom.find_all('ul', {'class':'type01'}):
    for a in [dt.find('a') for dt in _.find_all('dt')]:
        print(a['href'], a.text)


https://www.python.org/ Welcome to Python.org
https://blog.naver.com/icd900/221318575090 국산 중형차 추천 흰 파 썬
https://blog.naver.com/kmj4956/140143459570 맛 있 는 파 이 선 물
https://blog.naver.com/xmflxhs8568?Redirect=Log&logNo=221532773930 레인보우 썬팅 전면,측.후면 12% 파.썬 30%
https://blog.naver.com/choyooney?Redirect=Log&logNo=221408338615 늦잠,티타임,GA과제,파에썬
https://blog.naver.com/sanaiauction?Redirect=Log&logNo=221407173464 2015년형 그랜저HG 240모던 흰파썬 소개합니다!
https://blog.naver.com/va85hgnmvcl?Redirect=Log&logNo=221687347280 [미얀마 만달레이 호텔] 썬 파 트래블 & 투어 인접한 더...
https://kin.naver.com/qna/detail.nhn?d1id=1&dirId=10402&docId=357557482&qb=7YyMIOydtCDsjaw=&enc=utf8§ion=kin&rank=1&search_sort=0&spq=0  파이썬 if 문에서 and로 연결하는 방법 
https://kin.naver.com/qna/detail.nhn?d1id=1&dirId=10402&docId=342332721&qb=7YyMIOydtCDsjaw=&enc=utf8§ion=kin&rank=2&search_sort=0&spq=0  파이썬 질문 ㅜㅜㅜ 
https://kin.naver.com/qna/detail.nhn?d1id=13&dirId=1303010501&docId=36135662&qb=7YyMIOydtCDsjaw=&enc=utf8§ion=kin&rank=3&search_sort=0&spq=0  메이플 법사 

In [74]:
url = 'https://search.daum.net/search'
params = {
    'w':'tot',
    'DA':'YZR',
    't__nil_searchbox':'btn',
    'sug':'',
    'sugo':'',
    'sq':'',
    'o':'',
    'q':''
}
params['q'] = '파 이 썬'
resp = download(url, params, headers, 'GET')
dom = BeautifulSoup(resp.text, 'html.parser')

[Error] https://search.daum.net/search


In [75]:
for _ in dom.find_all('div', {'class':'wrap_tit'}):
    print(_.find('a')['href'], _.find('a').text.strip())

https://keyword.ad.daum.net/short/clk?q=pzTmR4.r.eQdutMC6Q7VBrdDJlzpbPU-e3zukDClp1utpP1-moSRqpaApoIw-I_h5jhoQVnt7apJ8_AuEMGn8HQiV8xTl_NhFpoMt-RX_T5hqvLoM4-zWhvTSnVoCK2pe_hUk9NLZcW9M2YrpAxvul_RoZXihuSQjS5cAmiyvB1OafNviwr7bW1fIp_J2VDn-GKwiNLMEYuoMw00&s=http%3A%2F%2Facornedu.co.kr%2Funemployed%2Fedu_info.jsp 에이콘아카데미 공식홈페이지
https://keyword.ad.daum.net/short/clk?q=pzlBZiq-PDlrDSQi48ghS3mWfUesbvfvJZ-s-dcU2t-Ghhi6Rf8.phfGvMUZuS81STEwVFdMx-rhY7N3CI3_RBWDpEOmIVwvNtaQ8SJlUb.xiJR44GMGbwaJF3Yrye5zJrXBFdpg3-b24ZBWgOQ1CDYrSbO4lliLlGIdlhkgZXC._svm1yu7XaJr9P3CwiWrqG5zULiaF.an.tnAPsU62v4Py2mxzbjIZ2bdxswvwjdZbvHVgZZlcIPWurV1dF.tPq-mBJb6BL9a6_MJzfxW-HYNIY4xPtnSWQCtJxjuiavh.t9Ircdzr3vj7Zr1zMmS7XoNJI7-t8EXypXlmGQoFypokBusaqxC8fM0&s=https%3A%2F%2Fkmong.com%2Fcategory%2F605%3Futm_source%3Dpc_daum%26utm_medium%3Dcpc%26utm_campaign%3D6%26utm_content%3Dutmpage%26utm_term%3D%ED%8C%8C%EC%9D%B4%EC%8D%AC%26utm_group%3D605%26utm_page%3DY%26DMKW%3D%25ED%258C%258C%25EC%259D%25B4%25EC%258D%25AC%26DMSKW%3D%25ED%258C%258

#### pythonscraping.com 예제

In [76]:
resp = download('http://pythonscraping.com/pages/page3.html')
dom = BeautifulSoup(resp.text, 'lxml')

In [None]:
tag, #id, .class, .class.class.class
ul.class, ul.class.class.class
tag, tag, tag => CS
tag[id~=asdf]

In [77]:
dom.select_one('div#footer') == dom.select_one('#footer')

True

In [78]:
dom.select_one('#footer').text.strip()

'© Totally Normal Gifts, Inc. \n+234 (617) 863-0736'

In [None]:
Selector
tag1, tag2 -> 태그 2개
tag1 tag2 -> 자손 (find_all(recursive=True))
tag1 > tag2 -> 자식 (find_all(recursive=False))
tag1 + tag2 -> 형제(다음 노드) => tag2

In [79]:
dom.select_one('#footer').find_parent().name

'div'

In [80]:
len(dom.select('#wrapper > div')), len(dom.select('#wrapper > *'))

(2, 5)

In [81]:
[_.name for _ in dom.select('#wrapper > *')]

['img', 'h1', 'div', 'table', 'div']

In [82]:
dom.select_one('h1 + div').name

'div'

In [83]:
dom.select_one('h1 + div').find_previous_sibling().name

'h1'

In [84]:
dom.select_one('body > div > h1 + div').name

'div'

In [85]:
[_.text.strip() for _ in dom.select('.gift > td:nth-of-type(3)')]

['$15.00', '$10,000.52', '$10,005.00', '$0.50', '$1.50']

In [86]:
[_['src'].strip() for _ in dom.select('.gift > td > img')]

['../img/gifts/img1.jpg',
 '../img/gifts/img2.jpg',
 '../img/gifts/img3.jpg',
 '../img/gifts/img4.jpg',
 '../img/gifts/img6.jpg']

In [87]:
url = 'http://example.webscraping.com/places/default/index'
resp = download(url)
dom = BeautifulSoup(resp.content, 'html.parser')

In [88]:
for _ in dom.select('a'):
    print(_['href'])

#
/places/default/user/register?_next=/places/default/index
/places/default/user/login?_next=/places/default/index
/places/default/index
/places/default/search
/places/default/view/Afghanistan-1
/places/default/view/Aland-Islands-2
/places/default/view/Albania-3
/places/default/view/Algeria-4
/places/default/view/American-Samoa-5
/places/default/view/Andorra-6
/places/default/view/Angola-7
/places/default/view/Anguilla-8
/places/default/view/Antarctica-9
/places/default/view/Antigua-and-Barbuda-10
/places/default/index/1


In [89]:
url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
resp = download(url)
dom = BeautifulSoup(resp.content, 'html.parser')

In [90]:
for _ in dom.select('a'):
    print(_['href'])

#
/places/default/user/register?_next=/places/default/view/Afghanistan-1
/places/default/user/login?_next=/places/default/view/Afghanistan-1
/places/default/index
/places/default/search
/places/default/continent/AS
/places/default/iso/TM
/places/default/iso/CN
/places/default/iso/IR
/places/default/iso/TJ
/places/default/iso/PK
/places/default/iso/UZ
/places/default/edit/Afghanistan-1


In [91]:
url = 'http://example.webscraping.com/places/default/index'
resp = download(url)
dom = BeautifulSoup(resp.content, 'html.parser')

In [None]:
urls.pop() 꺼내기, urls.append() 추가

In [None]:
Queue => FIFO => pop(0)
Stack => LIFO => pop(-1)

In [92]:
urls = list()
seen = list()
urls.append(url)
# Every URL ever enqueued (always equal to seen + urls), kept as a set so the
# duplicate check is O(1) instead of scanning both lists for every link.
queued = {url}

while urls: # FIFO queue -> breadth-first crawl
    seed = urls.pop(0) # next page to visit
    seen.append(seed) # mark as visited so it is never fetched again
    dom = BeautifulSoup(download(seed).text, 'html.parser') # HTTP
    # Follow only site-relative links (href starting with '/').
    for href in [a['href'] for a in dom.select('a')
                 if a.has_attr('href') and a['href'].startswith('/')]:
        # Normalize: keep only the path (drops the ?query part) and make it absolute.
        newUrls = urljoin(seed, urlparse(href)[2])
        if newUrls not in queued:
            queued.add(newUrls)
            urls.append(newUrls)
    print(len(urls), len(seen))

14 1
13 2
12 3
11 4
18 5
20 6
26 7
34 8
35 9
37 10
41 11
42 12
43 13
43 14
54 15
100 16
102 17
112 18
116 19


KeyboardInterrupt: 

In [93]:
urls # 남은 것들

['http://example.webscraping.com/places/default/iso/PK',
 'http://example.webscraping.com/places/default/iso/UZ',
 'http://example.webscraping.com/places/default/edit/Afghanistan-1',
 'http://example.webscraping.com/places/default/continent/EU',
 'http://example.webscraping.com/places/default/iso//',
 'http://example.webscraping.com/places/default/edit/Aland-Islands-2',
 'http://example.webscraping.com/places/default/iso/MK',
 'http://example.webscraping.com/places/default/iso/GR',
 'http://example.webscraping.com/places/default/iso/CS',
 'http://example.webscraping.com/places/default/iso/ME',
 'http://example.webscraping.com/places/default/iso/RS',
 'http://example.webscraping.com/places/default/iso/XK',
 'http://example.webscraping.com/places/default/edit/Albania-3',
 'http://example.webscraping.com/places/default/continent/AF',
 'http://example.webscraping.com/places/default/iso/NE',
 'http://example.webscraping.com/places/default/iso/EH',
 'http://example.webscraping.com/places/def

In [94]:
urlparse('http://example.webscraping.com/places/default/user/register?_next=/places/default/view/Afghanistan-1')

ParseResult(scheme='http', netloc='example.webscraping.com', path='/places/default/user/register', params='', query='_next=/places/default/view/Afghanistan-1', fragment='')

In [95]:
# 깔끔하게 path까지만 # query 부분 생략

In [96]:
# Tidier version of the crawl loop
seen = []
urls = [url]

while len(urls) > 0: # FIFO queue
    seed = urls.pop(0) # page to visit next
    seen.append(seed) # remember it to avoid revisiting
    page = download(seed) # HTTP
    dom = BeautifulSoup(page.text, 'html.parser')
    for anchor in dom.select('a'):
        if not anchor.has_attr('href'):
            continue
        href = anchor['href']
        if not href.startswith('/'):
            continue
        # Path only (query string dropped), resolved against the current page.
        newUrls = urljoin(seed, urlparse(href)[2])
        if newUrls in seen or newUrls in urls:
            continue
        urls.append(newUrls)
    print(len(urls), len(seen))

14 1
13 2
12 3
11 4
18 5
20 6
26 7
34 8
35 9
37 10
41 11
42 12
43 13
43 14
54 15
100 16
102 17
112 18
116 19


KeyboardInterrupt: 