# Library import & version check

In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
from selenium import webdriver

In [4]:
requests.__version__

'2.22.0'

In [5]:
# Evaluates the imported class itself; the recorded output is the class repr,
# not a version number.
BeautifulSoup

bs4.BeautifulSoup

In [6]:
webdriver.__version__

'3.14.1'

In [7]:
import tensorflow as tf

In [8]:
tf.__version__

'2.1.0'

In [9]:
import nltk

In [10]:
nltk.__version__

'3.4.5'

In [16]:
# Called without arguments; the recorded output shows the nltk_data index URL
# being loaded and a return value of True.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Scraping / Crawling 실습

## requests 패키지

### GET 방식 request

In [11]:
def getDownload( url, param = None, retries = 3 ):
    """Send a GET request, re-sending it while the server answers with a 5xx status.

    Args:
        url: Address to request.
        param: Optional query-string parameters, forwarded as ``params``.
        retries: Number of additional attempts allowed after a 5xx response.

    Returns:
        The last response received. When the request ultimately fails, the
        error response is still returned after its status code, reason and
        request headers have been printed.

    Only ``requests.exceptions.HTTPError`` is handled here; any other
    exception raised while sending the request propagates to the caller.
    """
    attempts_left = retries
    while True:
        resp = None
        try:
            resp = requests.get( url, params = param )
            resp.raise_for_status()
        except requests.exceptions.HTTPError:
            is_server_error = 500 <= resp.status_code < 600
            if is_server_error and attempts_left > 0:
                print( 'Retries : {0}'.format( attempts_left ) )
                attempts_left -= 1
                continue
            # Not retryable, or the retry budget is used up: report the failure.
            for detail in ( resp.status_code, resp.reason, resp.request.headers ):
                print( detail )
        return resp

In [12]:
# This URL answered with status 500 in the recorded run, which exercises the
# retry branch of getDownload (three retries, then the failure report).
url = 'http://www.crawler-test.com/status_codes/status_500'
getDownload( url )

Retries : 3
Retries : 2
Retries : 1
500
Internal Server Error
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


<Response [500]>

### POST 방식 request

In [13]:
def postDownload( url, data = None, param = None, retries = 3 ):
    """Send a POST request, re-sending it while the server answers with a 5xx status.

    Args:
        url: Address to post to.
        data: Optional form payload sent as the request body.
        param: Optional query-string parameters, forwarded as ``params``.
        retries: Number of additional attempts allowed after a 5xx response.

    Returns:
        The last response received. When the request ultimately fails, the
        error response is still returned after its status code, reason and
        request headers have been printed.

    Only ``requests.exceptions.HTTPError`` is handled here; any other
    exception raised while sending the request propagates to the caller.
    """
    resp = None
    try:
        resp = requests.post( url, data = data, params = param )
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        if 500 <= resp.status_code < 600 and retries > 0:
            print( 'Retries : {0}'.format( retries ) )
            # Pass every argument by keyword so the form body, the query
            # parameters and the shrinking retry budget each reach the right
            # parameter on the next attempt.
            return postDownload( url, data = data, param = param, retries = retries - 1 )
        else:
            print( resp.status_code )
            print( resp.reason )
            print( resp.request.headers )
    return resp

In [14]:
url = 'http://pythonscraping.com/pages/files/processing.php'
data = { 'firstname':'테스트', 'lastname':1234 }

In [15]:
# Send the form, then inspect what was transmitted: the URL-encoded request
# body, the request headers, and finally the server's reply text.
html = postDownload( url, data )
print( html.request.body )
print( '------------------------------' )
print( html.request.headers )
html.text

firstname=%ED%85%8C%EC%8A%A4%ED%8A%B8&lastname=1234
------------------------------
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Content-Length': '51', 'Content-Type': 'application/x-www-form-urlencoded'}


'Hello there, 테스트 1234!'

### Cookie 활용

In [16]:
def postDownloadCookie( url, data = None, param = None, cookie = None, retries = 3 ):
    """Send a POST request carrying cookies, re-sending it on 5xx responses.

    Args:
        url: Address to post to.
        data: Optional form payload sent as the request body.
        param: Optional query-string parameters, forwarded as ``params``.
        cookie: Optional cookies, forwarded as ``cookies``.
        retries: Number of additional attempts allowed after a 5xx response.

    Returns:
        The last response received. When the request ultimately fails, the
        error response is still returned after its status code, reason and
        request headers have been printed.

    Only ``requests.exceptions.HTTPError`` is handled here; any other
    exception raised while sending the request propagates to the caller.
    """
    attempts_left = retries
    while True:
        resp = None
        try:
            resp = requests.post( url, data = data, cookies = cookie, params = param )
            resp.raise_for_status()
        except requests.exceptions.HTTPError:
            is_server_error = 500 <= resp.status_code < 600
            if is_server_error and attempts_left > 0:
                print( 'Retries : {0}'.format( attempts_left ) )
                attempts_left -= 1
                continue
            # Not retryable, or the retry budget is used up: report the failure.
            for detail in ( resp.status_code, resp.reason, resp.request.headers ):
                print( detail )
        return resp

In [17]:
url = 'http://pythonscraping.com/pages/files/processing.php'
data = { 'firstname':'test', 'lastname':1234 }

In [18]:
html = postDownloadCookie( url, data )
cookie = html.cookies.get_dict()

In [19]:
# Replay the POST with the cookies captured from the previous response.
# `cookie` has to be passed by keyword: the third positional parameter of
# postDownloadCookie is `param`, so a positional argument here would be sent
# as query-string parameters instead of cookies.
html = postDownloadCookie( url, data, cookie = cookie )
html.text

'Hello there, test 1234!'

In [20]:
session = requests.Session()

In [21]:
# These field names differ from the firstname/lastname keys posted earlier;
# the recorded reply to this payload is 'Hello there,  !'.
data = { 'username':'test', 'password':'password' }

In [22]:
html = session.post( url, data )

In [23]:
html.text

'Hello there,  !'

In [24]:
html = session.post( url )

In [25]:
html.text

'Hello there,  !'

## BeautifulSoup이용한 HTML 분석

In [26]:
# Sample document for the find() / find_all() examples. The <b>, <c> and <d>
# elements are closed with </a>; the parsed output shows them nested inside
# one another.
html = '''
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to page 1</a>
                <a class="blue">Go to page 2</a>
                <a class="green">Go to page 3</a>
                <a class="red">Go to page 4</a>
                <b class="yellow">Go to page 5</a>
                <c id="gray">Go to page 6</a>
                <d id="red">Go to page 7</a>
            </p>
        </div>
    </body>
</html>
'''

In [27]:
dom = BeautifulSoup( html, 'lxml' )

In [28]:
dom

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>BeautifulSoup Training</title>
</head>
<body>
<div id="result">
<p class="row">
<a class="red">Go to page 1</a>
<a class="blue">Go to page 2</a>
<a class="green">Go to page 3</a>
<a class="red">Go to page 4</a>
<b class="yellow">Go to page 5
                <c id="gray">Go to page 6
                <d id="red">Go to page 7
            </d></c></b></p>
</div>
</body>
</html>

### find() / find_all()이용한 원하는 tag 내용 추출

In [29]:
dom.find( 'a' )

<a class="red">Go to page 1</a>

In [30]:
dom.find_all( 'a' )

[<a class="red">Go to page 1</a>,
 <a class="blue">Go to page 2</a>,
 <a class="green">Go to page 3</a>,
 <a class="red">Go to page 4</a>]

In [31]:
# In the recorded output the empty tag name did not restrict the match: the
# element is selected by its id attribute alone.
dom.find( '', { "id":"result" } )

<div id="result">
<p class="row">
<a class="red">Go to page 1</a>
<a class="blue">Go to page 2</a>
<a class="green">Go to page 3</a>
<a class="red">Go to page 4</a>
<b class="yellow">Go to page 5
                <c id="gray">Go to page 6
                <d id="red">Go to page 7
            </d></c></b></p>
</div>

In [32]:
dom.find( '', { 'class':'red' } )

<a class="red">Go to page 1</a>

In [33]:
dom.find_all( '', { 'class':'red' } )

[<a class="red">Go to page 1</a>, <a class="red">Go to page 4</a>]

In [34]:
url = 'http://pythonscraping.com/pages/page3.html'
html = getDownload( url )
html.text

'<html>\n<head>\n<style>\nimg{\n\twidth:75px;\n}\ntable{\n\twidth:50%;\n}\ntd{\n\tmargin:10px;\n\tpadding:10px;\n}\n.wrapper{\n\twidth:800px;\n}\n.excitingNote{\n\tfont-style:italic;\n\tfont-weight:bold;\n}\n</style>\n</head>\n<body>\n<div id="wrapper">\n<img src="../img/gifts/logo.jpg" style="float:left;">\n<h1>Totally Normal Gifts</h1>\n<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is\nhand-curated by well-paid, free-range Tibetan monks.<p>\nWe haven\'t figured out how to make online shopping carts yet, but you can send us a check to:<br>\n123 Main St.<br>\nAbuja, Nigeria\n</br>We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</div>\n<table id="giftList">\n<tr><th>\nItem Title\n</th><th>\nDescription\n</th><th>\nCost\n</th><th>\nImage\n</th></tr>\n\n<tr id="gift1" class="gift"><td>\nVegetable Basket\n</td><td>\nThis vegetable basket is the perfec

In [35]:
dom = BeautifulSoup( html.text, 'lxml' )
dom

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [36]:
footer = dom.find( 'div', { 'id':'footer' } )
footer

<div id="footer">
© Totally Normal Gifts, Inc. <br/>
+234 (617) 863-0736
</div>

In [37]:
parent = footer.find_parent()
parent.name, parent.attrs

('div', {'id': 'wrapper'})

In [38]:
# recursive = False keeps find_all() at the first level below `parent`, so
# only its direct child elements are listed (tag name and attributes).
children = parent.find_all( recursive = False )
for row in children:
    print( row.name, row.attrs )

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [39]:
aList = dom.find_all( 'tr' )
aList

[<tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img

In [40]:
# For every table row, take its third direct child cell (index 2) and print
# the stripped text: the cost column, header row included.
for row in aList:
    print( row.find_all( recursive = False )[ 2 ].text.strip() )

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


### select_one() / select()이용한 원하는 tag 내용 추출

In [41]:
html = '''
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to page 1</a>
                <a class="blue">Go to page 2</a>
                <a class="green">Go to page 3</a>
                <a class="red">Go to page 4</a>
                <b class="yellow">Go to page 5</a>
                <c id="gray">Go to page 6</a>
                <d id="red">Go to page 7</a>
            </p>
        </div>
    </body>
</html>
'''

In [42]:
dom = BeautifulSoup( html, 'lxml' )
dom

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>BeautifulSoup Training</title>
</head>
<body>
<div id="result">
<p class="row">
<a class="red">Go to page 1</a>
<a class="blue">Go to page 2</a>
<a class="green">Go to page 3</a>
<a class="red">Go to page 4</a>
<b class="yellow">Go to page 5
                <c id="gray">Go to page 6
                <d id="red">Go to page 7
            </d></c></b></p>
</div>
</body>
</html>

In [43]:
dom.select_one( 'a' )

<a class="red">Go to page 1</a>

In [44]:
dom.select( 'a' )

[<a class="red">Go to page 1</a>,
 <a class="blue">Go to page 2</a>,
 <a class="green">Go to page 3</a>,
 <a class="red">Go to page 4</a>]

In [45]:
dom.select_one( '#gray' )

<c id="gray">Go to page 6
                <d id="red">Go to page 7
            </d></c>

In [46]:
dom.select_one( '.red' )

<a class="red">Go to page 1</a>

In [47]:
dom.select( '#gray' )

[<c id="gray">Go to page 6
                 <d id="red">Go to page 7
             </d></c>]

In [48]:
url = 'https://media.daum.net/issue/5008621'
html = requests.get( url )
html_text = html.text

In [49]:
dom = BeautifulSoup( html_text, 'lxml' )

In [50]:
# CSS selector built from child combinators ( > ), walking from #cMain down to
# the headline anchors (a.link_txt); select() returns every matching element.
news_list = dom.select( 
    '#cMain > div#mArticle > ul > li > div > strong.tit_thumb > a.link_txt' )

In [51]:
print( news_list[ 0 ] )

<a class="link_txt" href="http://v.media.daum.net/v/20200228192440612">독일 하루 동안 코로나19 확진자 22명..북부서도 발생</a>


In [52]:
for news in news_list:
    print( news )

<a class="link_txt" href="http://v.media.daum.net/v/20200228192440612">독일 하루 동안 코로나19 확진자 22명..북부서도 발생</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228192439611">국회, 코로나19 지원 서두른다..2월국회 내 추경 처리 가능...</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228191908511">강경화, 중국·베트남주재 공관장과 화상회의.."국민보호 만전"</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228191639465">문 대통령 "마스크 문제 송구..대책냈으니 내일·모레까지 효과...</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228191538437">대구서 자가격리 69세 여성 숨져..국내 14번째</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228190808275">문 대통령 "중국인 입국금지 불가능..지금은 실효성 없어"</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228190739266">이탈리아 정부, 무증상 감염자 확진자 통계서 배제 검토</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228190538224">박원순 "서울 신천지 627명 통화 안돼..이만희, 국민에 사...</a>
<a class="link_txt" href="http://v.media.daum.net/v/20200228190238178">세종·대구서 공무원 확진 잇따라..정부청사 관리 '초비상'</a>
<a class="link_txt"

In [53]:
# Collect the visible text of every headline anchor, echoing each one as it
# is stored. Iterating over the elements directly replaces the manual
# range( 0, len( ... ) ) index loop.
news_lists = []
for anchor in news_list:
    news_lists.append( anchor.text )
    print( anchor.text )

독일 하루 동안 코로나19 확진자 22명..북부서도 발생
국회, 코로나19 지원 서두른다..2월국회 내 추경 처리 가능...
강경화, 중국·베트남주재 공관장과 화상회의.."국민보호 만전"
문 대통령 "마스크 문제 송구..대책냈으니 내일·모레까지 효과...
대구서 자가격리 69세 여성 숨져..국내 14번째
문 대통령 "중국인 입국금지 불가능..지금은 실효성 없어"
이탈리아 정부, 무증상 감염자 확진자 통계서 배제 검토
박원순 "서울 신천지 627명 통화 안돼..이만희, 국민에 사...
세종·대구서 공무원 확진 잇따라..정부청사 관리 '초비상'
정부, 한국민 입국제한국에 '여행주의보'.."방문 재고해야"


In [54]:
for text in news_lists:
    print( text )

독일 하루 동안 코로나19 확진자 22명..북부서도 발생
국회, 코로나19 지원 서두른다..2월국회 내 추경 처리 가능...
강경화, 중국·베트남주재 공관장과 화상회의.."국민보호 만전"
문 대통령 "마스크 문제 송구..대책냈으니 내일·모레까지 효과...
대구서 자가격리 69세 여성 숨져..국내 14번째
문 대통령 "중국인 입국금지 불가능..지금은 실효성 없어"
이탈리아 정부, 무증상 감염자 확진자 통계서 배제 검토
박원순 "서울 신천지 627명 통화 안돼..이만희, 국민에 사...
세종·대구서 공무원 확진 잇따라..정부청사 관리 '초비상'
정부, 한국민 입국제한국에 '여행주의보'.."방문 재고해야"


In [56]:
# Write one headline per line, encoded as UTF-8.
# NOTE(review): '/nlp/news_list.txt' is an absolute path — confirm the /nlp
# directory exists on the machine running this notebook.
with open( '/nlp/news_list.txt', 'w', encoding = 'utf-8' ) as f:
    for text in news_lists:
        f.write( text + '\n' )

## Selenium - Webdriver 사용

In [57]:
# Create the Chrome webdriver, passing the chromedriver location.
# NOTE(review): '/nlp/chromedriver' is an absolute path — confirm it matches
# the local installation.
path = '/nlp/chromedriver'
driver = webdriver.Chrome( path )

In [127]:
url = 'http://example.webscraping.com/places/default/search'
driver.get( url )

In [130]:
# Empty the search box first so a re-run does not append to earlier input,
# then type the query and click the search button.
driver.find_element_by_id( 'search_term' ).clear()
driver.find_element_by_id( 'search_term' ).send_keys( 'korea' )
driver.find_element_by_id( 'search' ).click()

In [133]:
# Print the label and the target URL of every link inside the results element.
results = driver.find_element_by_id( 'results' )
for tag in results.find_elements_by_tag_name( 'a' ):
    print( tag.text )
    print( tag.get_attribute( 'href' ) )

North Korea
http://example.webscraping.com/places/default/view/North-Korea-165
South Korea
http://example.webscraping.com/places/default/view/South-Korea-211


In [134]:
# Open Google, locate the input named "q", type the query and submit the
# form that contains it.
driver.get( 'https://www.google.co.kr' )
search = driver.find_element_by_name( "q" )
search.send_keys( "파이썬" )
search.submit()

In [None]:
# Keys is not imported in any of the visible cells, so bring it into scope
# here; without it the final send_keys() call raises NameError.
from getpass import getpass
from selenium.webdriver.common.keys import Keys

# Prompt for the credentials at run time instead of storing them in the
# notebook; getpass() keeps the password out of the echoed input.
user = input( 'facebook id: ' )
pwd = getpass( 'facebook password: ' )

driver.get( 'https://www.facebook.com' )

# Fill in the login form, then submit it by pressing Return in the password field.
element = driver.find_element_by_id( 'email' )
element.send_keys( user )
element = driver.find_element_by_id( 'pass' )
element.send_keys( pwd )
element.send_keys( Keys.RETURN )