# Project: mapIt.py with the webbrowser Module

webbrowser.open('URL網頁地址') 打開瀏覽器

In [1]:
import webbrowser

# Open the page in the system's default web browser.
homepage_url = 'http://inventwithpython.com/'
webbrowser.open(homepage_url)

True

以下 mapIt.py 需要在command line中執行，檔案資料夾要設定在系統PATH當中才可以直接執行

sys.argv是紀錄輸入command line的字串

In [10]:
#! python3
# mapIt.py - Launches a map in the browser using an address from the command line or clipboard.

import webbrowser, sys, pyperclip
from urllib.parse import quote

# sys.argv is the list of command-line arguments; everything after the
# script name (index 0) is treated as the address.
if len(sys.argv) > 1:
    # Get address from command line (re-join the words the shell split apart).
    address = ' '.join(sys.argv[1:])
else:
    # Get address from clipboard.
    address = pyperclip.paste()

# Percent-encode the address so that spaces and URL-special characters
# ('#', '?', '&', '/', non-ASCII text, ...) stay part of the place name
# instead of being interpreted as URL syntax. safe='' also encodes '/'.
map_url = 'https://www.google.com/maps/place/' + quote(address.strip(), safe='')

# Launch browser
webbrowser.open(map_url)

True

在command line中輸入: mapIt Taiwan，即可自動開啟瀏覽器查詢Taiwan地址

# Downloading Files from the Web with the requests Module

requests.get('URL') 可以下載該檔案

In [3]:
import requests

# Download the text file. The timeout keeps the cell from hanging forever
# if the server never answers (requests has no timeout by default).
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt', timeout=10)
type(res)

requests.models.Response

In [4]:
# Compare the response's HTTP status code with requests' named constant for "OK".
res.status_code == requests.codes.ok

True

In [5]:
# Length of the downloaded text, in characters.
len(res.text)

178981

In [6]:
print(res.text[:250]) # show the first 250 characters

﻿The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Proje


res.raise_for_status() 可以檢查下載狀況：下載失敗（例如 404）時會拋出例外，下載成功則不做任何事

In [8]:
# Request a page that does not exist: raise_for_status() then raises an
# HTTPError for the 404 response (the error is the point of this demo).
# The timeout prevents the request from blocking forever.
res = requests.get('http://inventwithpython.com/page_that_does_not_exist', timeout=10)
res.raise_for_status()

HTTPError: 404 Client Error: Not Found for url: http://inventwithpython.com/page_that_does_not_exist

In [9]:
import requests

# Keep the request itself inside the try block so that connection failures
# and timeouts are reported the same way as a bad HTTP status, and the
# program can carry on instead of crashing.
try:
    res = requests.get('http://inventwithpython.com/page_that_does_not_exist', timeout=10)
    res.raise_for_status()
except requests.exceptions.RequestException as exc:
    # RequestException is the base class of the errors requests raises,
    # including the HTTPError produced by raise_for_status().
    print('There was a problem: %s' % (exc))

There was a problem: 404 Client Error: Not Found for url: http://inventwithpython.com/page_that_does_not_exist


# Saving Downloaded Files to the Hard Drive

iter_content() method 回傳 “chunks”(區塊) of the content on each iteration through the loop.

In [11]:
import requests

res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt', timeout=10)
res.raise_for_status()

# Open in write-binary mode ('wb') because iter_content() yields bytes.
# The with-block closes the file even if a write fails part-way through.
with open('RomeoAndJuliet.txt', 'wb') as playFile:
    for chunk in res.iter_content(100000):  # chunk size passed to iter_content
        playFile.write(chunk)

Review

# Parsing HTML with the BeautifulSoup Module

要指定 bs4.BeautifulSoup(res.text, "html.parser")後面的那個語法，才不會Warning

In [14]:
import requests, bs4

# Download the page; the timeout keeps the cell from hanging on a dead server.
res = requests.get('http://nostarch.com', timeout=10)
res.raise_for_status()

# Name the parser explicitly ("html.parser") so bs4 does not emit a warning.
noStarchSoup = bs4.BeautifulSoup(res.text, "html.parser")
type(noStarchSoup)

bs4.BeautifulSoup

In [15]:
# Parse the local file inside a with-block so the file handle is closed as
# soon as BeautifulSoup has read it (the original left it open).
with open('example.html') as exampleFile:
    # Omitting the explicit parser name triggers a bs4 warning.
    exampleSoup = bs4.BeautifulSoup(exampleFile, "html.parser")
type(exampleSoup)

bs4.BeautifulSoup

select() Method 尋找其中element

In [23]:
import bs4

# Read the HTML inside a with-block so the file is closed right after reading.
with open('example.html') as exampleFile:
    exampleSoup = bs4.BeautifulSoup(exampleFile.read(), "html.parser")

# '#author' is a CSS id selector: it matches the element whose id is "author".
elems = exampleSoup.select('#author')

type(elems)

list

In [18]:
# Number of elements matched by the select() call.
len(elems)

1

In [19]:
# Inspect the type of the first matched element.
type(elems[0])

bs4.element.Tag

.getText()獲取標籤中間的字串

In [31]:
# .getText() returns the text found between the element's tags.
elems[0].getText()

'Al Sweigart'

In [21]:
# str() on the element gives its HTML markup, tags included.
str(elems[0])

'<span id="author">Al Sweigart</span>'

In [22]:
# .attrs exposes the element's attributes.
elems[0].attrs

{'id': 'author'}

In [30]:
# Select every <p> element in the document and show the first one's markup.
pElems = exampleSoup.select('p')
str(pElems[0])

'<p>Download my <strong>Python</strong> book from <a href="http://\ninventwithpython.com">my website</a>.</p>'

In [32]:
# .getText() returns the text found between the element's tags (nested tags are stripped).
pElems[0].getText()

'Download my Python book from my website.'

In [26]:
# Markup of the second <p> element.
str(pElems[1])

'<p class="slogan">Learn Python the easy way!</p>'

In [27]:
# Text of the second <p> element.
pElems[1].getText()

'Learn Python the easy way!'

In [28]:
# Markup of the third <p> element.
str(pElems[2])

'<p>By <span id="author">Al Sweigart</span></p>'

In [29]:
# Text of the third <p> element.
pElems[2].getText()

'By Al Sweigart'

Getting Data from an Element’s Attributes

The get() method for Tag objects makes it simple to access attribute values from an element

In [37]:
import bs4

# Parse the file inside a with-block so the handle is closed afterwards
# (passing a bare open(...) call to BeautifulSoup never closes it).
with open('example.html') as html_file:
    soup = bs4.BeautifulSoup(html_file, "html.parser")

# Take the first <span> element in the document and show its markup.
spanElem = soup.select('span')[0]
str(spanElem)

'<span id="author">Al Sweigart</span>'

In [38]:
# get() looks up an attribute value by its name.
spanElem.get('id')

'author'

In [39]:
# Looking up an attribute the tag does not have yields None (see the cell output).
# Compare with `is`, the idiomatic (PEP 8) way to test for None.
spanElem.get('some_nonexistent_addr') is None

True

In [40]:
# .attrs exposes the element's attributes.
spanElem.attrs

{'id': 'author'}

# Project: “I’m Feeling Lucky” Google Search

In [42]:
#! python3
# lucky.py - Opens several Google search results.
# NOTE: this program is meant to be run directly from the command line
# (e.g. `lucky.py some search words`); the search terms come from sys.argv.

import requests, sys, webbrowser, bs4
from urllib.parse import urljoin

GOOGLE_URL = 'http://google.com'
MAX_TABS = 5  # open at most this many results

print('Googling...') # display text while downloading the Google page
# Pass the query through params= so requests percent-encodes it; building the
# URL by hand breaks on characters such as '&', '#' or '+'.
res = requests.get(GOOGLE_URL + '/search',
                   params={'q': ' '.join(sys.argv[1:])},
                   timeout=10)
res.raise_for_status()

# Retrieve top search result links.
# The selector '.r a' finds all <a> elements that are within an element that
# has the r CSS class.
# NOTE(review): this depends on Google's result-page markup - confirm it still matches.
soup = bs4.BeautifulSoup(res.text, "html.parser")
linkElems = soup.select('.r a')

# Open a browser tab for each result.
# Anchors without an href are skipped; urljoin keeps absolute links intact and
# resolves relative ones (e.g. '/url?q=...') against the Google host.
hrefs = [link.get('href') for link in linkElems]
for href in [h for h in hrefs if h][:MAX_TABS]:
    webbrowser.open(urljoin(GOOGLE_URL, href))

Googling...


# Project: Downloading All XKCD Comics

In [None]:
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.

import requests, os, bs4
from urllib.parse import urljoin

url = 'http://xkcd.com' # starting url
os.makedirs('xkcd', exist_ok=True) # store comics in ./xkcd

# Walk backwards through the comics via each page's "Prev" link. The loop ends
# once that link makes the URL end in '#' (presumably the oldest comic's Prev
# link is just '#' - confirm against the site).
while not url.endswith('#'):

    # Download the page.
    print('Downloading page %s...' % url)
    res = requests.get(url, timeout=30)  # timeout: never hang on a dead connection
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image (None when the page has no <img> in
    # #comic, or the <img> has no src attribute).
    comicElem = soup.select('#comic img')
    comicUrl = comicElem[0].get('src') if comicElem else None

    if not comicUrl:
        print('Could not find comic image.')
    else:
        # Download the image.
        print('Downloading image %s...' % (comicUrl))
        # urljoin resolves protocol-relative ('//imgs.xkcd.com/...'), relative
        # and absolute src values alike; blindly prefixing "http:" only works
        # for the protocol-relative form.
        imageRes = requests.get(urljoin(url, comicUrl), timeout=30)
        imageRes.raise_for_status()

        # Save the image to ./xkcd; the with-block closes the file even if a
        # write fails.
        with open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb') as imageFile:
            for chunk in imageRes.iter_content(100000):
                imageFile.write(chunk)

    # Get the Prev button's url.
    prevLinks = soup.select('a[rel="prev"]')
    prevHref = prevLinks[0].get('href') if prevLinks else None
    if not prevHref:
        # Without a Prev link there is no way to continue; stop cleanly
        # instead of failing with an IndexError/TypeError.
        print('Could not find the Prev link; stopping.')
        break
    url = 'http://xkcd.com' + prevHref

print('Done.')

Downloading page http://xkcd.com...
Downloading image //imgs.xkcd.com/comics/communicating.png...
Downloading page http://xkcd.com/1859/...
Downloading image //imgs.xkcd.com/comics/sports_knowledge.png...
Downloading page http://xkcd.com/1858/...
Downloading image //imgs.xkcd.com/comics/4th_of_july.png...
Downloading page http://xkcd.com/1857/...
Downloading image //imgs.xkcd.com/comics/emoji_movie.png...
Downloading page http://xkcd.com/1856/...
Downloading image //imgs.xkcd.com/comics/existence_proof.png...
Downloading page http://xkcd.com/1855/...
Downloading image //imgs.xkcd.com/comics/telephoto.png...
Downloading page http://xkcd.com/1854/...
Downloading image //imgs.xkcd.com/comics/refresh_types.png...
Downloading page http://xkcd.com/1853/...
Downloading image //imgs.xkcd.com/comics/once_per_day.png...
Downloading page http://xkcd.com/1852/...
Downloading image //imgs.xkcd.com/comics/election_map.png...
Downloading page http://xkcd.com/1851/...
Downloading image //imgs.xkcd.com

# Controlling the Browser with the selenium Module

先安裝selenium到conda，然後安裝Firefox第三方軟體geckodriver到PATH當中

In [4]:
from selenium import webdriver
# Start a Firefox window controlled by selenium and inspect the driver object's type.
browser = webdriver.Firefox()
type(browser)

selenium.webdriver.firefox.webdriver.WebDriver

In [5]:
# Load the page in the selenium-controlled browser.
browser.get('http://inventwithpython.com')

Finding Elements on the Page

In [6]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

browser = webdriver.Firefox()
browser.get('http://inventwithpython.com')

try:
    # Look up an element by its CSS class name. find_element(By..., ...) works
    # on old and new selenium releases; the find_element_by_* helpers were
    # removed in Selenium 4.
    elem = browser.find_element(By.CLASS_NAME, 'bookcover')
    print('Found <%s> element with that class name!' % (elem.tag_name))
except NoSuchElementException:
    # Only a missing element is expected here; any other error should surface
    # instead of being swallowed by a bare except.
    print('Was not able to find an element with that name.')

Found <img> element with that class name!


帶目標tag的變數.click() 點按鍵

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Firefox()
browser.get('http://inventwithpython.com')

# Find the <a> element whose text completely matches the text provided.
# find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
# was removed in Selenium 4.
linkElem = browser.find_element(By.LINK_TEXT, 'Read It Online')

linkElem.click() # follows the "Read It Online" link

Filling Out and Submitting Forms

WebDriver 物件的 implicitly_wait(10) 設定隱式等待：之後尋找元素時最多等待 10 秒，讓目標元素有時間出現（必須設定在實際操作的那個瀏覽器物件上）

In [28]:
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Firefox()
# Implicit wait: element look-ups on this browser keep retrying for up to
# 10 seconds before failing. It must be set on the browser that is actually
# being driven; setting it on a second webdriver.Firefox() instance only opens
# an extra window and leaves this one without any wait.
browser.implicitly_wait(10) # seconds
browser.get('http://gmail.com')

# Fill in the account name and go to the next step.
emailElem = browser.find_element(By.ID, 'identifierId')
emailElem.send_keys('@gmail.com')  # placeholder: account name goes before '@gmail.com'
linkElem = browser.find_element(By.ID, 'identifierNext')
linkElem.click()

# The password step appears after the click; the implicit wait gives the
# field time to show up.
passwordElem = browser.find_element(By.ID, 'password')
# Placeholder: never hard-code a real password in a notebook; read it at run
# time instead (e.g. with getpass.getpass()).
passwordElem.send_keys('')
linkElem = browser.find_element(By.ID, 'passwordNext')
linkElem.click()

Sending Special Keys

In [32]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Firefox()
browser.get('http://nostarch.com')
browser.implicitly_wait(10) # seconds; element look-ups wait up to this long

# Send the keys to the <body> element so they act on the page as a whole.
# find_element(By.TAG_NAME, ...) replaces find_element_by_tag_name, which was
# removed in Selenium 4.
# NOTE(review): the original author asked why the <html> tag could not be
# used here instead of <body> - still an open question.
htmlElem = browser.find_element(By.TAG_NAME, 'body')

htmlElem.send_keys(Keys.END) # scrolls to bottom
htmlElem.send_keys(Keys.HOME) # scrolls to top

Clicking Browser Button

In [None]:
# Methods that simulate the browser's own buttons. The descriptions are
# comments so that the cell is valid Python; note that running the whole cell
# performs all four actions in order and ends by closing the browser.
browser.back()     # Clicks the Back button.
browser.forward()  # Clicks the Forward button.
browser.refresh()  # Clicks the Refresh/Reload button.
browser.quit()     # Clicks the Close Window button.