# 空氣污染監測網 網路爬蟲實作練習


* 能夠利用 selenium + BeautifulSoup 撰寫爬蟲，並存放到合適的資料結構


## 作業目標

根據範例 ，完成以下問題：

* ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料
* ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料





In [2]:
# 打開瀏覽器
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
browser = webdriver.Chrome(executable_path='/Applications/Google Chrome.app/Contents/MacOS/chromedriver')
browser.get("http://taqm.epa.gov.tw/taqm/tw/MonthlyAverage.aspx")

In [3]:
# 模擬使用者操作行為，選擇/點擊
selectSite = Select(browser.find_element_by_id("ctl05_ddlSite"))
selectSite.select_by_value('11')
selectYear = Select(browser.find_element_by_id("ctl05_ddlYear"))
selectYear.select_by_value('2018')
browser.find_element_by_id('ctl05_btnQuery').click()

In [4]:
# 取得資料，丟到 BeautifulSoup 解析
html_source = browser.page_source
soup = BeautifulSoup(html_source, 'lxml')
table = soup.find('table', class_='TABLE_G')

In [5]:
import re
date_pattern = re.compile('\d{4}\/\d{2}')
value_pattern = re.compile('\d')

In [6]:
table.find('td').has_attr('style')

True

In [7]:
d = {}
for i,j in enumerate(table.find_all('td')):
#     print(i,j)
    if j.has_attr('style'):
        col = j.text
        d[col] = {}
    else:
        if date_pattern.match(j.text):
            date = j.text
            d[col][date] = {}
        elif value_pattern.match(j.text):
            value = j.text
            d[col][date] = value

In [8]:
print(d)

{'SO2': {'2018/01': '1.80', '2018/02': '1.90', '2018/03': '2.20', '2018/04': '2.30', '2018/05': '3.10', '2018/06': '2.70', '2018/07': '2.20', '2018/08': '2.40', '2018/09': '2.10', '2018/10': '1.70', '2018/11': '1.90', '2018/12': '1.80'}, 'CO': {'2018/01': '0.34', '2018/02': '0.44', '2018/03': '0.40', '2018/04': '0.38', '2018/05': '0.34', '2018/06': '0.29', '2018/07': '0.21', '2018/08': '0.30', '2018/09': '0.26', '2018/10': '0.29', '2018/11': '0.30', '2018/12': '0.35'}, 'O3': {'2018/01': '33.40', '2018/02': '32.50', '2018/03': '35', '2018/04': '38.40', '2018/05': '31.60', '2018/06': '29.50', '2018/07': '18.70', '2018/08': '26.40', '2018/09': '29.10', '2018/10': '45.90', '2018/11': '32.40', '2018/12': '30.70'}, 'PM10': {'2018/01': '23', '2018/02': '41', '2018/03': '39', '2018/04': '48', '2018/05': '37', '2018/06': '26', '2018/07': '24', '2018/08': '26', '2018/09': '28', '2018/10': '33', '2018/11': '25', '2018/12': '21'}, 'NOx': {'2018/01': '14.17', '2018/02': '19.43', '2018/03': '18.50',

In [9]:
import pandas as pd
df = pd.DataFrame(d)
df

Unnamed: 0,SO2,CO,O3,PM10,NOx,NO,NO2,THC,NMHC,CH4
2018/01,1.8,0.34,33.4,23,14.17,3.41,10.77,,,
2018/02,1.9,0.44,32.5,41,19.43,4.48,14.95,,,
2018/03,2.2,0.4,35.0,39,18.5,4.93,13.56,,,
2018/04,2.3,0.38,38.4,48,15.24,2.6,12.64,,,
2018/05,3.1,0.34,31.6,37,15.2,2.26,12.94,,,
2018/06,2.7,0.29,29.5,26,13.79,2.41,11.38,,,
2018/07,2.2,0.21,18.7,24,11.6,2.68,8.92,,,
2018/08,2.4,0.3,26.4,26,14.75,2.68,12.08,,,
2018/09,2.1,0.26,29.1,28,12.4,2.41,10.0,,,
2018/10,1.7,0.29,45.9,33,12.45,2.13,10.33,,,


### ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料

In [10]:
q1_pattern = re.compile('2018\/[0][1-8]')

In [11]:
d = {}
for i,j in enumerate(table.find_all('td')):
    if j.text == 'SO2':
        col = j.text
        d[col] = {}
    else:
        if q1_pattern.match(j.text):
            date = j.text
            d[col][date] = {}
        elif value_pattern.match(j.text):
            value = j.text
            d[col][date] = value
    if j.text == 'CO':
        break

In [12]:
import pandas as pd
df = pd.DataFrame(d)
df

Unnamed: 0,SO2
2018/01,1.8
2018/02,1.9
2018/03,2.2
2018/04,2.3
2018/05,3.1
2018/06,2.7
2018/07,2.2
2018/08,1.8


### ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料

In [13]:
d = {}
for i,j in enumerate(table.find_all('td')):
    if j.has_attr('style') and j.text != 'O3':
        col = j.text
        d[col] = {}
    else:
        if q1_pattern.match(j.text):
            date = j.text
            d[col][date] = {}
        elif value_pattern.match(j.text):
            value = j.text
            d[col][date] = value
    if j.text == 'O3':
        break

In [14]:
import pandas as pd
df = pd.DataFrame(d)
df

Unnamed: 0,SO2,CO
2018/01,1.8,0.34
2018/02,1.9,0.44
2018/03,2.2,0.4
2018/04,2.3,0.38
2018/05,3.1,0.34
2018/06,2.7,0.29
2018/07,2.2,0.21
2018/08,1.8,0.35
