In [128]:
import os
import time
import chromedriver_autoinstaller

# Analyze
import pandas as pd

# Camouflage
import random

# Verify
import numpy as np
import io
import base64
from PIL import Image
# tensorflow GPU isn't supported. So...
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from easyocr import Reader as easyocr_Reader

# Scrapy
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

In [92]:
# Variable
# city/area
city = '新北市'
# road data year
year = 110
# waiting time(seconds)
timeout = 6
# verify time min
threshold = 6

# Dataset source
dataset_pth = './opendata{}road.csv'.format(str(year))
roadurl = "https://data.gov.tw/dataset/35321"

# Scrapy site
bulmngurl = "https://building-management.publicwork.ntpc.gov.tw/bm_query.jsp?rt=3"

# debug
debug = True
execNum = 1
proxy = ""
user_agent = ""

In [49]:
def camouflage():
    """
    camouflage agents/proxy for scrapy
    """
    # agents
    user_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0', \
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0', \
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ \
          (KHTML, like Gecko) Element Browser 5.0', \
          'IBM WebExplorer /v0.94', 'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)', \
          'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)', \
          'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', \
          'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) \
           Version/6.0 Mobile/10A5355d Safari/8536.25', \
          'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) \
           Chrome/28.0.1468.0 Safari/537.36', \
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)']
    
    # proxy
    proxy = "http://113.200.214.164:9999"
    index = random.randint(0, (len(user_agents) - 1))
    user_agent = user_agents[index]
    return proxy, user_agent

In [136]:
class webbrowser:
    def __init__(self, url:str, timeout:int = 6, \
                 proxy:str = "" , user_agent:str = "", \
                 executable_path:str = "", debug:bool = False):
        """
        url: web url;
        timeout: waiting time(seconds);
        proxy: proxy;
        user_agent: user agent;
        executable_path: chrome path;
        debug: debug or not
        """
        # chrome path
        self.executable_path = executable_path \
            if executable_path \
            else str(chromedriver_autoinstaller.install())          
        self.debug = debug
        
        # button
        self._element = None
        
        # window handle
        self.handle = None
        
        # web driver
        self._options = webdriver.ChromeOptions()
        prefs = {"download.default_directory" : os.getcwd()}
        self._options.add_experimental_option("prefs",prefs)
        self.driver = webdriver.Chrome(executable_path = self.executable_path, options = self._options)
        self._options.add_argument("--incognito")        
        
        # non-standard location
        if executable_path:
            self._options.binary_location = str(executable_path) 
        
        if not debug:
            self._options.add_argument("headless")
        
        # if any(proxy/user_agent) is empty,
        # useing camouflage function to get virtual proxy/user_agent
        if not proxy or not user_agent:
            proxy, user_agent = camouflage()
        if debug: 
            print("chrome driver: {}".format(self.executable_path))  
            print("user_agent: {}".format(user_agent))
            print("proxy: {}".format(proxy))
            print("chrome url: {}".format(url))
        self._options.add_argument("user-agent={}".format(user_agent))
        self._options.add_argument("--proxy-server={}".format(proxy))
                    
        self.driver.get(url)
        
        # waiting time(seconds)
        self._timeout = timeout
        # waitting driver
        self._waitDriver = WebDriverWait(self.driver, self._timeout)
        
        # pull Image script
        self._drawImage_js_script = """
            var ele = arguments[0];
            var cnv = document.createElement('canvas');
            cnv.width = ele.width; cnv.height = ele.height;
            cnv.getContext('2d').drawImage(ele, 0, 0);
            return cnv.toDataURL('image/jpeg').substring(22);    
            """
    def get_table(self, css_selector:str = "", html = "") -> pd.DataFrame:
        """
        get dataframe;
        html: this page;
        return: table;
        """
        table = pd.DataFrame.empty
        html = html if html else self.driver.page_source
        css_selector = css_selector if css_selector else "table.responstable"
        
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.select_one(_css_selector)
        self.random_wait()
        tbs = pd.read_html(str(div))
        table = pd.concat(tbs , ignore_index=True)
        return table
        
    def pull_image(self, css_selector:str = "", to_save:bool = False, img_pth:str = ""):
        """
        pull image to verify, Remember to close the image!!;
        css_selector: css selector(default: #codeimg);
        to_save: save or not;
        img_pth: save path;
        return: image;
        """
        css_selector = css_selector if css_selector else "#codeimg"
        element = self._waitDriver.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
        img_base64 = self.driver.execute_script(self._drawImage_js_script, element)
        self.random_wait()
        
        # decode
        img_64decode = base64.b64decode(img_base64)
        # to stream
        imageStream = io.BytesIO(img_64decode)
        img = Image.open(imageStream)
        # save
        if to_save:
            img_pth = img_pth if img_pth else "buling_linces.png"
            with open(img_pth, 'wb') as image:
                image.write(img_64decode)
        
        # # Only test show image
        # from matplotlib import pyplot as plt
        # plt.subplot(122),plt.imshow(img)
        # plt.show()
        
        return img
        
    def verify(self, image = None, model = ['ch_sim','en']):
        """
        pull image , and verfy it as code;
        image: image;
        return: code;
        """        
        image = None or image
        if image == None:            
            image = self.pull_image()
            self.random_wait()
            if self.debug and image != None: 
                print("excute pull_image successfully!")
            elif self.debug:
                print("excute pull_image failed!")
        img_arr = np.array(image)
        image.close()
        reader = easyocr_Reader(model) # this needs to run only once to load the model into memory
        self.random_wait()
        try:
            result = reader.readtext(img_arr, detail = 0)
            self.random_wait()
        except Exception as ex:
            # timeout
            if self.debug: print(ex)
            result = reader.readtext(img_arr, detail = 0)
            self.random_wait(multiplier = 3, threshold = threshold)
        finally:
            if self.debug: print("code: {}".format(result[0]))
        return result[0]
        
    def set_timeout(self, timeout:int):
        """
        set timeout;
        timeout: waiting time(seconds);
        return: None;
        """
        # waiting time(seconds)
        self._timeout = timeout
        # waitting driver
        self._waitDriver = WebDriverWait(self.driver, self._timeout)
        
    def click_directly(self, css_selector:str):
        """
        find element by css_selector, 
        and click the element;
        css_selector: css selector;
        return: None;
        """
        self._element = self.driver.find_element_by_css_selector(css_selector)
        self._element.click()
        
    def presence_located(self, css_selector:str, text:str = None):
        """
        presence to be located by css-selector and query keyword by inner-text;
        css_selector: css selector;
        text: inner text;
        return: located-element;
        """
        self._waitDriver.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
        self._querykey(css_selector, text)
        return self._element
        
    def click_able(self, css_selector:str, text:str = None):
        """
        wait to be clickable by css-selector and query keyword by inner-text;
        css_selector: css selector;
        text: inner text;
        return: clickable-element;
        """
        self._waitDriver.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)))
        self._querykey(css_selector, text)
        return self._element
        
    def _querykey(self, css_selector:str, text:str = None):
        """
        query elements by css-selector and query keyword by inner-text;
        css_selector: css selector;
        text: inner text;
        return: None;
        """
        text = None or text
        elements = self.driver.find_elements_by_css_selector(css_selector)
        self._element = elements if len(elements) != 0 else elements
        if text != None:
            for element in elements:
                if text == element.text:
                    self._element = element
                    break                    
        self.random_wait()
        
    def random_wait(self, multiplier:int = 1, threshold:int = 0):
        """
        wait randomly in timeout
        return: None;
        """
        rnd = random.randint(1, (self._timeout - 1))
        rnd = rnd * multiplier
        if threshold != 0 and rnd < threshold:
            if debug: print("using threshold; orign: {}".format(str(rnd)))
            rnd = threshold
        if self.debug: print("waiting {} seconds...".format(str(rnd)))
        self.driver.implicitly_wait(rnd)
        
    def switch_window(self, children = True):
        """
        switch to window;
        children: children window or 
            parent window;
        return: None;
        """
        if children:
            self.handle = self.driver.current_window_handle
            handles = self.driver.window_handles
            self.driver.switch_to.window(handles.pop())
        else:
            self.driver.switch_to.window(self.handle)
        self.random_wait()
        
    def close(self):
        """
        close web driver;
        return: None;
        """
        self.driver.close()

In [137]:
def get_road(dataset_pth:str, roadurl:str, timeout:int, proxy:str, user_agent:str, debug:bool = False):
    """
    reading roads;
    dataset_pth: dataset path;
    roadurl: dataset source url;
    timeout: waiting time(seconds);
    return: roads in data;
    """
    is_exists = lambda pth: os.path.exists(pth)
    
    if not is_exists(dataset_pth):
        roadweb = webbrowser(roadurl, timeout, proxy = proxy, user_agent = user_agent, executable_path = "", debug = debug)
        try:
            itemarea = "div.download-item"
            downloadSelector = "button.el-button.el-button--primary.el-button--mini.is-plain"
            btnareas = roadweb.presence_located(itemarea)
            flag = True
            for btnarea in btnareas:
                if str(year) in btnarea.text:
                    download_btn = btnarea.find_element_by_css_selector(downloadSelector)
                    roadweb.random_wait()
                    download_btn.click()
                    flag = False                    
                    break
            if flag:
                raise Exception("Year is out of range!! year: {}".format(str(year)))
        except ex as Exception:
            print(ex)
        finally:
            if not is_exists(dataset_pth):
                time.sleep(timeout)
            roadweb.close()
    roads = pd.read_csv(dataset_pth, encoding='utf_8_sig')

    # debug
    if debug: 
        join_txt = "not " if not is_exists else ""
        join_pth = dataset_pth.replace('./', '{}\\'.format(os.getcwd()))
        print("'{}' was {}exists".format(join_pth, join_txt))

    # get dataset
    condition = roads.iloc[:,0] == city
    data = roads[condition].iloc[:,1:]
    return data

In [139]:
## test2
browser = webbrowser(bulmngurl, timeout, proxy = proxy, user_agent = user_agent, executable_path = "", debug = debug)

# 市/區 視窗
_css_selector = "input#D1V.getDataTxt"
browser.presence_located(_css_selector)[0].click()
browser.switch_window(True)

# 市/區 選擇
_css_selector = "div.div_w_85"
browser.click_able(_css_selector, city_area).click()
browser.switch_window(False)

# 道路
_css_selector = "input#D3"
browser.presence_located(_css_selector)[0].send_keys(city_road)

# 驗證
verify_code = browser.verify()
_css_selector = "input#Z1"
browser.presence_located(_css_selector)[0].send_keys(verify_code)

chrome driver: C:\ProgramData\Anaconda3\lib\site-packages\chromedriver_autoinstaller\99\chromedriver.exe
user_agent: Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko)            Version/6.0 Mobile/10A5355d Safari/8536.25
proxy: http://113.200.214.164:9999
chrome url: https://building-management.publicwork.ntpc.gov.tw/bm_query.jsp?rt=3
waiting 2 seconds...
waiting 3 seconds...




waiting 3 seconds...
waiting 1 seconds...
waiting 5 seconds...
waiting 2 seconds...
waiting 1 seconds...
excute pull_image successfully!
waiting 5 seconds...
Unknown C++ exception from OpenCV code


UnboundLocalError: local variable 'result' referenced before assignment

In [None]:
## test2 contiune
# 查詢
_css_selector = "button.bouton-contact"
browser.presence_located(_css_selector)[0].click()
# 選表
_css_selector = "table.responstable"
dataframe = browser.get_table(_css_selector)
# 關閉
browser.close()

In [126]:
# html = browser.driver.page_source
# soup = BeautifulSoup(html, 'html.parser')
# div = soup.select_one(_css_selector)
# tbs = pd.read_html(str(div))
# table = pd.concat(tbs , ignore_index=True)

In [127]:
dataframe

Unnamed: 0,使照號碼,建照號碼,起造人,設計人,建築地址(代表號),發照日期
0,43板使字第00024號,--,遠OO造廠代表徐有庠,文明,新北市板橋區深丘里中山路50號,043/06/29
1,58淡使字第00238-0A號,57淡建字第00238號,遠OO織股份有限公司 徐有序,陳坤榮,新北市板橋區中山路二段129號,
2,58板使字第00322號,57板建字第01014號,詹OO娥,蔡章文,新北市板橋區中山路,
3,58板使字第00497號,57板建字第01099號,陳OO,徐雨壽,新北市板橋區深丘里中山路2段65號,
4,58板使字第00498號,57板建字第01098號,鄭OO,徐雨壽,新北市板橋區深丘里中山路2段67號,
5,59板使字第00374號,58板建字第00617號,江OO等4人,陳阿芳,新北市板橋區中山路1巷6號,
6,60板使字第00207號,59板建字第00593號,陳OO惠,呂禮謙,新北市板橋區中山路50號,
7,第一頁 上一頁 [ 第1頁，共76頁 ] 下一頁 最後頁,第一頁 上一頁 [ 第1頁，共76頁 ] 下一頁 最後頁,第一頁 上一頁 [ 第1頁，共76頁 ] 下一頁 最後頁,第一頁 上一頁 [ 第1頁，共76頁 ] 下一頁 最後頁,第一頁 上一頁 [ 第1頁，共76頁 ] 下一頁 最後頁,第一頁 上一頁 [ 第1頁，共76頁 ] 下一頁 最後頁


In [66]:
roads = get_road(dataset_pth, roadurl, timeout, proxy, user_agent, debug = debug)

'C:\Users\Mu\Documents\Python Scripts\da_project\opendata110road.csv' was exists


In [67]:
# Normal main()
execNum = None or execNum
execNum = len(roads.index) if execNum == None else execNum
for i, index in enumerate(roads.index):
    city_area = roads.loc[index,:][0]
    city_road = roads.loc[index,:][1]
    
    browser = webbrowser(bulmngurl, timeout, proxy = proxy, user_agent = user_agent, executable_path = "", debug = debug)
    
    try:
        # 市/區 視窗
        _css_selector = "input#D1V.getDataTxt"
        browser.presence_located(_css_selector)[0].click()
        browser.switch_window(True)

        # 市/區 選擇
        _css_selector = "div.div_w_85"
        browser.click_able(_css_selector, city_area).click()
        browser.switch_window(False)

        # 道路
        _css_selector = "input#D3"
        browser.presence_located(_css_selector)[0].send_keys(city_road)

        # 驗證
        verify_code = browser.verify()
        _css_selector = "input#Z1"
        browser.presence_located(_css_selector)[0].send_keys(verify_code)

        # 查詢
        _css_selector = "button.bouton-contact"
        browser.presence_located(_css_selector)[0].click()

        # 選表
        _css_selector = "table.responstable"
        dataframe = browser.get_table(_css_selector)
    except ex as Exception:
            print(ex)
    finally:
        # 關閉
        browser.close()
    
    if i == execNum:
        break

chrome driver: C:\ProgramData\Anaconda3\lib\site-packages\chromedriver_autoinstaller\99\chromedriver.exe
user_agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0
proxy: http://113.200.214.164:9999
chrome url: https://building-management.publicwork.ntpc.gov.tw/bm_query.jsp?rt=3
waiting 5 seconds...
waiting 0 seconds...
waiting 3 seconds...
waiting 5 seconds...
waiting 3 seconds...
prepare to excute pull_image...
waiting 4 seconds...




waiting 4 seconds...
waiting 0 seconds...
waiting 1 seconds...
chrome driver: C:\ProgramData\Anaconda3\lib\site-packages\chromedriver_autoinstaller\99\chromedriver.exe
user_agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+           (KHTML, like Gecko) Element Browser 5.0
proxy: http://113.200.214.164:9999
chrome url: https://building-management.publicwork.ntpc.gov.tw/bm_query.jsp?rt=3
waiting 2 seconds...
waiting 1 seconds...
waiting 0 seconds...
waiting 2 seconds...
waiting 1 seconds...
prepare to excute pull_image...




waiting 4 seconds...
waiting 5 seconds...
waiting 3 seconds...
waiting 2 seconds...


缺驗證碼

In [261]:
browser = webbrowser(bulmngurl, timeout, proxy = proxy, user_agent = user_agent, executable_path = "", debug = debug)

chrome driver: C:\ProgramData\Anaconda3\lib\site-packages\chromedriver_autoinstaller\99\chromedriver.exe
user_agent: Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko)            Version/6.0 Mobile/10A5355d Safari/8536.25
proxy: http://113.200.214.164:9999


In [None]:
# #執照類型
# _css_selector = "input#A2V.getDataTxt"
# browser.click_directly(_css_selector)
# browser.switch_window(True)
# 
# #使用執照
# keyword = '使用執照'
# _css_selector = "div.div_w_130"
# browser.click_able(_css_selector, keyword).click()
# browser.switch_window(False)

In [262]:
#地址填寫
_css_selector = "input#D1V.getDataTxt"
browser.presence_located(_css_selector)[0].click()
browser.switch_window(True)

waiting 2 seconds...
waiting 0 seconds...


In [263]:
#市/區 #第1筆
keyword = city_area
_css_selector = "div.div_w_85"
browser.click_able(_css_selector, keyword).click()
browser.switch_window(False)

waiting 2 seconds...
waiting 2 seconds...


In [265]:
#道路 #第3筆
_css_selector = "input#D3"
browser.presence_located(_css_selector)[0].send_keys(city_road)

waiting 1 seconds...
