In [81]:
'''
匯入套件
'''
# 操作 browser 的 API
from selenium.webdriver.chrome.service import Service
from selenium import webdriver

# 處理逾時例外的工具
from selenium.common.exceptions import TimeoutException

# Explicit-wait helper for dynamic pages: blocks until a condition on an element holds; usually paired with expected_conditions
from selenium.webdriver.support.ui import WebDriverWait

# 搭配 WebDriverWait 使用，對元素狀態的一種期待條件，若條件發生，則等待結束，往下一行執行
from selenium.webdriver.support import expected_conditions as EC

# 期待元素出現要透過什麼方式指定，通常與 EC、WebDriverWait 一起使用
from selenium.webdriver.common.by import By

# ActionChains: simulates mouse movement, clicks, drags and right-click context menus, as well as keyboard text input and key presses, in WebDriver
from selenium.webdriver.common.action_chains import ActionChains

# 強制等待 (執行期間休息一下)
from time import sleep

# 整理 json 使用的工具
import json

# 執行 command 的時候用的
import os

import re

In [82]:
# Advanced Chrome launch flags used when Selenium starts the browser.
# Reference: https://stackoverflow.max-everyday.com/2019/12/selenium-chrome-options/
chrome_flags = [
    "--headless",                 # run in the background without opening a visible browser window
    # "--start-maximized",        # maximize the window (currently disabled)
    "--incognito",                # open in incognito mode
    "--disable-popup-blocking",   # disable the pop-up blocker
    "--disable-notifications",    # turn off Chrome push notifications
    "--lang=zh-TW",               # set the browser language to Traditional Chinese
]

# Browser launch options; flags are applied in the order listed above.
my_options = webdriver.ChromeOptions()
for flag in chrome_flags:
    my_options.add_argument(flag)

# Chrome WebDriver, driven through the chromedriver binary located next to the notebook.
chromedriver_service = Service(executable_path="./chromedriver.exe")
driver = webdriver.Chrome(service=chromedriver_service, options=my_options)

In [83]:
# Directory that receives one .txt file per downloaded book.
filePath = "Bookshelf_Selenium"

# exist_ok=True keeps this cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(filePath, exist_ok=True)

In [84]:
# Open the Project Gutenberg "browse by language" page for Chinese (zh).
catalog_url = "https://www.gutenberg.org/browse/languages/zh"
driver.get(catalog_url)

In [85]:
# books = []
# titles = driver.find_elements(By.CSS_SELECTOR, "li.pgdbetext > a")

# for title in titles:
#     if re.search(r"[\u4E00-\u9FFF]+", title.text) != None:
#         books.append({"Name": title.text,"URL": title.get_attribute('href'),"location": title.location})

In [86]:
# Collect every book-title link on the catalogue page.
# `titles` defaults to an empty list so that the following cells still run
# (and simply find nothing) if the page does not load in time, instead of
# failing with NameError or silently reusing stale elements from a prior run.
titles = []
try:
    # Waits up to 5 seconds; returns the list of all matching elements as
    # soon as at least one of them is present in the DOM.
    titles = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "li.pgdbetext > a")
        )
    )
except TimeoutException:
    print("Page Books 等待逾時!")

In [87]:
# Keep only the link texts that contain at least one character in the
# U+4E00..U+9FFF range (CJK Unified Ideographs).
han_pattern = re.compile(r"[\u4E00-\u9FFF]+")
books = [
    link.text
    for link in titles
    if han_pattern.search(link.text) is not None
]

In [88]:
# Display how many link texts passed the CJK filter (rendered as this cell's output).
len(books)

478

In [89]:
# Create the ActionChains instance bound to the driver; the download loop uses it to perform clicks.
ac = ActionChains(driver)

In [90]:
# Download loop: for each collected title, open the book page, follow the
# "Plain Text UTF-8" link, extract the Chinese text from the <pre> element and
# append it to "<filePath>/<title>.txt". Stops once the output directory holds
# more than MAX_FILES entries.
#
# Any step that times out skips the current book entirely, so text from a
# previous book is never written under the wrong title.

LIST_URL = "https://www.gutenberg.org/browse/languages/zh"
MAX_FILES = 100          # stop after the output directory exceeds this many entries
WAIT_SECONDS = 5         # explicit-wait timeout for every page transition

# Runs of CJK ideographs followed by non-word characters (punctuation / whitespace).
# Raw string: avoids the invalid "\W" escape warning of a plain string literal.
HAN_RUN = re.compile(r"[\u4E00-\u9FFF]+\W+")
# Characters that are path separators or not allowed in Windows file names.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _wait_for(locator, timeout=WAIT_SECONDS):
    """Return the element matching `locator` once present; raise TimeoutException otherwise."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator)
    )


for name in books:
    # Always start from the catalogue page, so an aborted previous iteration
    # cannot leave the browser on the wrong page.
    driver.get(LIST_URL)

    # Step 1: open the book's page. Waiting on the link itself also covers the
    # case where the title cannot be found (it is skipped instead of crashing).
    try:
        book_link = _wait_for((By.LINK_TEXT, name))
    except TimeoutException:
        print("Page Books 等待逾時!")
        continue
    ac.click(book_link).perform()

    # Step 2: choose the plain-text edition.
    try:
        txt_link = _wait_for((By.LINK_TEXT, "Plain Text UTF-8"))
    except TimeoutException:
        print("Page Select Sources 等待逾時!")
        continue
    ac.click(txt_link).perform()

    # Step 3: read the book text from the <pre> element.
    try:
        pre_element = _wait_for((By.TAG_NAME, "pre"))
    except TimeoutException:
        print("Page Source 等待逾時!")
        continue
    contents = HAN_RUN.findall(pre_element.text)

    # Build the file name outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12. Whitespace becomes a plain
    # space; characters that cannot appear in a file name become "_".
    safe_name = UNSAFE_FILENAME_CHARS.sub("_", re.sub(r"\s", " ", name))
    with open(f"{filePath}/{safe_name}.txt", "ab") as f:
        f.write(" ".join(contents).encode("UTF-8"))

    if len(os.listdir(filePath)) > MAX_FILES:
        break