In [2]:
import requests
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from io import BytesIO
import base64
#import pytesseract
from paddleocr import PaddleOCR, draw_ocr
import os
import shutil
import datetime

# Capture today's date once; the rest of the notebook reads it through
# `formatted_date`.
today = datetime.date.today()

# ISO calendar form, year-month-day (for example: 2025-01-23).
formatted_date = today.isoformat()
# NOTE(review): presumably set so that loading PaddleOCR does not abort on a
# duplicate OpenMP runtime — confirm it is still required on this machine.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

In [3]:
def get_stock_initial():
    """Open Chrome on the TWSE broker-trade query page and build the OCR engine.

    Chrome is started with notifications, infobars and extensions disabled and
    with its download preferences pointing at the fixed temp folder configured
    below, so CSV downloads are saved there without a prompt.

    Returns:
        tuple: ``(driver, ocr)`` where ``driver`` is the Selenium Chrome
        WebDriver already navigated to the query page and ``ocr`` is the
        PaddleOCR instance used to read the CAPTCHA.

    Raises:
        Exception: anything raised while loading the page or constructing the
        OCR engine is re-raised after the freshly started browser is closed.
    """
    options = Options()
    for chrome_flag in ("--disable-notifications",
                        "--disable-infobars",
                        "--disable-extensions"):
        options.add_argument(chrome_flag)
    options.add_experimental_option("prefs", {
        "download.default_directory": "f:\\dealer_temp\\",
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })

    # Start the Selenium driver (webdriver_manager resolves the chromedriver binary).
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)

    # Target page.
    url = 'https://bsr.twse.com.tw/bshtm/bsMenu.aspx'

    try:
        # Open the page.
        driver.get(url)

        # NOTE(review): every detection threshold is zeroed — presumably so the
        # detector never filters out the small CAPTCHA text; confirm before tuning.
        ocr = PaddleOCR(use_angle_cls=True,lang='en',det_db_thresh=0, det_db_box_thresh=0,det_db_unclip_ratio=0,det_db_score_mode='fast',use_gpu=True,show_log=True)
    except Exception:
        # The browser is already running at this point; close it so a failed
        # initialisation does not leave an orphaned Chrome process behind.
        driver.quit()
        raise

    return driver,ocr

In [4]:
def get_stock_dealer(stock_id,driver,ocr):
    """Download one stock's broker-branch CSV through the already-open TWSE page.

    The function reads the CAPTCHA image shown on the page, recognises it with
    ``ocr``, fills in the CAPTCHA and stock-code inputs, submits the form,
    clicks the CSV download link, waits (up to 60 s) for ``<stock_id>.csv`` to
    appear in the download folder and then moves every regular file found in
    that folder into the dated archive folder built from the module-level
    ``formatted_date``.

    Args:
        stock_id (str): Stock code typed into the query form; also the expected
            base name of the downloaded CSV.
        driver: Selenium WebDriver currently showing the query page.
        ocr: Object whose ``ocr(path)`` method returns PaddleOCR-style results.

    Returns:
        bool: ``True`` when the CSV was archived, or when the form was
        submitted and the page's error label reads '查無資料' (no data for this
        stock). ``False`` on any other failure so that the caller can retry.

    Exceptions derived from ``Exception`` are caught and reported, not raised.
    """
    # One path for both saving and reading the CAPTCHA snapshot (the original
    # spelled it twice, once with an invalid "\c" escape sequence).
    captcha_path = 'd:\\captcha.png'
    OKFlag = False
    # Set once the form has been posted. The page is reused across calls, so
    # the error label may still show the previous stock's message until then.
    submitted = False
    try:
        # Locate the CAPTCHA image on the page.
        captcha_image_element = driver.find_element(By.XPATH, "//*[@id='Panel_bshtm']/table/tbody/tr/td/table/tbody/tr[1]/td/div/div[1]/img")

        # Fetch the CAPTCHA image by URL. The timeout keeps a stalled request
        # from hanging the whole run; HTTP errors are raised explicitly.
        # NOTE(review): this request does not carry the browser session —
        # confirm the server serves the same image for the same URL.
        captcha_image_url = captcha_image_element.get_attribute("src")
        response = requests.get(captcha_image_url, timeout=10)
        response.raise_for_status()
        time.sleep(0.1)

        # Save the CAPTCHA as a PNG for the OCR engine, closing the image afterwards.
        with Image.open(BytesIO(response.content)) as img:
            img.save(captcha_path)

        # --- PaddleOCR recognition ---
        tStart = time.time()
        result = ocr.ocr(captcha_path)
        if not result or not result[0]:
            # Nothing was detected; fail with a clear message so the caller retries.
            raise ValueError("OCR returned no text for the CAPTCHA image")
        # First detected line -> (text, score) pair -> text.
        captcha_answer=result[0][0][1][0]
        print("CAPTCHA 解答：", captcha_answer)
        tEnd = time.time()
        print('OCR結束花費:'+str(tEnd - tStart))

        # Type the answer into the CAPTCHA input (cleared first, like the stock-code box).
        captcha_input = driver.find_element(By.XPATH, "//*[@id='Panel_bshtm']/table/tbody/tr/td/table/tbody/tr[1]/td/div/div[2]/input")
        captcha_input.clear()
        captcha_input.send_keys(captcha_answer)

        # Type the stock code.
        Stkno_input = driver.find_element(By.XPATH, "//*[@id='TextBox_Stkno']")
        Stkno_input.clear()
        Stkno_input.send_keys(stock_id)

        # Submit the form.
        submit_button = driver.find_element(By.XPATH, "//*[@id='btnOK']")
        submit_button.click()
        submitted = True

        # Short pause for the result page, then trigger the CSV download.
        time.sleep(0.1)
        csv_button = driver.find_element(By.XPATH, "//*[@id='HyperLink_DownloadCSV']")
        csv_button.click()
        print("CAPTCHA 解答成功")

        # Download (source) folder and dated archive (target) folder.
        source_folder = "f:\\dealer_temp\\"
        target_folder = "f:\\文義\\股票資料\\股票歷史資料\\個股卷商分點資料\\"+formatted_date+'\\'

        # Create the archive folder if it does not exist yet.
        os.makedirs(target_folder, exist_ok=True)

        # Wait for the file to appear in the download directory.
        csv_time = time.time()
        expected_csv = os.path.join(source_folder, stock_id+'.csv')
        while not os.path.exists(expected_csv):
            if time.time() - csv_time > 60:
                raise TimeoutError("File not found in download directory")
            time.sleep(0.05)

        # Move every regular file (sub-folders are skipped) into the archive folder.
        for filename in os.listdir(source_folder):
            source_path = os.path.join(source_folder, filename)
            target_path = os.path.join(target_folder, filename)
            if os.path.isfile(source_path):
                shutil.move(source_path, target_path)
                print(f"已移動: {filename}")

        OKFlag=True
        return OKFlag
    except Exception as e:
        # Only the failing line number is printed, as in the original.
        print('get_stock_dealer error:'+str(e.__traceback__.tb_lineno))
        try:
            ErrorMsg=driver.find_element(By.XPATH, "//*[@id='Label_ErrorMsg']").text
        except Exception as lookup_error:
            # A dead browser or a missing label must not escape this function:
            # returning False lets the caller's retry/restart logic take over.
            print('get_stock_dealer error: cannot read Label_ErrorMsg: '+type(lookup_error).__name__)
            return OKFlag
        print(ErrorMsg)
        time.sleep(0.2)
        # Accept "no data" only for a query that was actually submitted in this
        # call; before submission the label belongs to an earlier query.
        if submitted and ErrorMsg=='查無資料':
            OKFlag=True

        return OKFlag

In [5]:
# Authentication
from fugle_marketdata import RestClient
# The Fugle API key is a credential and must not live in the notebook: it is
# read from the FUGLE_API_KEY environment variable instead. A key that was
# previously committed in this cell should be treated as leaked and rotated.
fugle_api_key = os.environ.get('FUGLE_API_KEY')
if not fugle_api_key:
    raise RuntimeError('Set the FUGLE_API_KEY environment variable to your Fugle market-data API key')
client = RestClient(api_key=fugle_api_key)
stock = client.stock
# Fetch the list of TWSE equity tickers.
StockID_dic=stock.intraday.tickers(type='EQUITY', exchange="TWSE", isNormal=True)

# Initialise the web driver and the OCR model.
driver,ocr=get_stock_initial()

# Download (source) folder and dated archive (target) folder.
source_folder = "f:\\dealer_temp\\"
target_folder = "f:\\文義\\股票資料\\股票歷史資料\\個股卷商分點資料\\"+formatted_date+'\\'

# Number of consecutive failed attempts tolerated before the browser is restarted.
MAX_CONSECUTIVE_ERRORS = 10

errcount=0
try:
    for ticker in StockID_dic['data']:
        stock_id=ticker['symbol']

        # Skip stocks whose CSV is already in today's archive folder.
        filename=stock_id+'.csv'
        target_path = os.path.join(target_folder, filename)
        if os.path.exists(target_path):
            continue

        # Keep trying this stock until get_stock_dealer reports success.
        OKFlag=False
        tStart = time.time()
        while not OKFlag:
            OKFlag=get_stock_dealer(stock_id,driver,ocr)
            if OKFlag:
                # Success: restart the consecutive-error count.
                errcount=0
                continue

            errcount=errcount+1
            if errcount>MAX_CONSECUTIVE_ERRORS:
                # Too many failures in a row: replace the browser session.
                # The old browser is closed first so Chrome processes do not
                # pile up, and the counter is reset so the restart happens once
                # per streak instead of on every following failure.
                try:
                    driver.quit()
                except Exception as quit_error:
                    print('driver.quit failed:'+type(quit_error).__name__)
                driver,ocr=get_stock_initial()
                errcount=0

        tEnd = time.time()
        print('結束花費:'+str(tEnd - tStart))
finally:
    # Always release the browser, even when the run is interrupted or fails.
    driver.quit()

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 15:41:50] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 15:44:21] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model



已移動: 2015.csv


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 15:44:33] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 16:31:45] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 16:31:58] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 16:32:12] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model



已移動: 0061.csv


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\Administrator\.wdm\drivers\chromedriver\win64\131.0.6778.264\chromedriver.exe] found in cache


[2025/01/24 16:32:25] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0, det_db_score_mode='fast', det_db_thresh=0, det_db_unclip_ratio=0, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\Administrator/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model