In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Information about the target webpage: where to fetch it and which HTML
# element ids hold the values we want to record.
gym_webpage = {
    'url'               : 'https://swimpool.nctu.edu.tw/NCTUGym/index.php/anchor/fitness',  # url
    'id_counter'        : 'gym-counter',                                                    # id of the counter
    'id_update_time'    : 'gym-update'                                                      # id of the update time
}

# Column headers of the output csv, in the order the values are written.
columns = ['Request time', 'Counter', 'Last update time']

# Build the directory for the output files. `exist_ok=True` makes this cell
# safe to re-run and avoids the check-then-create race of testing
# `os.path.exists` first.
dir_path                = os.path.join(os.getcwd(), 'csv_files')
os.makedirs(dir_path, exist_ok=True)

# parameters
delay_between_requests  = 50    # wait for 50 seconds after every request
delay_fetch             = 5     # wait for 5 seconds to fetch the webpage
end_hour                = 22    # stop the crawling program once the local hour reaches 22 (22:00)

In [2]:
# Crawling the web-page
#
# Poll the gym page until the local hour reaches `end_hour`. Each pass opens a
# fresh Chrome session, reads the counter and the last-update text by element
# id, prints them, and appends one row to a per-day csv file under `dir_path`.

cur_time = time.localtime(time.time())                              # current time

while cur_time.tm_hour < end_hour:
    # format strings of current time (zero-padded, e.g. 2023-03-03 / 20:48:12)
    tm_date = time.strftime('%Y-%m-%d', cur_time)
    tm_time = time.strftime('%H:%M:%S', cur_time)
    tm = tm_date + ' ' + tm_time
    print(f'Attemp to fetch the webpage at: {tm_date} {tm_time}')

    # open Chrome driver to get the webpage
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install())
    )
    try:
        driver.get(gym_webpage['url'])
        time.sleep(delay_fetch)                                     # wait for the browser to get the webpage

        # parse the data by id of html elements; `.text` yields plain
        # strings, so nothing below needs the browser to stay open
        gym_counter = driver.find_element('id', gym_webpage['id_counter']).text
        gym_update_time = driver.find_element('id', gym_webpage['id_update_time']).text
    finally:
        # Always end the session, even when loading or parsing fails.
        # quit() (rather than close()) also shuts down the driver process,
        # so repeated iterations do not pile up leftover browsers.
        driver.quit()

    # print results out
    print()
    print('Successfully received:')
    print('------------------------------------------------------------')
    print(f'| Counter          \t: {gym_counter}')
    print(f'| Last update time \t: {gym_update_time}')
    print('------------------------------------------------------------')

    # write results to csv: one row per request, appended to the file of the day
    df = pd.DataFrame([[tm, gym_counter, gym_update_time]], columns=columns)
    file_path = os.path.join(dir_path, f'output_{tm_date}.csv')
    df.to_csv(
        file_path, mode='a', index=False,                           # append mode
        header=not os.path.exists(file_path)                        # header only when the file is new
    )

    time.sleep(delay_between_requests)                              # wait for a while before the next request
    cur_time = time.localtime(time.time())                          # reset the current time

Attemp to fetch the webpage at: 2023-03-03 20:48:12

Successfully received:
------------------------------------------------------------
| Counter          	: 45
| Last update time 	: 2023-03-03 20:05:37
------------------------------------------------------------
Attemp to fetch the webpage at: 2023-03-03 20:49:17

Successfully received:
------------------------------------------------------------
| Counter          	: 45
| Last update time 	: 2023-03-03 20:05:37
------------------------------------------------------------
Attemp to fetch the webpage at: 2023-03-03 20:50:22

Successfully received:
------------------------------------------------------------
| Counter          	: 45
| Last update time 	: 2023-03-03 20:05:37
------------------------------------------------------------
Attemp to fetch the webpage at: 2023-03-03 20:51:30

Successfully received:
------------------------------------------------------------
| Counter          	: 45
| Last update time 	: 2023-03-03 20:05:37
-