In [1]:
"""
Pick the ChromeDriver build that matches your installed Chrome version.
For Chrome 123-125:
https://googlechromelabs.github.io/chrome-for-testing/
For Chrome 122:
https://storage.googleapis.com/chrome-for-testing-public/122.0.6261.57/win64/chromedriver-win64.zip
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chromium.webdriver import ChromiumDriver
import pandas as pd
import json, time, pickle

# Listing page that get_links() walks through page by page
URL = 'https://racing.turfclub.com.sg/en/horse-performance/'
# Browser session shared by the cell below; Chrome is used, but any Selenium driver works
driver = webdriver.Chrome()
# Explicit wait bound to that session, giving up after 10 seconds
wait = WebDriverWait(driver, timeout=10)
def get_links(driver:ChromiumDriver = driver, wait:WebDriverWait=wait, *, json_file:bool = False, save_df:bool = False, verbose:bool=False) -> pd.DataFrame:
    """Collect every horse name and its profile link from the paginated listing at ``URL``.

    The function loads ``URL``, reads the name/link anchors on each page of the
    table and clicks the "next" button until that button is marked disabled.

    Args:
        driver: Selenium driver used for the scrape. It is ALWAYS quit before this
            function returns or raises, so the module-level default (bound once,
            when the function is defined) can only be used for a single call;
            pass a fresh driver for any later call.
        wait: Explicit wait bound to ``driver``.
        json_file: When true, write the name -> link mapping to ``horse_data.json``.
        save_df: When true, pickle the resulting DataFrame to ``horse_data.pickle``.
        verbose: When true, print every name/link pair.

    Returns:
        DataFrame with the columns ``'Horse Name'`` and ``'Horse Links'``, one row
        per distinct horse name (a repeated name keeps the last link seen).

    Raises:
        selenium.common.exceptions.TimeoutException: propagated from ``wait`` when
            the expected elements do not appear in time.
    """
    horses = {}
    try:
        driver.get(URL)
        while True:  # page traversal
            # The table is filled in asynchronously; give it time to settle before reading.
            time.sleep(3)
            all_links = wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '.odd td.sorting_1 a, .even td.sorting_1 a')
                )
            )
            for link in all_links:
                try:
                    horse_name = link.text.strip()
                    horse_link = link.get_attribute('href')
                except StaleElementReferenceException:
                    # The row was re-rendered while we were reading it; skip it.
                    continue
                if horse_name:
                    horses[horse_name] = horse_link

            # A compound class list needs a CSS selector; By.CLASS_NAME is meant for one class.
            if driver.find_elements(By.CSS_SELECTOR, '.paginate_button.next.disabled'):
                break  # last page reached
            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#DataTables_Table_0_next'))).click()
    finally:
        # Always release the browser, including on errors and on Ctrl-C / kernel interrupt.
        try:
            driver.delete_all_cookies()
        finally:
            driver.quit()

    if verbose:
        for horse_name, horse_link in horses.items():
            print(horse_name, horse_link)

    # Write data to JSON file
    if json_file:
        with open('horse_data.json', 'w') as json_f:
            json.dump(horses, json_f, indent=4)

    # Build the DataFrame from two plain lists so both columns have the same type.
    horses_df = pd.DataFrame({'Horse Name': list(horses.keys()), 'Horse Links': list(horses.values())})
    if save_df:
        with open('horse_data.pickle', 'wb') as f:
            pickle.dump(horses_df, f)  # store the pickle file for future use

    return horses_df

# Run the scrape with every output enabled: console listing, JSON file and pickle.
if __name__ == '__main__':
    horses_df = get_links(
        json_file=True,
        save_df=True,
        verbose=True,
    )

KeyboardInterrupt: 

In [None]:
# Reload the horse name -> profile link mapping stored in horse_data.json
with open('horse_data.json') as json_f:
    horse_data:dict = json.load(json_f)


In [None]:
"""

Predictors:
    Bar: Barrier
    C.Wt: Carried Weight
    Dist: Distance
    H.Wt: Horse Weight
    G: Going
    T: Track
    Jockey: Jockey
    Trainer: Trainer
    Finish Time: Finish Time

    Date Foaled: Age
"""
from selenium.webdriver.chrome.options import Options

# Configure Chrome options for headless mode.
# The `Options.headless` setter was deprecated in Selenium 4.8 and dropped in later
# 4.x releases, where assigning it no longer enables headless mode. Passing the
# Chrome flag directly works on both old and new Selenium versions.
options = Options()
options.add_argument('--headless=new')

# Zero-based <td> position of each predictor inside a race-history row.
PREDICTOR_COLUMNS = {
    'Barrier': 8,
    'Carried Weight': 9,
    'Distance': 5,
    'Rating': 4,
    'Horse Weight': 10,
    'Going': 7,
    'Track': 6,
    'Jockey': 16,
    'Trainer': 17,
    'Finish Time': 13,
}
# Rows with fewer cells than this are skipped (the highest index read is 17).
MIN_CELLS = 18

# Set up Selenium WebDriver with Chrome
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 10)  # Using explicit wait with a timeout of 10 seconds
df_list = []  # one DataFrame per horse that yielded at least one row

try:
    # Open the JSON-lines output once (append mode) instead of once per row.
    with open('horse_profiles.json', 'a') as f:
        # Visit every horse profile collected earlier (name -> profile URL).
        for name, url in horse_data.items():
            driver.get(url)
            horse_rows = []

            while True:  # page traversal
                # Wait until a table is present on the page.
                wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
                # The table body is filled in asynchronously; give it time to settle.
                time.sleep(4)
                all_rows = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.odd, .even'))
                )
                for row in all_rows:
                    try:
                        cells = row.find_elements(By.TAG_NAME, 'td')
                        if len(cells) < MIN_CELLS:
                            continue  # not a full race-history row
                        row_data = {
                            label: cells[position].text.strip()
                            for label, position in PREDICTOR_COLUMNS.items()
                        }
                    except StaleElementReferenceException:
                        # The row was re-rendered while we were reading it; skip it.
                        continue
                    # One JSON object per line, keyed by horse name.
                    json.dump({name: row_data}, f)
                    f.write('\n')
                    horse_rows.append({'Horse Name': name, **row_data})

                # A compound class list needs a CSS selector; By.CLASS_NAME is meant for one class.
                if driver.find_elements(By.CSS_SELECTOR, '.paginate_button.next.disabled'):
                    break  # last page reached
                wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#DataTables_Table_0_next'))).click()

            f.flush()  # keep finished horses on disk even if a later one fails
            if horse_rows:
                df_list.append(pd.DataFrame(horse_rows))
finally:
    # Close the WebDriver even when scraping fails or is interrupted.
    try:
        driver.delete_all_cookies()
    finally:
        driver.quit()

# Concatenate all DataFrames in the list; pd.concat raises on an empty list, so
# fall back to an empty frame with the expected columns.
if df_list:
    df = pd.concat(df_list, ignore_index=True)
else:
    df = pd.DataFrame(columns=['Horse Name', *PREDICTOR_COLUMNS])

# Store the scraped profiles for future use. A separate file is used so the links
# pickle written by the first cell (horse_data.pickle) is not overwritten.
with open('horse_profiles.pickle', 'wb') as f:
    pickle.dump(df, f)