<a href="https://colab.research.google.com/github/OscarLoOscar/python_Data_Science/blob/main/horse_racing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install selenium

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import shlex
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import time



In [9]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [23]:
class HKJCWebScraper:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--remote-debugging-port=9222")
        self.browser = webdriver.Chrome(options=chrome_options)
        self.base_url = 'https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate='

    def get_dates(self):
        self.browser.get(self.base_url + '2024/05/29')  # Example date to access the page
        try:
            # Wait for the select element to be present
            select_element = WebDriverWait(self.browser, 20).until(
                EC.presence_of_element_located((By.ID, "selectId"))
            )
            options = WebDriverWait(self.browser, 20).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "option"))
            )
            dates = [option.get_attribute("value") for option in options]
            return dates
        except Exception as e:
            print(f"An error occurred while fetching dates: {e}")
            return []

    def scrape_data_for_date(self, date):
        self.browser.get(self.base_url + date)
        time.sleep(3)  # Wait for page to load
        soup = BeautifulSoup(self.browser.page_source, 'html.parser')

        # Extract data from race_tab
        race_tab_data = []
        race_tab = soup.find('table', class_='race_tab')
        if race_tab:
            rows = race_tab.find_all('tr')[1:]  # Skip the header row
            for row in rows:
                race_tab_data.append([cell.get_text(strip=True) for cell in row.find_all('td')])

        # Extract data from performance table
        performance_data = []
        performance_table = soup.find('table', class_='f_tac table_bd draggable')
        if performance_table:
            rows = performance_table.find_all('tr')
            for row in rows:
                performance_data.append([cell.get_text(strip=True) for cell in row.find_all('td')])

        # Extract data from dividend_tab
        dividend_data = []
        dividend_table = soup.find('table', class_='table_bd f_tac f_fs13 f_fl')
        if dividend_table:
            rows = dividend_table.find_all('tr')
            for row in rows:
                dividend_data.append([cell.get_text(strip=True) for cell in row.find_all('td')])

        return race_tab_data, performance_data, dividend_data

    def close(self):
        self.browser.quit()

scraper = HKJCWebScraper()


In [24]:
all_data = []
dates = scraper.get_dates()
for date in dates:
    print(f"Scraping data for {date}")
    try:
        data = scraper.scrape_data_for_date(date)
        all_data.append({date: data})
    except Exception as e:
        print(f"Failed to scrape data for {date}: {e}")


Scraping data for 02/06/2024
Scraping data for 01/06/2024
Scraping data for 29/05/2024
Scraping data for 26/05/2024
Scraping data for 22/05/2024
Scraping data for 19/05/2024
Scraping data for 18/05/2024
Scraping data for 15/05/2024
Scraping data for 12/05/2024
Scraping data for 11/05/2024
Scraping data for 08/05/2024
Scraping data for 05/05/2024
Scraping data for 04/05/2024
Scraping data for 01/05/2024
Scraping data for 28/04/2024
Scraping data for 24/04/2024
Scraping data for 20/04/2024
Scraping data for 17/04/2024
Scraping data for 14/04/2024
Scraping data for 13/04/2024
Scraping data for 10/04/2024
Scraping data for 07/04/2024
Scraping data for 06/04/2024
Scraping data for 03/04/2024
Scraping data for 31/03/2024
Scraping data for 30/03/2024
Scraping data for 27/03/2024
Scraping data for 24/03/2024
Scraping data for 20/03/2024
Scraping data for 16/03/2024
Scraping data for 13/03/2024
Scraping data for 10/03/2024
Scraping data for 06/03/2024
Scraping data for 03/03/2024
Scraping data 

In [25]:
input_data = [
    [
        '30/10/2022',       # Date
        '星期日',            # Day of the week
        '16:00',            # Time
        '第五班',            # Class
        '2000米',            # Distance
        '草地',              # Track
        '沙田海讓賽',        # Race name
        'B',                # Bet calculator
        '每注金額 $',        # Bet amount
        # Horse information
        [1, 'H486', '竣誠駒', 12, 135, '鍾易禮', '告東尼', '', '', ''],
        [2, 'H083', '滿載歸來', 13, 133, '布文', '容天鵬', '', '', ''],
        [3, 'E166', '樂天派', 3, 132, '董明朗', '大衛希斯', '', '', ''],
        [4, 'E025', '怡昌勇士', 6, 131, '田泰安', '賀賢', '', '', ''],
        [5, 'G295', '神舟飛駒', 9, 129, '班德禮', '韋達', '', '', ''],
        [6, 'H196', '上市魅力', 5, 129, '艾兆禮', '蘇偉賢', '', '', ''],
        [7, 'G322', '國大合', 1, 129, '希威森', '廖康銘', '', '', ''],
        [8, 'D235', '爸巴閉', 8, 123, '湯普新', '徐雨石', '', '', ''],
        [9, 'E194', '符號', 2, 122, '巴度', '徐雨石', '', '', ''],
        [10, 'G072', '喜悅一生', 4, 121, '巫顯東', '鄭俊偉', '', '', ''],
        [11, 'H040', '小鳥', 10, 117, '蔡明紹', '葉楚航', '', '', ''],
        [12, 'H285', '鑽石福將', 11, 115, '潘明輝', '蔡約翰', '', '', ''],
        [13, 'E409', '綠登', 7, 115, '楊明綸', '蘇偉賢', '', '', '']
    ],
    # Add more race data as needed
]

In [26]:
# Assuming input_data is structured and ready for analysis
# Placeholder for analysis code
def analyze_data(all_data, input_data):
    # Print the input data for reference
    print("Input Data:")
    for race in input_data:
        print(race)
    # Implement your analysis logic here
    pass

# Call the analyze_data function with all_data and input_data
analyze_data(all_data, input_data)


Input Data:
['30/10/2022', '星期日', '16:00', '第五班', '2000米', '草地', '沙田海讓賽', 'B', '每注金額 $', [1, 'H486', '竣誠駒', 12, 135, '鍾易禮', '告東尼', '', '', ''], [2, 'H083', '滿載歸來', 13, 133, '布文', '容天鵬', '', '', ''], [3, 'E166', '樂天派', 3, 132, '董明朗', '大衛希斯', '', '', ''], [4, 'E025', '怡昌勇士', 6, 131, '田泰安', '賀賢', '', '', ''], [5, 'G295', '神舟飛駒', 9, 129, '班德禮', '韋達', '', '', ''], [6, 'H196', '上市魅力', 5, 129, '艾兆禮', '蘇偉賢', '', '', ''], [7, 'G322', '國大合', 1, 129, '希威森', '廖康銘', '', '', ''], [8, 'D235', '爸巴閉', 8, 123, '湯普新', '徐雨石', '', '', ''], [9, 'E194', '符號', 2, 122, '巴度', '徐雨石', '', '', ''], [10, 'G072', '喜悅一生', 4, 121, '巫顯東', '鄭俊偉', '', '', ''], [11, 'H040', '小鳥', 10, 117, '蔡明紹', '葉楚航', '', '', ''], [12, 'H285', '鑽石福將', 11, 115, '潘明輝', '蔡約翰', '', '', ''], [13, 'E409', '綠登', 7, 115, '楊明綸', '蘇偉賢', '', '', '']]


In [40]:
all_data_df = pd.DataFrame(all_data)
print(all_data);

[{'02/06/2024': ([], [['名次', '馬號', '馬名', '騎師', '練馬師', '實際負磅', '排位體重', '檔位', '頭馬距離', '沿途走位', '完成時間', '獨贏賠率'], ['1', '4', '魅影獵飛(G317)', '田泰安', '姚本輝', '119', '1147', '5', '-', '5551', '1:40.13', '1.7'], ['2', '5', '黃腳鱲(G326)', '希威森', '呂健威', '119', '1210', '4', '1-1/4', '4422', '1:40.33', '3.5'], ['3', '1', '電訊巴打(D482)', '布文', '徐雨石', '135', '1115', '1', '1-3/4', '1113', '1:40.43', '6.8'], ['4', '2', '魅力寶駒(G213)', '巴度', '蔡約翰', '133', '1182', '3', '2-3/4', '3334', '1:40.59', '7.3'], ['5', '3', '保羅承傳(C517)', '潘明輝', '羅富全', '130', '1101', '2', '3-3/4', '2245', '1:40.73', '15']], [['派彩'], ['彩池', '勝出組合', '派彩 (HK$)'], ['獨贏', '4', '17.50'], ['位置', '4', '13.00'], ['5', '16.50'], ['連贏', '4,5', '21.50'], ['二重彩', '4,5', '37.00'], ['三重彩', '4,5,1', '66.00'], ['單T', '1,4,5', '27.00']])}, {'01/06/2024': ([], [], [])}, {'29/05/2024': ([], [['名次', '馬號', '馬名', '騎師', '練馬師', '實際負磅', '排位體重', '檔位', '頭馬距離', '沿途走位', '完成時間', '獨贏賠率'], ['1', '4', '魅影獵飛(G317)', '田泰安', '姚本輝', '119', '1147', '5', '-', '5551', '1:40.13', 

In [42]:
import pandas as pd
import numpy as np

# 1. 提取并转换数据为DataFrame
data = []
for race_day in all_data:
    for date, (race_tab, performance_data, dividend_data) in race_day.items():
        for race in performance_data[1:]:  # 跳过标题行
            if len(race) == 12:  # 确保行中有12个元素
                data.append({
                    '日期': date,
                    '名次': race[0],
                    '馬號': race[1],
                    '馬名': race[2],
                    '騎師': race[3],
                    '練馬師': race[4],
                    '實際負磅': race[5],
                    '排位體重': race[6],
                    '檔位': race[7],
                    '頭馬距離': race[8],
                    '沿途走位': race[9],
                    '完成時間': race[10],
                    '獨贏賠率': race[11]
                })

df = pd.DataFrame(data)

df['排位體重'] = pd.to_numeric(df['排位體重'], errors='coerce')
df['完成時間'] = pd.to_numeric(df['完成時間'].str.replace(':', ''), errors='coerce')


# 2. 分组并分析数据
grouped_data = df.groupby(['馬名', '騎師', '實際負磅', '排位體重', '檔位'])
# print(grouped_data)
average_completion_time = grouped_data['完成時間'].mean()

# 打印结果
# print(average_completion_time)

# 其他分析操作，例如：
# 获取每个组的计数
group_counts = grouped_data.size()
# print(group_counts)

# 获取独赢赔率的平均值
df['獨贏賠率'] = pd.to_numeric(df['獨贏賠率'], errors='coerce')
average_odds = grouped_data['獨贏賠率'].mean()
# print(average_odds)

# 您可以根据需要继续添加其他分析代码。


馬名          騎師   實際負磅  排位體重    檔位
一支箭(H276)   田泰安  128   1007.0  7       4.3
            霍宏聲  123   1013.0  6      15.0
一澤千金(G293)  潘明輝  120   953.0   9      27.0
                       967.0   10    114.0
            黃皓楠  128   960.0   13    138.0
                                     ...  
龍之心(G469)   蔡明紹  126   1244.0  12     16.0
龍城金將(H413)  周俊樂  123   1164.0  3      24.0
龍東傳承(E158)  蔡明紹  134   1227.0  8      20.0
龍的風采(H029)  布文   131   1079.0  1       8.3
龍船快(G229)   何澤堯  135   1109.0  8      11.0
Name: 獨贏賠率, Length: 1898, dtype: float64


In [47]:
import pandas as pd

# 示例的 input_data
input_data = [
    [
        '30/10/2022',  # Date
        '星期日',  # Day of the week
        '16:00',  # Time
        '第五班',  # Class
        '2000米',  # Distance
        '草地',  # Track
        '沙田海讓賽',  # Race name
        'B',  # Bet calculator
        '每注金額 $',  # Bet amount
        # Horse information
        [1, 'H486', '竣誠駒', 12, 135, '鍾易禮', '告東尼', '', '', ''],
        [2, 'H083', '滿載歸來', 13, 133, '布文', '容天鵬', '', '', ''],
        [3, 'E166', '樂天派', 3, 132, '董明朗', '大衛希斯', '', '', ''],
        [4, 'E025', '怡昌勇士', 6, 131, '田泰安', '賀賢', '', '', ''],
        [5, 'G295', '神舟飛駒', 9, 129, '班德禮', '韋達', '', '', ''],
        [6, 'H196', '上市魅力', 5, 129, '艾兆禮', '蘇偉賢', '', '', ''],
        [7, 'G322', '國大合', 1, 129, '希威森', '廖康銘', '', '', ''],
        [8, 'D235', '爸巴閉', 8, 123, '湯普新', '徐雨石', '', '', ''],
        [9, 'E194', '符號', 2, 122, '巴度', '徐雨石', '', '', ''],
        [10, 'G072', '喜悅一生', 4, 121, '巫顯東', '鄭俊偉', '', '', ''],
        [11, 'H040', '小鳥', 10, 117, '蔡明紹', '葉楚航', '', '', ''],
        [12, 'H285', '鑽石福將', 11, 115, '潘明輝', '蔡約翰', '', '', ''],
        [13, 'E409', '綠登', 7, 115, '楊明綸', '蘇偉賢', '', '', '']
    ]
    # 可以添加更多的比赛数据
]

# 1. 提取并转换数据为DataFrame
data = []
for race in input_data:
    date = race[0]
    for horse in race[9:]:  # 提取每匹马的信息
        data.append({
            '日期': date,
            '名次': horse[0],
            '馬號': horse[1],
            '馬名': horse[2],
            '騎師': horse[5],
            '練馬師': horse[6],
            '實際負磅': horse[4],
            '排位體重': horse[3],
            '檔位': horse[7]
        })

df = pd.DataFrame(data)

# 2. 计算每匹马的总参赛次数和获胜次数
total_races = df.groupby('馬名').size()
print(total_races)
wins = df[df['名次'] == 1].groupby('馬名').size()
print(wins)

# 3. 计算每匹马的胜率
win_rate = (wins / total_races).fillna(0)  # 填充NaN为0，表示无获胜记录

# 4. 找出胜率最高的4匹马
top_4_horses = win_rate.sort_values(ascending=False).head(4)

# 打印结果
print("胜率最高的4匹马:")
print(top_4_horses)

# 将结果转换为DataFrame并打印
result_df = top_4_horses.reset_index()
result_df.columns = ['馬名', '勝率']
print(result_df)


馬名
上市魅力    1
喜悅一生    1
國大合     1
小鳥      1
怡昌勇士    1
樂天派     1
滿載歸來    1
爸巴閉     1
神舟飛駒    1
竣誠駒     1
符號      1
綠登      1
鑽石福將    1
dtype: int64
馬名
竣誠駒    1
dtype: int64
胜率最高的4匹马:
馬名
竣誠駒     1.0
上市魅力    0.0
喜悅一生    0.0
國大合     0.0
dtype: float64
     馬名   勝率
0   竣誠駒  1.0
1  上市魅力  0.0
2  喜悅一生  0.0
3   國大合  0.0


In [46]:
input_data = [
     [
        '02/06/2024',  # Date
        '星期日',  # Day of the week
        '16:30',  # Time
        '第二班',  # Class
        '1400米',  # Distance
        '草地',  # Track
        '彭福公園讓賽',  # Race name
        'B',  # Bet calculator
        '每注金額 $',  # Bet amount
        # Horse information
        [1, 'G227', '一先生', 3, 135, '班德禮', '姚本輝', '', '', ''],
        [2, 'G455', '傑出漢子', 2, 127, '希威森', '呂健威', '', '', ''],
        [3, 'G457', '旺旺神駒', 1, 126, '潘明輝', '沈集成', '', '', ''],
        [4, 'G356', '駿馬快車', 5, 123, '艾道拿', '姚本輝', '', '', ''],
        [5, 'G335', '巴閉哥', 4, 122, '何澤堯', '呂健威', '', '', ''],
        [6, 'G306', '瑪瑙', 6, 119, '田泰安', '羅富全', '', '', '']
    ]
    # 可以添加更多的比赛数据
]

# 1. 提取并转换数据为DataFrame
data = []
for race in input_data:
    date = race[0]
    for horse in race[9:]:  # 提取每匹马的信息
        data.append({
            '日期': date,
            '名次': horse[0],
            '馬號': horse[1],
            '馬名': horse[2],
            '騎師': horse[5],
            '練馬師': horse[6],
            '實際負磅': horse[4],
            '排位體重': horse[3],
            '檔位': horse[7]
        })

df = pd.DataFrame(data)

# 2. 计算每匹马的总参赛次数和获胜次数
total_races = df.groupby('馬名').size()
wins = df[df['名次'] == 1].groupby('馬名').size()

# 3. 计算每匹马的胜率
win_rate = (wins / total_races).fillna(0)  # 填充NaN为0，表示无获胜记录

# 4. 找出胜率最高的4匹马
top_4_horses = win_rate.sort_values(ascending=False).head(4)

# 打印结果
print("胜率最高的4匹马:")
print(top_4_horses)

# 将结果转换为DataFrame并打印
result_df = top_4_horses.reset_index()
result_df.columns = ['馬名', '勝率']
print(result_df)

胜率最高的4匹马:
馬名
一先生     1.0
傑出漢子    0.0
巴閉哥     0.0
旺旺神駒    0.0
dtype: float64
     馬名   勝率
0   一先生  1.0
1  傑出漢子  0.0
2   巴閉哥  0.0
3  旺旺神駒  0.0
