In [None]:
!pip install selenium

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import shlex
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from google.colab import drive
import numpy as np
import tensorflow as tf
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import re

In [None]:
# Mount Google Drive at /content/drive; later cells read and write the Excel
# file under '/content/drive/My Drive/'.
drive.mount('/content/drive')

In [None]:
class HKJCWebScraper:
    """Scrape HKJC local race result pages with a headless Chrome session.

    The instance owns a Selenium browser. Call :meth:`close` when finished, or
    use the instance as a context manager so the browser is shut down even if
    scraping raises::

        with HKJCWebScraper() as scraper:
            dates = scraper.get_dates()
    """

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--remote-debugging-port=9222")
        self.browser = webdriver.Chrome(options=chrome_options)
        self.base_url = 'https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate='

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def get_dates(self, reference_date='2024/05/29'):
        """Return the ``value`` of every option in the page's ``selectId`` dropdown.

        Args:
            reference_date: Date (``YYYY/MM/DD``) appended to the base URL; it
                is only used to reach a results page that carries the dropdown.

        Returns:
            A list of option values, or an empty list if anything goes wrong
            (the error is printed, not raised).
        """
        self.browser.get(self.base_url + reference_date)
        try:
            # Wait for the select element to be present
            select_element = WebDriverWait(self.browser, 20).until(
                EC.presence_of_element_located((By.ID, "selectId"))
            )
            # Read only the options inside that dropdown (not every <option>
            # on the page). An empty list is falsy, so the wait keeps polling
            # until the dropdown has been populated.
            options = WebDriverWait(self.browser, 20).until(
                lambda _driver: select_element.find_elements(By.TAG_NAME, "option")
            )
            return [option.get_attribute("value") for option in options]
        except Exception as e:
            # Deliberately best-effort: report and return nothing to scrape.
            print(f"An error occurred while fetching dates: {e}")
            return []

    @staticmethod
    def _table_rows(table, skip_header=False):
        """Return the stripped ``<td>`` texts of each ``<tr>`` in *table* ([] if None)."""
        if table is None:
            return []
        rows = table.find_all('tr')
        if skip_header:
            rows = rows[1:]
        return [
            [cell.get_text(strip=True) for cell in row.find_all('td')]
            for row in rows
        ]

    def scrape_data_for_date(self, date):
        """Load the results page for *date* and extract its two tables.

        Args:
            date: Date string appended to the base URL.

        Returns:
            ``(race_tab_data, performance_data)``: lists of rows, each row a
            list of cell texts. ``race_tab_data`` comes from the table with
            class ``race_tab`` (first row skipped); ``performance_data`` comes
            from the table with class ``f_tac table_bd draggable`` (all rows).
            A table that is not found yields an empty list.
        """
        self.browser.get(self.base_url + date)
        time.sleep(3)  # Wait for page to load
        soup = BeautifulSoup(self.browser.page_source, 'html.parser')

        race_tab_data = self._table_rows(
            soup.find('table', class_='race_tab'), skip_header=True
        )
        performance_data = self._table_rows(
            soup.find('table', class_='f_tac table_bd draggable')
        )

        # Dividend table extraction is currently disabled:
        # dividend_data = self._table_rows(
        #     soup.find('table', class_='table_bd f_tac f_fs13 f_fl'))
        # return race_tab_data, performance_data, dividend_data
        return race_tab_data, performance_data

    def close(self):
        """Quit the browser session."""
        self.browser.quit()

# Shared scraper instance; constructing it starts the headless Chrome session.
scraper = HKJCWebScraper()


In [None]:
# all_data = []
# dates = scraper.get_dates()
# for date in dates:
#     print(f"Scraping data for {date}")
#     try:
#         data = scraper.scrape_data_for_date(date)
#         all_data.append({date: data})
#     except Exception as e:
#         print(f"Failed to scrape data for {date}: {e}")

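# NOTE: `all_data` is a list of {date: (race_tab_data, performance_data)} dicts,
# not a DataFrame, so it has no `.to_excel`; convert it with `parse_data` first
# and save the resulting DataFrame instead of using the two lines below.
# file_path = '/content/drive/My Drive/horse_data.xlsx'
# all_data.to_excel(file_path, index=False)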
def parse_data(all_data):
    """Flatten scraped race results into a DataFrame, one row per result line.

    Args:
        all_data: Iterable of mappings ``{date: (meta, race_results)}``.
            ``race_results`` is a list of rows whose first row holds the
            column names; ``meta`` is ignored.

    Returns:
        pandas.DataFrame keyed by the header names plus a ``date`` column.
        Dates with no results, and rows whose length differs from the header,
        are reported with ``print`` and skipped.
    """
    records = []
    for entry in all_data:
        for date, (_, results) in entry.items():
            if not results:  # nothing was scraped for this date
                print(f"No race results for {date}")
                continue

            header, *body = results
            for line in body:
                if len(line) == len(header):
                    record = dict(zip(header, line))
                    record['date'] = date
                    records.append(record)
                else:
                    print(f"Mismatch in columns and data for {date}: {line}")

    return pd.DataFrame(records)

# NOTE(review): `all_data` is only built by the commented-out scraping loop at
# the top of this cell; presumably that loop has to be run once beforehand.
df = parse_data(all_data)

# Save the parsed results as an Excel file on the mounted Drive
file_path = '/content/drive/My Drive/horse_data.xlsx'
df.to_excel(file_path, index=False)



In [None]:
# Load the previously saved results from Drive
file_path = '/content/drive/My Drive/horse_data.xlsx'
df = pd.read_excel(file_path)

# Show the first few rows
print(df.head())


In [None]:
def _finish_time_to_seconds(value):
    """Convert an ``m:ss.ss`` finish-time string to seconds (NaN if unparsable)."""
    if not isinstance(value, str):
        return np.nan
    parts = value.split(':')
    try:
        return float(parts[0]) * 60 + float(parts[1])
    except (ValueError, IndexError):
        # e.g. a placeholder without ':' for a horse that did not finish
        return np.nan


# Columns fed to the model, in the order the model expects them.
features = ['馬號', '馬名', '騎師', '練馬師', '實際負磅', '排位體重', '檔位', '完成時間', '獨贏賠率']
# Identifier-like columns that need an integer code.
categorical_columns = ['馬號', '馬名', '騎師', '練馬師']
# NOTE(review): finish time ('完成時間') and win odds ('獨贏賠率') are not part of
# the race card used for prediction later in this notebook - confirm that they
# should really be training features.

# Work on a copy so the column assignments below never write into a slice.
df = df.copy()

# Parse the finishing position and the numeric columns; anything that is not a
# number (withdrawn runners, placeholders, ...) becomes NaN.
df['名次'] = pd.to_numeric(df['名次'], errors='coerce')
for column in ['實際負磅', '排位體重', '檔位', '獨贏賠率']:
    df[column] = pd.to_numeric(df[column], errors='coerce')
df['完成時間'] = df['完成時間'].apply(_finish_time_to_seconds)

# Drop rows lacking any value the model needs. The result must be assigned
# back (dropna is not in-place), otherwise the integer casts below fail on NaN.
# Only the columns actually used are checked so unrelated empty cells do not
# discard otherwise usable rows.
df = df.dropna(subset=['名次'] + features).copy()

# Cast the whole-number columns to int
df['名次'] = df['名次'].astype(int)
df['實際負磅'] = df['實際負磅'].astype(int)
df['排位體重'] = df['排位體重'].astype(int)
df['檔位'] = df['檔位'].astype(int)
df['獨贏賠率'] = df['獨贏賠率'].astype(float)

# Integer-encode only the categorical columns. The numeric columns keep their
# real values: label-encoding them would replace each value by its rank among
# the distinct values and throw away the magnitude.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    # astype(str) keeps the labels a single type so the encoder can sort them
    df[column] = label_encoders[column].fit_transform(df[column].astype(str))

# Select features and label
X = df[features]
y = df['名次']

# Binarise the label: 1 if the horse finished in the first four, else 0
y = (y <= 4).astype(int)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# Build the feed-forward (back-propagation) network: five ReLU hidden layers
# narrowing from 512 to 32 units, and one sigmoid output for the binary label.
model = Sequential()
# Declare the input shape with an explicit Input object; passing `input_dim`
# to the first Dense layer is the legacy form that current Keras warns about.
model.add(tf.keras.Input(shape=(X_train.shape[1],)))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Alternative with tanh activations (disabled):
# model = Sequential()
# model.add(Dense(64, input_dim=X_train.shape[1], activation='tanh'))
# model.add(Dense(32, activation='tanh'))
# model.add(Dense(1, activation='sigmoid'))


optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
#   epochs: number of complete passes over the training set.
#   batch_size: samples per gradient update; each epoch is split into batches.
#   validation_split: fraction of the training data held out for validation;
#       0.2 means 20% is used to compute validation loss/metrics after each
#       epoch and the remaining 80% is used for training.
model.fit(X_train, y_train, epochs=2000, batch_size=50, validation_split=0.2)

# Evaluate on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy : ")
print(f'Accuracy: {accuracy}')

# Predict, then threshold the sigmoid output at 0.5 to get 0/1 labels
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)


In [None]:

# Show the first predictions next to the true labels. Slicing (instead of a
# fixed range(10)) keeps this working when the test split has fewer than 10 rows.
for actual, predicted in zip(y_test.iloc[:10], predictions[:10, 0]):
    print(f'Actual: {actual}, Predicted: {predicted}')


In [None]:
# Race card of the race to predict. Each row holds, in order:
# 馬號, 綵衣, 馬名, 檔位, 負磅, 騎師, 練馬師, 馬匹體重, 評分, 配備
input_data = [
    [1, 'H486', '竣誠駒', 12, 135, '鍾易禮', '告東尼', 1165, 40, 'TT'],
    [2, 'H083', '滿載歸來', 13, 133, '布文', '容天鵬', 1215, 38, 'B/TT'],
    [3, 'E166', '樂天派', 3, 132, '董明朗', '大衛希斯', 1132, 37, 'CP2'],
    [4, 'E025', '怡昌勇士', 6, 131, '田泰安', '賀賢', 1101, 36, 'CP/XB'],
    [5, 'G295', '神舟飛駒', 9, 129, '班德禮', '韋達', 1058, 34, 'B/TT'],
    [6, 'H196', '上市魅力', 5, 129, '艾兆禮', '蘇偉賢', 1092, 34, 'P/TT'],
    [7, 'G322', '國大合', 1, 129, '希威森', '廖康銘', 1054, 34, 'B'],
    [8, 'D235', '爸巴閉', 8, 123, '湯普新', '徐雨石', 1095, 28, 'TT-'],
    [9, 'E194', '符號', 2, 122, '巴度', '徐雨石', 997, 27, ''],
    [10, 'G072', '喜悅一生', 4, 121, '巫顯東 ', '鄭俊偉', 1154, 26, 'B/TT'],
    [11, 'H040', '小鳥', 10, 117, '蔡明紹', '葉楚航', 1011, 22, 'B'],
    [12, 'H285', '鑽石福將', 11, 115, '潘明輝', '蔡約翰', 1118, 17, ''],
    [13, 'E409', '綠登', 7, 115, '楊明綸', '蘇偉賢', 1112, 16, 'PC/TT']
]
input_df = pd.DataFrame(input_data, columns=['馬號','綵衣','馬名','檔位','負磅','騎師','練馬師','馬匹體重','評分','配備'])

# The model was trained on the columns listed in `features`, transformed by
# `label_encoders` and `scaler`. One-hot encoding the race card would produce a
# different set of columns, so the same transformations are re-applied here.
# The race card uses other names for two of the training columns:
race_card = input_df.rename(columns={'負磅': '實際負磅', '馬匹體重': '排位體重'})


def _encode_like_training(values, encoder):
    """Map values to the codes of a fitted LabelEncoder; unseen labels -> NaN."""
    # Compare as stripped strings so 1 / '1' and '巫顯東 ' / '巫顯東' match.
    codes = {str(label).strip(): code for code, label in enumerate(encoder.classes_)}
    return values.map(lambda value: codes.get(str(value).strip(), np.nan)).astype(float)


# Start from an all-NaN frame with exactly the training columns, then fill in
# whatever the race card provides. Training features that the card lacks, and
# labels never seen in training, stay NaN.
model_input = pd.DataFrame(np.nan, index=race_card.index, columns=features, dtype=float)
for column in features:
    if column not in race_card.columns:
        continue
    if column in label_encoders:
        model_input[column] = _encode_like_training(race_card[column], label_encoders[column])
    else:
        model_input[column] = pd.to_numeric(race_card[column], errors='coerce')

for column, dtype in model_input.dtypes.items():
    print(f'{column}: {dtype}')

# Standardise with the training scaler. After standardisation 0 is the training
# mean, so the remaining NaNs are imputed with "an average horse".
# NOTE(review): this mean-imputation is a stop-gap; a model meant for pre-race
# use should be trained only on features available before the race.
scaled_input = np.nan_to_num(scaler.transform(model_input), nan=0.0)

# Make predictions: one sigmoid output per horse (trained on "finished in the first four")
predictions = model.predict(scaled_input)

# Print the horses ranked by predicted probability, highest first
probabilities = predictions[:, 0]
for rank, row in enumerate(np.argsort(probabilities)[::-1], start=1):
    print(f'Rank {rank}: Horse {input_df.loc[row, "馬名"]} (Probability: {probabilities[row]})')


Old Model

In [None]:

def process_all_data(all_data):
    """Build the per-runner DataFrame used by the old model.

    Args:
        all_data: Iterable of mappings ``{date: (unused, horse_info)}`` where
            ``horse_info`` is a list of rows and its first row is a header.

    Returns:
        pandas.DataFrame with columns ``race_id`` (the date), ``horse_id``,
        ``position``, ``jockey``, ``trainer``, ``actual_weight``,
        ``declared_weight``, ``draw``, ``finish_time`` and ``win_odds``.
        Rows with fewer than 12 cells are reported with ``print`` and skipped.
    """
    # Index of each kept field within a scraped row, in output-column order
    # (horse_id, position, jockey, trainer, actual weight, declared weight,
    # draw, finish time, win odds).
    field_positions = (1, 0, 3, 4, 5, 6, 7, 10, 11)
    output_columns = [
        'race_id', 'horse_id', 'position', 'jockey', 'trainer', 'actual_weight',
        'declared_weight', 'draw', 'finish_time', 'win_odds'
    ]

    records = []
    for race_data in all_data:
        for date, (_, horse_info) in race_data.items():
            for horse in horse_info[1:]:  # first row is the header
                if len(horse) < 12:
                    print(f"Skipping incomplete entry for date {date}: {horse}")
                    continue
                records.append([date, *(horse[pos] for pos in field_positions)])

    return pd.DataFrame(records, columns=output_columns)
# Build the per-runner table from the scraped results.
# NOTE(review): `all_data` is only created by the commented-out scraping loop
# earlier in the notebook; presumably it must already exist in the session.
runs_df = process_all_data(all_data)

# Show the first rows as a quick sanity check
print(runs_df.head())

In [None]:
# Select relevant columns and drop rows with missing values. The explicit
# copy makes the column assignments below act on an independent frame instead
# of a slice of the previous one.
runs_df = runs_df[['race_id', 'horse_id','position', 'jockey', 'trainer', 'actual_weight', 'declared_weight', 'draw', 'finish_time','win_odds']]
runs_df = runs_df.dropna().copy()

# Parse 'draw' as a number; non-numeric entries become NaN and are removed.
# Going through to_numeric (rather than astype(int) on the raw strings) also
# accepts values such as "7.0".
runs_df['draw'] = pd.to_numeric(runs_df['draw'], errors='coerce')
runs_df = runs_df[runs_df['draw'].notnull()].copy()

# Convert 'draw' column to integers
runs_df['draw'] = runs_df['draw'].astype(int)

# Filter out strange draw values (above 14)
strange_draw_index = runs_df[runs_df['draw'] > 14].index
runs_df = runs_df.drop(strange_draw_index)

# Encode categorical columns: jockey and trainer
jockey_encoder = preprocessing.LabelEncoder()
runs_df['jockey'] = jockey_encoder.fit_transform(runs_df['jockey'])
trainer_encoder = preprocessing.LabelEncoder()
runs_df['trainer'] = trainer_encoder.fit_transform(runs_df['trainer'])

print(runs_df)


In [None]:

# Define the DataFrame preparation steps
def prepare_dataframe(df, max_draw=14):
    """Return the race/horse/position/draw columns with a clean integer draw.

    Args:
        df: DataFrame containing at least ``race_id``, ``horse_id``,
            ``position`` and ``draw``. It is not modified.
        max_draw: Largest draw number to keep; rows with a larger draw are
            dropped.

    Returns:
        A new DataFrame with those four columns. Rows with missing values, a
        non-numeric draw, or a draw above ``max_draw`` are removed, and
        ``draw`` is cast to int.
    """
    relevant_columns = ['race_id', 'horse_id', 'position', 'draw']
    # Copy so the assignments below never write into a slice of the caller's frame.
    cleaned = df[relevant_columns].dropna().copy()

    # Parse through to_numeric so strings such as "7.0" are accepted and
    # anything non-numeric becomes NaN (astype(int) on raw strings would raise).
    cleaned['draw'] = pd.to_numeric(cleaned['draw'], errors='coerce')
    keep = cleaned['draw'].notnull() & (cleaned['draw'] <= max_draw)
    cleaned = cleaned[keep].copy()

    cleaned['draw'] = cleaned['draw'].astype(int)
    return cleaned

# Reduce runs_df to race_id / horse_id / position / draw with an integer draw
runs_df = prepare_dataframe(runs_df)

# One row per race_id, one column per draw, cell value = finishing position.
# NOTE(review): assumes each (race_id, draw) pair occurs at most once - confirm.
runs_df_pivot = runs_df.pivot(index='race_id', columns='draw', values='position')

# Draws with no runner in a race are NaN after the pivot; store them as 0
runs_df_filled = runs_df_pivot.fillna(0)

print(runs_df_filled.head())

In [None]:
# NOTE(review): `data` is not defined in any cell shown here; presumably it is
# the race-level feature table joined with the pivoted results - confirm.

# The last 14 columns are the targets.
target_columns = list(data.columns[-14:])

# Make the targets numeric and drop rows whose targets cannot be parsed.
# Indexing `data` with a boolean frame that covers only the target columns
# would turn every other column into NaN, so the rows are filtered with
# to_numeric + dropna instead.
data = data.copy()
data[target_columns] = data[target_columns].apply(pd.to_numeric, errors='coerce')
data = data.dropna(subset=target_columns)

# Separate features (X) and target (y)
X = data.drop(columns=target_columns)
targets = data[target_columns]
# 1.0 where the value lies strictly between 0.5 and 1.5, else 0.0
y = ((targets > 0.5) & (targets < 1.5)).astype(float)

# Exclude 'race_id' column from features
X = X.drop('race_id', axis=1)

# Standardize the features (keep the index so X and y stay aligned by label)
ss = preprocessing.StandardScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns, index=X.index)

print("X shape:", X.shape)
print("y shape:", y.shape)

# Split data into train and test sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import Precision

In [None]:

# Replace NaN and infinite values with zeros. posinf/neginf must be given
# explicitly: by default nan_to_num maps infinities to the largest/smallest
# finite float rather than to 0.
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

# Define and compile the model: one 96-unit ReLU hidden layer and a 14-way
# softmax output. The input shape is declared with an explicit Input object,
# because passing `input_shape` to the first layer is the legacy form that
# current Keras warns about.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(96, activation='relu'),
    Dense(14, activation='softmax')
])
model.compile(optimizer=Adam(5e-04),
              loss=CategoricalCrossentropy(),
              metrics=[Precision(name='precision')])

In [None]:
import tensorflow as tf

# Wrap the train and test splits as shuffled tf.data pipelines, 500 rows per batch
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(len(X_train))
    .batch(500)
)
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .shuffle(len(X_test))
    .batch(500)
)

# Fit for 200 epochs, validating on the test split after each epoch
print("Start training...\n")
history = model.fit(train_dataset, epochs=200, validation_data=validation_dataset)
print("Training completed.")


In [None]:
# Per-epoch curves recorded by model.fit
precision = history.history['precision']
val_precision = history.history['val_precision']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(precision) + 1)


def _plot_curves(train_values, validation_values, metric_name):
    """Draw training (blue) and validation (red) curves on the current figure."""
    plt.plot(epochs, train_values, 'b', label=f'Training {metric_name}')
    plt.plot(epochs, validation_values, 'r', label=f'Validation {metric_name}')
    plt.title(f'Training and validation {metric_name}')
    plt.legend()


_plot_curves(precision, val_precision, 'precision')
plt.figure()  # start a second figure for the loss curves

_plot_curves(loss, val_loss, 'loss')
plt.show()

In [None]:
# Provided input data: eight race-level entries followed by one list per horse.
input_data = [
    '第1場02/06/2024', '星期日', '16:00', '第五班', '2000米', '草地', '沙田海讓賽', "B",
    # Horse information: number, colours code, name, draw, weight, jockey,
    # trainer, body weight, rating, gear, recent finishing positions
    [1, 'H486', '竣誠駒', 12, 135, '鍾易禮 (-3)', '告東尼', 1165, 40, 'TT', [5, 9, 14, 10, 12, 11]],
    [2, 'H083', '滿載歸來', 13, 133, '布文', '容天鵬', 1215, 38, 'B/TT', [10, 7, 7, 10, 3, 2]],
    [3, 'E166', '樂天派', 3, 132, '董明朗', '大衛希斯', 1132, 37, 'CP2', [10, 9, 11, 10, 12, 7]],
    [4, 'E025', '怡昌勇士', 6, 131, '田泰安', '賀賢', 1101, 36, 'CP/XB', [6, 8, 3, 11, 9, 9]],
    [5, 'G295', '神舟飛駒', 9, 129, '班德禮', '韋達', 1058, 34, 'B/TT', [3, 8, 12, 10, 10, 6]],
    [6, 'H196', '上市魅力', 5, 129, '艾兆禮', '蘇偉賢', 1092, 34, 'P/TT', [2, 2, 11, 6, 7, 8]],
    [7, 'G322', '國大合', 1, 129, '希威森', '廖康銘', 1054, 34, 'B', [4, 2, 10, 11, 10, 4]],
    [8, 'D235', '爸巴閉', 8, 123, '湯普新', '徐雨石', 1095, 28, 'TT-', [12, 7, 8, 2, 5, 10]],
    [9, 'E194', '符號', 2, 122, '巴度', '徐雨石', 997, 27, '', [1, 4, 5, 7, 7, 5]],
    [10, 'G072', '喜悅一生', 4, 121, '巫顯東 (-2)', '鄭俊偉', 1154, 26, 'B/TT', [2, 1, 4, 5, 10, 5]],
    [11, 'H040', '小鳥', 10, 117, '蔡明紹', '葉楚航', 1011, 22, 'B', [10, 10, 10, 12, 7, 9]],
    [12, 'H285', '鑽石福將', 11, 115, '潘明輝 (-2)', '蔡約翰', 1118, 17, '', [8, 6, 5, 7, 7, 6]],
    [13, 'E409', '綠登', 7, 115, '楊明綸', '蘇偉賢', 1112, 16, 'PC/TT', [6, 7, 3, 4, 9, 4]]
]
horses = input_data[8:]

# Per-horse features: draw, carried weight, number of listed previous runs.
# NOTE(review): the model was built for X_train.shape[1] inputs; confirm that
# these three features match what it was trained on.
horse_features = [[horse[3], horse[4], len(horse[10])] for horse in horses]

# Convert to a 2-D numpy array (one row per horse)
input_features_reshaped = np.array(horse_features)

# Make predictions using the trained model
predictions = model.predict(input_features_reshaped)

# Use the first output row: 14 softmax probabilities.
horse_probabilities = predictions[0]

# Output positions ordered from most to least probable
sorted_indices = np.argsort(horse_probabilities)[::-1]

# The model has 14 outputs but this race has 13 runners, so an output position
# cannot be used directly as a list index (position 13 would run past the end
# of input_data). Output k is taken to stand for draw k + 1, and runners are
# looked up by draw; draws with no runner in this race are skipped.
# NOTE(review): assumes the 14 target columns are draws 1-14 in order - confirm.
horse_by_draw = {horse[3]: horse for horse in horses}
top_4_indices = [index for index in sorted_indices if index + 1 in horse_by_draw][:4]
top_4_horses = [horse_by_draw[index + 1] for index in top_4_indices]

print("Top 4 horses with the highest probabilities of winning:")
for index, horse_info in zip(top_4_indices, top_4_horses):
    print(f"Horse {horse_info[0]}: {horse_info[2]}")  # horse number and name