In [None]:
!pip install selenium

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import shlex
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from google.colab import drive
import numpy as np
import tensorflow as tf
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import Precision

In [None]:
# Mount Google Drive at /content/drive; later cells read and write
# '/content/drive/My Drive/horse_data.xlsx' through this mount point.
drive.mount('/content/drive')

In [None]:
class HKJCWebScraper:
    """Headless-Chrome scraper for the HKJC local race results pages.

    The instance owns a Selenium browser session. It can be used as a context
    manager so the browser is shut down even when scraping raises:

        with HKJCWebScraper() as scraper:
            dates = scraper.get_dates()
    """

    def __init__(self):
        """Start a headless Chrome session and store the results URL prefix."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--remote-debugging-port=9222")
        self.browser = webdriver.Chrome(options=chrome_options)
        self.base_url = 'https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate='

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; never swallows exceptions."""
        self.close()
        return False

    def get_dates(self):
        """Return the option values of the ``<select id="selectId">`` element.

        The results page for 2024/05/29 is loaded only as an entry point that
        contains the selector.

        Returns:
            list[str]: the non-empty ``value`` attributes of the selector's
            options, or ``[]`` if anything goes wrong (the error is printed).
        """
        self.browser.get(self.base_url + '2024/05/29')  # Example date to access the page
        try:
            # Wait for the select element to be present
            select_element = WebDriverWait(self.browser, 20).until(
                EC.presence_of_element_located((By.ID, "selectId"))
            )
            # Only look at options that belong to this selector (a page-wide
            # search would also return options of any other <select>), and keep
            # polling until at least one option exists.
            options = WebDriverWait(self.browser, 20).until(
                lambda _driver: select_element.find_elements(By.TAG_NAME, "option")
            )
            values = (option.get_attribute("value") for option in options)
            # Skip placeholder options without a value; they cannot form a URL.
            return [value for value in values if value]
        except Exception as e:
            print(f"An error occurred while fetching dates: {e}")
            return []

    @staticmethod
    def _table_rows(table, skip_header=False):
        """Return the stripped ``<td>`` texts of every ``<tr>`` in ``table``.

        ``table`` may be ``None`` (table not found), which yields ``[]``.
        """
        if table is None:
            return []
        rows = table.find_all('tr')
        if skip_header:
            rows = rows[1:]
        return [
            [cell.get_text(strip=True) for cell in row.find_all('td')]
            for row in rows
        ]

    def scrape_data_for_date(self, date):
        """Load the results page for ``date`` and extract two tables.

        Args:
            date: string appended verbatim to ``base_url``.

        Returns:
            tuple[list[list[str]], list[list[str]]]: ``(race_tab_data,
            performance_data)``. ``race_tab_data`` holds the rows of the table
            with class ``race_tab`` without its first row; ``performance_data``
            holds every row (including the first) of the table with class
            ``f_tac table_bd draggable``. A missing table yields ``[]``.
        """
        self.browser.get(self.base_url + date)
        time.sleep(3)  # Wait for page to load
        soup = BeautifulSoup(self.browser.page_source, 'html.parser')

        # Race information table: the first row is skipped as a header.
        race_tab_data = self._table_rows(
            soup.find('table', class_='race_tab'), skip_header=True
        )

        # Performance table: the first row is kept in the output.
        performance_data = self._table_rows(
            soup.find('table', class_='f_tac table_bd draggable')
        )

        # The dividend table (class 'table_bd f_tac f_fs13 f_fl') is currently
        # not extracted.
        return race_tab_data, performance_data

    def close(self):
        """Quit the browser session."""
        self.browser.quit()

# Module-level scraper instance; constructing it launches a headless Chrome
# session immediately. Call scraper.close() when scraping is finished.
scraper = HKJCWebScraper()


In [None]:
# all_data = []
# dates = scraper.get_dates()
# for date in dates:
#     print(f"Scraping data for {date}")
#     try:
#         data = scraper.scrape_data_for_date(date)
#         all_data.append({date: data})
#     except Exception as e:
#         print(f"Failed to scrape data for {date}: {e}")

# file_path = '/content/drive/My Drive/horse_data.xlsx'
# all_data.to_excel(file_path, index=False)
def parse_data(all_data):
    """Flatten scraped results into a single DataFrame.

    Args:
        all_data: iterable of ``{date: (meta, race_results)}`` mappings.
            ``race_results`` is a list of rows whose first row supplies the
            column names; ``meta`` is ignored.

    Returns:
        pandas.DataFrame with one row per result row whose length matches the
        header, plus a ``'date'`` column. Dates without rows and rows of the
        wrong length are reported with ``print`` and skipped.
    """
    records = []
    for entry in all_data:
        for race_date, (_meta, table_rows) in entry.items():
            # Nothing was scraped for this date
            if not table_rows:
                print(f"No race results for {race_date}")
                continue

            header, *body = table_rows
            expected_width = len(header)
            for row in body:
                if len(row) == expected_width:
                    record = dict(zip(header, row))
                    record['date'] = race_date
                    records.append(record)
                else:
                    print(f"Mismatch in columns and data for {race_date}: {row}")

    return pd.DataFrame(records)

# `all_data` is produced by the scraping loop at the top of this cell, which is
# currently commented out. On a fresh kernel the name does not exist, so guard
# the export instead of aborting "Run All" with a NameError; the next cell
# reads the previously saved Excel file either way.
try:
    scraped_races = all_data
except NameError:
    scraped_races = None

file_path = '/content/drive/My Drive/horse_data.xlsx'

if scraped_races is None:
    print("all_data is not defined; skipping export and keeping the existing horse_data.xlsx")
else:
    df = parse_data(scraped_races)

    # Save as an Excel file
    df.to_excel(file_path, index=False)



In [None]:
# Load the data previously exported to Google Drive
file_path = '/content/drive/My Drive/horse_data.xlsx'
df = pd.read_excel(file_path)

# Show the first few rows
# print(df.head())
# Print the column labels so the names used by later cells can be checked
print(df.columns)

In [None]:
  # Step 3: Filter the data based on '名次'
  def clean_ranking(x):
      """Convert a raw '名次' (finishing position) cell to an int, or NaN.

      Args:
          x: raw cell value (string, number or missing).

      Returns:
          int for values that parse as an integer once the dead-heat marker
          '平頭馬' is removed; ``np.nan`` for missing values and for anything
          that cannot be converted.
      """
      if pd.isna(x):
          return np.nan
      if isinstance(x, str):
          # Strip the dead-heat marker so e.g. '1 平頭馬' still yields 1.
          x = x.replace('平頭馬', '').strip()
      try:
          # The conversion sits inside the try for every branch: a cell that
          # contains only the marker leaves '' behind, which must become NaN
          # rather than raise.
          return int(x)
      except (TypeError, ValueError, OverflowError):
          return np.nan

  # Step 3 (continued): normalise '名次' to integers / NaN
  if '名次' in df.columns:
      df['名次'] = df['名次'].apply(clean_ranking)
  else:
      print("Column '名次' does not exist in the DataFrame.")

  # Step 4: Select relevant columns and drop NaN values
  relevant_columns = ['馬號', '騎師', '練馬師', '實際負磅', '排位體重', '檔位', '名次']
  df = df[relevant_columns].dropna()

  # Step 5 + 6: Keep only rows whose '檔位' (draw) is a whole number in the
  # realistic range 1..20, then store it as int.
  # pd.to_numeric is used instead of str(x).isdigit() because Excel may load the
  # column as float, in which case str(5.0) == '5.0' fails isdigit() and every
  # row would be discarded.
  draw = pd.to_numeric(df['檔位'], errors='coerce')
  valid_draw = draw.notna() & (draw % 1 == 0) & draw.between(1, 20)
  # .copy() makes the filtered frame independent, so the assignment below does
  # not write into a slice of the previous frame (SettingWithCopyWarning).
  df = df[valid_draw].copy()
  df['檔位'] = draw[valid_draw].astype(int)

  # Step 7: Encode categorical columns
  # The fitted encoders are kept at module level; predict_top_4 needs them to
  # map names to the same integer codes.
  label_encoder_jockey = LabelEncoder()
  label_encoder_trainer = LabelEncoder()

  df['騎師'] = label_encoder_jockey.fit_transform(df['騎師'])
  df['練馬師'] = label_encoder_trainer.fit_transform(df['練馬師'])

  # One-hot encode the integer codes (columns '騎師_<code>' / '練馬師_<code>'),
  # emitted directly as 0/1 integers.
  df = pd.get_dummies(df, columns=['騎師', '練馬師'], dtype=int)

  # Step 8: Add a new column 'race_id' based on existing columns
  # NOTE(review): this is derived from the row index, so it is unique per row,
  # not per race - confirm this is intended.
  df['race_id'] = df.index + 1


In [None]:
# Feature engineering
def feature_engineering(df):
    """Coerce the weight columns to numbers and add a derived ratio column.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing the columns '排位體重' and '實際負磅'.

    Returns:
        The same DataFrame, with both columns converted via ``pd.to_numeric``
        (unparsable values become NaN) and a new column 'average_speed'.
    """
    for weight_column in ('排位體重', '實際負磅'):
        df[weight_column] = pd.to_numeric(df[weight_column], errors='coerce')

    # New feature: despite its name, 'average_speed' is computed as
    # '排位體重' divided by '實際負磅'.
    df['average_speed'] = df['排位體重'].div(df['實際負磅'])
    return df

df = feature_engineering(df)

# Define features and target
features = df.drop(columns=['名次', '馬號', 'race_id'])
target = df['名次']

# feature_engineering coerces unparsable weights to NaN, and a zero carried
# weight makes the ratio infinite. Such rows would turn the loss into NaN (or
# make the scaler raise), so train only on rows whose features are all finite.
usable_rows = np.isfinite(features.astype(float)).all(axis=1) & target.notna()
features = features[usable_rows]
target = target[usable_rows]

# Split the data into training and testing sets BEFORE scaling, so that the
# scaler statistics are computed from the training rows only and nothing about
# the test set leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Normalize the features (fit on the training split, apply to both splits)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the complete feature table, kept under its original name
# for any later cell that still refers to it.
features_scaled = scaler.transform(features)

# Build the neural network model
model = Sequential([
    Dense(512, input_dim=X_train.shape[1], activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Using linear activation for regression
])

# Compile the model: regression on the finishing position ('名次')
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Train the model; 20% of the training split is held out for validation
history = model.fit(X_train, y_train, epochs=2000, batch_size=32, validation_split=0.2)

# Evaluate the model on the untouched test split
loss, mae = model.evaluate(X_test, y_test)
print(f'Mean Absolute Error: {mae}')

#Step 2: Define a Function to Predict Top 4 Horses

In [None]:
def predict_top_4(model, scaler, label_encoder_jockey, label_encoder_trainer, input_data):
    """Predict finishing positions for a race card and return the best four.

    Args:
        model: trained regressor exposing ``predict``; it was trained with
            '名次' (finishing position) as the target, so a LOWER prediction
            means a better expected result.
        scaler: the fitted scaler used for the training features.
        label_encoder_jockey: fitted encoder for the '騎師' column.
        label_encoder_trainer: fitted encoder for the '練馬師' column.
        input_data: rows of
            ``[馬號, 馬名, 檔位, 負磅, 騎師, 練馬師, 馬匹體重]``.

    Returns:
        pandas.DataFrame with columns ``['馬號', '馬名', 'prediction']`` holding
        the (up to) four horses with the smallest predicted finishing position,
        best first.

    Raises:
        Whatever the encoders' ``transform`` raises for names they were not
        fitted on.
    """
    input_df = pd.DataFrame(input_data, columns=['馬號', '馬名', '檔位', '負磅', '騎師', '練馬師', '馬匹體重'])

    # Preserve '馬號' and '馬名' columns
    horse_ids = input_df['馬號']
    horse_names = input_df['馬名']

    # The training features are named '實際負磅' and '排位體重'. Without this
    # rename the reindex below finds neither column and fills both with 0.
    input_df = input_df.rename(columns={'負磅': '實際負磅', '馬匹體重': '排位體重'})
    for column in ('檔位', '實際負磅', '排位體重'):
        input_df[column] = pd.to_numeric(input_df[column], errors='coerce')
    # Same derived feature as feature_engineering() builds for training;
    # keep the two formulas in sync.
    input_df['average_speed'] = input_df['排位體重'] / input_df['實際負磅']

    # Encode categorical features
    input_df['騎師'] = label_encoder_jockey.transform(input_df['騎師'])
    input_df['練馬師'] = label_encoder_trainer.transform(input_df['練馬師'])
    input_df = pd.get_dummies(input_df, columns=['騎師', '練馬師'])

    # Reindex to match the training feature set. Prefer the column names the
    # scaler recorded when it was fitted; fall back to the notebook-level
    # `features` frame (excluding '馬號' and '馬名') if they are unavailable.
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is None:
        feature_columns = [col for col in features.columns if col not in ['馬號', '馬名']]
    else:
        feature_columns = list(feature_names)
    model_input = input_df.reindex(columns=feature_columns, fill_value=0).astype(float)
    # Treat infinities (e.g. division by a zero weight) like missing values.
    model_input = model_input.where(np.isfinite(model_input))

    # Normalize the input features. Missing values stay NaN through scaling and
    # are then set to 0, i.e. the training mean of a standardised feature, so a
    # horse with an unknown weight still gets a prediction.
    input_scaled = np.asarray(scaler.transform(model_input), dtype=float)
    input_scaled = np.where(np.isnan(input_scaled), 0.0, input_scaled)

    # Make predictions; the model returns shape (n, 1), flatten to one per horse
    predictions = np.asarray(model.predict(input_scaled)).reshape(-1)

    result = pd.DataFrame({
        '馬號': horse_ids,
        '馬名': horse_names,
        'prediction': predictions,
    })

    # The prediction is a finishing position, so the best horses are the ones
    # with the SMALLEST values.
    top_4_horses = result.nsmallest(4, 'prediction')
    return top_4_horses[['馬號', '馬名', 'prediction']]



#Input Data then Predict

In [None]:

# Example input data
# Each row follows the column order predict_top_4 expects:
#   [馬號, 馬名, 檔位, 負磅, 騎師, 練馬師, 馬匹體重]
# 馬匹體重 is given as a string; horse 4 has an empty string (no weight supplied).
input_data = [
    [1, '勁無敵', 5, 135, '蔡明紹','羅富全','1039'],
    [2, '同滿滿', 10, 135, '霍宏聲','伍鵬志','1082'],
    [3, '會展二號', 4, 134, '鍾易禮','告東尼','1145'],
    [4, '威武良駒', 7, 133, '布文','姚本輝',''],
    [5,'中華威威',8,132,'黃智弘','徐雨石','1243'],
    [6,'有你有我',12,130, '艾道拿','廖康銘','973'],
    [7,'飛躍精英',3,129, '巴度','賀賢','1088'],
    [8,'佳景臨門',1,128, '艾兆禮','葉楚航','1119'],
    [9,'萬事有',6,124,'希威森','容天鵬','1242'],
    [10,'友誼至佳',9,119,'董明朗','韋達','1182'],
    [11,'月球',11,118,'湯普新','黎昭昇','1154'],
    [12,'錢途光明',2,115,'潘明輝','丁冠豪','1068']
]

# Run the prediction with the model, scaler and encoders fitted in earlier
# cells, then print the returned table.
top_4_horses = predict_top_4(model, scaler, label_encoder_jockey, label_encoder_trainer, input_data)
print(top_4_horses)