In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time


In [None]:
# HTTP headers sent with every request; a desktop-browser User-Agent is used
# instead of the default python-requests one.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

# Meeting to scrape. These values are interpolated into the results URL as the
# RaceDate, Racecourse and RaceNo query parameters.
date = "2025/04/20"
course = "ST"
max_races = 10

# One parsed BeautifulSoup document per successfully downloaded race, in
# race-number order.
race_pages = []

for race in range(1, max_races + 1):
    attempt = 1
    timeouts = [10, 20, 30]  # request timeout in seconds for attempt 1, 2, 3
    # Derive the attempt budget from the timeout list so the
    # timeouts[attempt - 1] lookup below can never run out of range.
    max_attempts = len(timeouts)
    success = False
    # The URL only depends on the race number, so build it once per race.
    url = f"https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={course}&RaceNo={race}"

    while attempt <= max_attempts and not success:
        try:
            print(f"🔄 Attempt {attempt} for Race {race} (timeout: {timeouts[attempt - 1]}s)...")
            response = requests.get(url, headers=headers, timeout=timeouts[attempt - 1])
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "lxml")
                race_pages.append(soup)
                print(f"✅ Race {race} loaded successfully.")
                success = True
            else:
                print(f"❌ Race {race} returned status {response.status_code}")
        except Exception as e:
            # Best-effort: any failure (timeout, connection error, parse error)
            # counts as a failed attempt and is retried.
            print(f"⚠️  Race {race} failed on attempt {attempt}: {e}")

        if not success:
            attempt += 1
            # Pause only when another attempt will follow; sleeping after the
            # final failure would just delay the abort below.
            if attempt <= max_attempts:
                time.sleep(3)

    if not success:
        print(f"❌❌ Race {race} failed after {max_attempts} attempts. Exiting loop.")
        break  # Hard fail — don’t continue if any race is missed

print(f"\n✅ Loaded {len(race_pages)} races before exit.")

In [None]:
import re

def parse_race_details(soup):
    """Extract race-level metadata from one parsed results page.

    The function reads the rows of the ``<div class="race_tab">`` table and
    pulls the race number, class, distance, rating bracket, going, race name,
    course configuration and prize money out of fixed row/cell positions.

    Args:
        soup: Parsed results page (anything exposing BeautifulSoup's
            ``find`` / ``find_all`` / ``.text`` interface).

    Returns:
        dict with keys ``race_number``, ``race_class``, ``distance_m``,
        ``rating_bracket``, ``going``, ``race_name``, ``course_config`` and
        ``prize_hkd``; an empty dict when the ``race_tab`` div is missing or
        any field cannot be parsed (a message is printed in both cases).
    """
    race_tab = soup.find("div", class_="race_tab")
    if not race_tab:
        print("❌ Could not find race_tab div")
        return {}

    rows = race_tab.find_all("tr")

    try:
        # --- rows[0]: header cell containing "RACE <n>" ---
        race_number_text = rows[0].find("td").text.strip()
        race_number = int(re.search(r"RACE\s+(\d+)", race_number_text).group(1))

        # --- rows[2]: "<class> - <distance>M[ - (<rating>)]" and the going ---
        summary_cells = rows[2].find_all("td")
        class_distance_rating = summary_cells[0].text.strip()
        going = summary_cells[2].text.strip()

        # The class label is matched generically (not only "Class <n>") so
        # that other race types, e.g. "Group One - 2000M", are parsed instead
        # of being dropped. The rating bracket in parentheses is optional.
        rating_part = "Unrestricted"  # default when no bracket is present
        match = re.match(r"(.+?)\s*-\s*(\d+)M(?:\s*-\s*\((.+?)\))?", class_distance_rating)

        if not match:
            raise ValueError(f"Unable to parse class/distance/rating: '{class_distance_rating}'")

        class_part = match.group(1).strip()
        distance_part = int(match.group(2))
        if match.group(3):  # Only if rating is present
            rating_part = match.group(3)

        # --- rows[3]: race name and course configuration ---
        name_cells = rows[3].find_all("td")
        race_name = name_cells[0].text.strip()
        course_config = name_cells[2].text.strip()

        # --- rows[4]: prize money ---
        prize_raw = rows[4].find_all("td")[0].text.strip()
        prize_clean = int(re.sub(r"[^\d]", "", prize_raw))  # keep digits only (drops "HK$" and commas)

        return {
            "race_number": race_number,
            "race_class": class_part,
            "distance_m": distance_part,
            "rating_bracket": rating_part,
            "going": going,
            "race_name": race_name,
            "course_config": course_config,
            "prize_hkd": prize_clean
        }

    except Exception as e:
        # Deliberately broad: any layout surprise (missing row/cell, failed
        # regex, non-numeric prize) yields an empty result for this race.
        print(f"❌ Failed to parse race details: {e}")
        return {}



# Parse the header block of every downloaded results page. Pages that could
# not be parsed come back as an empty dict and are filtered out here.
race_details_list = [
    parsed for parsed in map(parse_race_details, race_pages) if parsed
]

# One row per successfully parsed race.
race_details_df = pd.DataFrame(race_details_list)
race_details_df

In [None]:
import pandas as pd

def safe_int(text):
    """Convert cell text to ``int``, or return ``None`` if that is not possible.

    Args:
        text: String to convert; surrounding whitespace is ignored.

    Returns:
        The parsed integer, or ``None`` when ``text`` is not a string
        (e.g. ``None``) or does not hold a valid integer literal.
    """
    try:
        return int(text.strip())
    # Only conversion problems are swallowed; a bare ``except`` would also
    # hide KeyboardInterrupt / SystemExit.
    except (AttributeError, TypeError, ValueError):
        return None

def safe_float(text):
    """Convert cell text to ``float``, or return ``None`` if that is not possible.

    Args:
        text: String to convert; surrounding whitespace is ignored.

    Returns:
        The parsed float, or ``None`` when ``text`` is not a string
        (e.g. ``None``) or does not hold a valid numeric literal.
    """
    try:
        return float(text.strip())
    # Only conversion problems are swallowed; a bare ``except`` would also
    # hide KeyboardInterrupt / SystemExit.
    except (AttributeError, TypeError, ValueError):
        return None

def parse_runners_table(soup):
    """Parse the per-runner results table of one results page.

    Reads every ``<tr>`` of the table inside ``<div class="performance">`` and
    converts fixed cell positions into typed columns. Rows whose win-odds cell
    (index 11) is empty, ``"---"`` or ``"-"`` are skipped, as are rows with
    fewer than 12 cells.

    Args:
        soup: Parsed results page (BeautifulSoup-like object).

    Returns:
        DataFrame with columns ``placing``, ``horse_no``, ``horse_name``,
        ``horse_id``, ``jockey``, ``trainer``, ``actual_weight``,
        ``declared_weight``, ``draw``, ``finish_time_sec`` and ``win_odds``;
        an empty DataFrame when the performance div is missing or no row
        qualifies.
    """
    performance_div = soup.find("div", class_="performance")
    if not performance_div:
        print("❌ Could not find performance div")
        return pd.DataFrame()

    # Fall back to the div itself when the markup carries no explicit <tbody>,
    # instead of failing on None.find_all.
    tbody = performance_div.find("tbody")
    rows = (tbody if tbody is not None else performance_div).find_all("tr")
    data = []

    for row in rows:
        cells = row.find_all("td")

        # The highest index read below is 11; shorter rows (headers, spacers)
        # cannot be runner rows and must not abort the whole table.
        if len(cells) < 12:
            continue

        # Skip if no win odds (i.e., didn't start)
        win_odds_raw = cells[11].text.strip()
        if win_odds_raw == "" or win_odds_raw in {"---", "-"}:
            continue

        try:
            placing = safe_int(cells[0].text)
            horse_no = safe_int(cells[1].text)

            # Horse cell looks like "NAME (ID)"; the ID part is optional.
            horse_info = cells[2].text.strip()
            horse_name = horse_info.split("(")[0].strip()
            horse_id = horse_info.split("(")[1].replace(")", "").strip() if "(" in horse_info else None

            jockey = cells[3].text.strip()
            trainer = cells[4].text.strip()
            act_weight = safe_int(cells[5].text)
            declared_weight = safe_int(cells[6].text)
            draw = safe_int(cells[7].text)

            # Convert finish time (e.g., 1:39.62) → 99.62 seconds
            finish_time_raw = cells[10].text.strip()
            if ":" in finish_time_raw:
                minutes, seconds = map(float, finish_time_raw.split(":"))
                finish_time_sec = round(minutes * 60 + seconds, 2)
            else:
                finish_time_sec = None  # e.g., DNF

            win_odds = safe_float(win_odds_raw)

            data.append({
                "placing": placing,
                "horse_no": horse_no,
                "horse_name": horse_name,
                "horse_id": horse_id,
                "jockey": jockey,
                "trainer": trainer,
                "actual_weight": act_weight,
                "declared_weight": declared_weight,
                "draw": draw,
                "finish_time_sec": finish_time_sec,
                "win_odds": win_odds
            })

        except Exception as e:
            # Best-effort per row: a malformed row is reported and skipped.
            print(f"⚠️ Failed to parse row: {e}")

    return pd.DataFrame(data)

# Parse the runners table of every downloaded race and stack the results.
all_runners = []

for i, soup in enumerate(race_pages):
    if soup is None:
        print(f"⚠️ Race {i + 1} soup is None. Skipping.")
        continue

    try:
        df = parse_runners_table(soup)
        if not df.empty:
            # Race number is taken from the page's position in race_pages.
            df["race_number"] = i + 1
            all_runners.append(df)
            print(f"✅ Parsed race {i + 1}: {len(df)} runners")
        else:
            print(f"⚠️ Race {i + 1} has empty DataFrame")
    except Exception as e:
        print(f"❌ Error parsing race {i + 1}: {e}")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no race produced any runners.
if all_runners:
    df_results = pd.concat(all_runners, ignore_index=True)
else:
    print("⚠️ No runner tables were parsed; df_results is empty.")
    df_results = pd.DataFrame()
df_results.head()


In [None]:
import pandas as pd

def parse_place_dividends(soup, race_number):
    """Collect the PLACE pool payouts from one results page.

    Walks the rows of ``<div class="dividend_tab">``. Collection starts on the
    row whose first cell reads "PLACE" (case-insensitive), continues over
    following two-cell rows, and stops at the next row whose first cell is
    non-empty. Payouts are divided by 10 and a comma-separated combination
    yields one record per horse number.

    Args:
        soup: Parsed results page (BeautifulSoup-like object).
        race_number: Value stored in the ``race_number`` column and used in
            warning messages.

    Returns:
        DataFrame with columns ``horse_no``, ``place_dividend_hkd`` and
        ``race_number``; empty when the dividend div is missing or no PLACE
        row could be parsed.
    """
    dividend_div = soup.find("div", class_="dividend_tab")
    if not dividend_div:
        print(f"⚠️ Race {race_number}: No dividend_tab found")
        return pd.DataFrame()

    records = []
    in_place_section = False

    for tr in dividend_div.find_all("tr"):
        tds = tr.find_all("td")
        if not tds:
            continue

        # Upper-cased label of the leading cell decides how the row is treated.
        label = tds[0].text.strip().upper()

        if label == "PLACE":
            # Opening row: [PLACE, horse_no, payout] → drop the label cell.
            in_place_section = True
            payload = tds[1:]
        elif in_place_section and len(tds) == 2:
            # Continuation row: [horse_no, payout].
            payload = tds
        else:
            # Any other labelled row ends an open PLACE section; either way
            # the row itself carries nothing to collect.
            if label and in_place_section:
                in_place_section = False
            continue

        try:
            combination = payload[0].text.strip()
            payout_text = payload[1].text.strip().replace(",", "")

            # Normalize HK$10 → HK$1 unit
            unit_payout = float(payout_text) / 10

            # Ties are listed as e.g. "1,11" → one record per horse.
            runner_numbers = [int(token) for token in combination.split(",")]

            records.extend(
                {
                    "horse_no": number,
                    "place_dividend_hkd": unit_payout,
                    "race_number": race_number
                }
                for number in runner_numbers
            )
        except Exception as e:
            print(f"⚠️ Race {race_number}: Error parsing PLACE row: {e}")

    return pd.DataFrame(records)

# Collect PLACE dividends for every downloaded race; race numbers are taken
# from the page's 1-based position in race_pages.
all_divs = []
for i, soup in enumerate(race_pages, start=1):
    df_div = parse_place_dividends(soup, race_number=i)
    if not df_div.empty:
        all_divs.append(df_div)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no race produced any PLACE dividends.
if all_divs:
    df_dividends = pd.concat(all_divs, ignore_index=True)
else:
    print("⚠️ No PLACE dividends were parsed; df_dividends is empty.")
    df_dividends = pd.DataFrame()
df_dividends

#### Scraping stewards' reports

In [None]:
from datetime import datetime

def parse_stewards_report(date_str: str, retries=3, delay=5) -> pd.DataFrame:
    """Scrape meeting-level details from the full race report for one date.

    Downloads the ``RaceReportFull`` page and reads the meeting date and
    weekday, the number of races, the start time, the course, and up to two
    penetrometer and Clegg hammer readings together with their "as of" times.

    Args:
        date_str: Date string placed in the URL's ``Date`` query parameter,
            e.g. ``"2025/04/16"``.
        retries: Maximum number of download attempts.
        delay: Seconds to sleep between failed attempts.

    Returns:
        A one-row DataFrame with the columns built at the end of this
        function, or an empty DataFrame when the page could not be downloaded
        within ``retries`` attempts (including ``retries <= 0``).

    Raises:
        AttributeError, IndexError, ValueError: if the meeting/schedule/course
            blocks of the page are missing or not in the expected format.
    """
    url = f"https://racing.hkjc.com/racing/information/English/Reports/RaceReportFull.aspx?Date={date_str}"

    # `resp` stays None until a request succeeds, so a zero/negative retry
    # budget or an exhausted one ends in the same explicit failure path
    # (rather than reaching the parsing code with `resp` unbound).
    resp = None
    for attempt in range(1, retries + 1):
        try:
            candidate = requests.get(url, headers=headers, timeout=10)
            candidate.raise_for_status()
            resp = candidate
            break
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Attempt {attempt} failed for {date_str}: {e}")
            if attempt < retries:
                time.sleep(delay)

    if resp is None:
        print(f"❌ Failed to load steward report for {date_str} after {retries} attempts.")
        return pd.DataFrame()

    soup = BeautifulSoup(resp.content, "lxml")

    # --- Metadata ---
    # First two direct child divs of data_meeting: meeting line and schedule line.
    meta_divs   = soup.find("div", class_="data_meeting").find_all("div", recursive=False)
    meeting_txt = meta_divs[0].get_text(strip=True)
    schedule_txt= meta_divs[1].get_text(strip=True)

    # Meeting line is "<label>: <date> (<weekday>)"; exactly one space is
    # expected between date and weekday.
    date_part   = meeting_txt.split(":", 1)[1].strip()
    date_only, weekday = date_part.split(" ")
    weekday     = weekday.strip("()")
    # Schedule line starts with the race count; the time follows the first ": ".
    n_races     = int(schedule_txt.split()[0])
    start_time  = schedule_txt.split(": ")[1]

    # Round-trip through strptime: validates the value as HH:MM and zero-pads it.
    start_time = datetime.strptime(start_time, "%H:%M").strftime("%H:%M")

    go_block    = soup.find("div", class_="data_go")
    course_link = go_block.find("a", class_="p2").get_text(strip=True)
    course      = course_link.split("-")[0].strip()  # text before the first "-"

    def _extract_two(label: str):
        """Return (value1, time1, value2, time2) for the reading row `label`."""
        td_label = go_block.find("td", string=lambda t: t and label in t)
        td_val = td_label.find_next_sibling("td")
        # Readings are the spans whose inline style mentions "#900".
        spans = td_val.find_all("span", style=lambda s: s and "#900" in s)
        # Timestamps are the text fragments containing "as of".
        time_strings = [sib.strip("() ").replace("as of", "").strip() for sib in td_val.stripped_strings if "as of" in sib]

        v1 = float(spans[0].get_text(strip=True))
        t1 = datetime.strptime(time_strings[0], "%I:%M %p").strftime("%H:%M")
        if len(spans) > 1:
            v2 = float(spans[1].get_text(strip=True))
            t2 = datetime.strptime(time_strings[1], "%I:%M %p").strftime("%H:%M")
        else:
            v2, t2 = None, None
        return v1, t1, v2, t2

    def _extract_two_or_none(label: str):
        """Like `_extract_two`, but all-None when the reading is not on the page."""
        try:
            return _extract_two(label)
        # AttributeError: label row / value cell not found.
        # IndexError: row present but without a reading or timestamp.
        # Malformed numbers or times (ValueError) still propagate.
        except (AttributeError, IndexError):
            return None, None, None, None

    # Both readings are optional and handled the same way, so a report that
    # lacks either one no longer aborts the whole scrape.
    pen1, pen_t1, pen2, pen_t2 = _extract_two_or_none("Penetrometer Reading")
    c1, c_t1, c2, c_t2 = _extract_two_or_none("Clegg Hammer Reading")

    return pd.DataFrame([{
        "meeting_date":              pd.to_datetime(date_only, dayfirst=True).date(),
        "weekday":                   weekday,
        "n_races":                   n_races,
        "start_time":                start_time,
        "course":                    course,
        "penetrometer_reading1":     pen1,
        "penetrometer_time1":        pen_t1,
        "penetrometer_reading2":     pen2,
        "penetrometer_time2":        pen_t2,
        "clegg_reading1":            c1,
        "clegg_time1":               c_t1,
        "clegg_reading2":            c2,
        "clegg_time2":               c_t2
    }])

# Scrape the meeting-level summary (schedule, course and track readings) from
# the 2025/04/16 report and print it only if the scrape returned any rows.
df_track = parse_stewards_report("2025/04/16")
if not df_track.empty:
    print(df_track)

In [None]:
def parse_incidents(soup, date_str):
    """Extract per-horse incident notes from a parsed race report page.

    For every ``<p class="bg_blue">`` header whose first ``<span>`` reads like
    ``"Race:1 (592)"``, the next ``<table class="rirr">`` in the document is
    read and each row with at least six cells becomes one record.

    Args:
        soup: Parsed race report page (BeautifulSoup-like object).
        date_str: Meeting date; parsed with ``pd.to_datetime(dayfirst=False)``
            and stored in every record.

    Returns:
        DataFrame with columns ``meeting_date``, ``race_number``,
        ``race_number_season``, ``placing``, ``horse_no``, ``horse_name``,
        ``horse_id``, ``jockey`` and ``incident`` (text columns are left as
        strings); empty when nothing was found.
    """
    meeting_date = pd.to_datetime(date_str, dayfirst=False).date()
    all_rows = []

    # Step 1: Find all race headers (race titles)
    race_headers = soup.find_all("p", class_="bg_blue")

    for race_header in race_headers:
        spans = race_header.find_all("span")
        if not spans:
            continue

        # Read the header text before the try block so the warning below can
        # always reference it.
        race_text = spans[0].text.strip()  # "Race:1 (592)"
        try:
            race_number = int(race_text.split(":")[1].split("(")[0].strip())
            race_number_season = int(race_text.split("(")[1].strip(")"))
        except (IndexError, ValueError) as e:
            # Header is not in "Race:<n> (<season n>)" form → not a race header.
            print(f"⚠️ Failed to parse race header: {race_text} — {e}")
            continue

        # Step 2: Get the next incident table that follows this header
        incident_table = race_header.find_next("table", class_="rirr")
        if not incident_table:
            continue

        # Not every parser/markup provides a <tbody>; read the rows from the
        # table itself in that case instead of failing on None.find_all.
        tbody = incident_table.find("tbody")
        rows = (tbody if tbody is not None else incident_table).find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 6:
                continue

            placing = cols[0].text.strip()
            horse_no = cols[1].text.strip()

            # Horse cell looks like "NAME (ID)"; the ID part is optional.
            horse_info = cols[3].text.strip()
            if "(" in horse_info:
                horse_name = horse_info.split("(")[0].strip()
                horse_id = horse_info.split("(")[1].strip(")")
            else:
                horse_name = horse_info
                horse_id = None

            jockey = cols[4].text.strip()
            incident = cols[5].text.strip()

            all_rows.append({
                "meeting_date": meeting_date,
                "race_number": race_number,
                "race_number_season": race_number_season,
                "placing": placing,
                "horse_no": horse_no,
                "horse_name": horse_name,
                "horse_id": horse_id,
                "jockey": jockey,
                "incident": incident
            })

    return pd.DataFrame(all_rows)

# Download the full race report for 2025/04/16 and extract the incident notes.
url = "https://racing.hkjc.com/racing/information/English/Reports/RaceReportFull.aspx?Date=2025/04/16"
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page into
# an empty incidents table.
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "lxml")

df_incidents = parse_incidents(soup, date_str="2025/04/16")
df_incidents

In [None]:
import pandas as pd
from bs4 import BeautifulSoup

def parse_stewards_summary(soup, date_str):
    """Collect the "General" and "Summary" text entries of a race report page.

    Among the ``<div class="other">`` blocks, the ones whose
    ``<p class="f_fs16 font_wb">`` label reads "general" or "summary"
    (case-insensitive) are used. Each direct child ``<div>`` of such a block
    is one entry made of an optional ``<p class="bg_blue">`` heading and a
    ``<p class="f_fs16">`` content paragraph.

    Args:
        soup: Parsed race report page (BeautifulSoup-like object).
        date_str: Meeting date; parsed with ``pd.to_datetime`` and stored in
            every record.

    Returns:
        DataFrame with columns ``meeting_date``, ``section`` and ``content``.
        General entries get ``section="General"`` with the heading (if any)
        prepended to the content; Summary entries use their heading, minus a
        leading "N. " prefix, as the section name.
    """
    meeting_date = pd.to_datetime(date_str).date()

    def _paragraph_pairs(container):
        # Yield (heading <p>, content <p>) for each direct child <div>.
        for block in container.find_all("div", recursive=False):
            yield block.find("p", class_="bg_blue"), block.find("p", class_="f_fs16")

    # Map label → div; when a label occurs more than once the last one wins.
    located = {"general": None, "summary": None}
    for candidate in soup.find_all("div", class_="other"):
        title = candidate.find("p", class_="f_fs16 font_wb")
        if not title:
            continue
        key = title.get_text(strip=True).lower()
        if key in located:
            located[key] = candidate

    records = []

    # --- GENERAL: heading is optional and, when present, prefixes the text ---
    if located["general"]:
        for heading, body in _paragraph_pairs(located["general"]):
            if not body:
                continue

            body_text = body.get_text(separator="\n", strip=True)
            if heading:
                combined = heading.get_text(strip=True) + "\n" + body_text
            else:
                combined = body_text

            records.append({
                "meeting_date": meeting_date,
                "section": "General",
                "content": combined
            })

    # --- SUMMARY: heading is required and becomes the section name ---
    if located["summary"]:
        for heading, body in _paragraph_pairs(located["summary"]):
            if not (heading and body):
                continue

            # Strip the numbering prefix ("1. ", "2. ", ...) from the heading.
            section_name = heading.get_text(strip=True).split(". ", 1)[-1]

            records.append({
                "meeting_date": meeting_date,
                "section": section_name,
                "content": body.get_text(separator="\n", strip=True)
            })

    return pd.DataFrame(records)


In [None]:
# Download the full race report for 2025/04/20 and extract the General/Summary
# sections of the stewards' report.
url = "https://racing.hkjc.com/racing/information/English/Reports/RaceReportFull.aspx?Date=2025/04/20"
# Use the notebook-wide `headers` (same User-Agent as every other request)
# rather than a one-off header dict.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "lxml")

df_summary = parse_stewards_summary(soup, "2025/04/20")
df_summary


In [None]:
import os
import time
import uuid
import pandas as pd
from google.cloud import bigquery

# Set credentials: point the Google client libraries at the service-account key.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.expanduser("~/gcp/credentials/hk_racing_sa_key.json")

# Load CSV
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
csv_path = "/Users/yastherramgath/hk-racing-project/notebooks/race_details.csv"
df = pd.read_csv(csv_path)

# Config
PROJECT_ID = "project-benter-428008-b7"
DATASET_ID = "hk_racing_dataset"
TABLE_NAME = "race_details"
table_id = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_NAME}"
client = bigquery.Client(project=PROJECT_ID)

# Job config: replace the table contents on every load and let BigQuery infer
# the schema.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    autodetect=True,
    source_format=bigquery.SourceFormat.CSV,
)

# Unique job ID (reuse for safety in retry)
# NOTE(review): a job that was created and then failed keeps its ID, so a
# retry with the same ID may be rejected as a duplicate — confirm against the
# BigQuery job-ID semantics.
job_id = f"upload_{TABLE_NAME}_{uuid.uuid4().hex[:8]}"

# Retry with exponential backoff (1, 2, 4, 8 seconds between attempts).
MAX_UPLOAD_ATTEMPTS = 5
upload_succeeded = False

for attempt in range(MAX_UPLOAD_ATTEMPTS):
    try:
        print(f"🔁 Attempt {attempt + 1}: Uploading to {table_id} with job ID {job_id}")
        job = client.load_table_from_dataframe(
            df, table_id, job_config=job_config, job_id=job_id
        )
        job.result()  # Wait for the job to complete
        print(f"✅ Success: Uploaded {len(df)} rows to {table_id}")
        upload_succeeded = True
        break
    except Exception as e:
        print(f"⚠️ Upload failed: {e}")
        # Back off only when another attempt will follow; waiting after the
        # last failure would just delay the final report.
        if attempt + 1 < MAX_UPLOAD_ATTEMPTS:
            wait = 2 ** attempt
            print(f"⏳ Waiting {wait} seconds before retrying...")
            time.sleep(wait)

# Make the overall outcome explicit instead of ending silently after the
# last failed attempt.
if not upload_succeeded:
    print(f"❌ Giving up: upload to {table_id} failed after {MAX_UPLOAD_ATTEMPTS} attempts.")
