## AUSTRALIAN OPEN 2025 PREPROCESSING

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report, confusion_matrix
import sys
from pathlib import Path

# Make the shared helpers in ../0.Utils importable. The membership check keeps
# re-runs of this cell from appending the same entry to sys.path again.
utils_path = Path("../0.Utils").resolve()
if str(utils_path) not in sys.path:
    sys.path.append(str(utils_path))
from utils import COLS_TO_EXCLUDE

In [9]:
# Load the full match history; unparseable dates are coerced to NaT and therefore
# drop out of the year filter below.
df_kaggle = pd.read_csv("../../Data/2025/atp_tennis_2025.csv")
df_kaggle["Date"] = pd.to_datetime(df_kaggle["Date"], errors='coerce')

# Keep only the 2025 season (copied so it is independent of the full table).
played_in_2025 = df_kaggle["Date"].dt.year == 2025
df_2025 = df_kaggle.loc[played_in_2025].copy()

# Narrow the season down to the Australian Open.
is_aus_open = df_2025["Tournament"] == "Australian Open"
df_aus_open_2025 = df_2025.loc[is_aus_open]

print(f"{len(df_aus_open_2025)} matches found for AUS OPEN 2025")
df_aus_open_2025.head()

116 matches found for AUS OPEN 2025


Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
64271,Australian Open,2025-01-12,Grand Slam,Outdoor,Hard,1st Round,5,Jasika O.,Gaston H.,Gaston H.,179,81,323,703,3.2,1.36,2-6 6-3 2-6 2-6
64272,Australian Open,2025-01-12,Grand Slam,Outdoor,Hard,1st Round,5,Fils A.,Virtanen O.,Fils A.,21,93,2280,627,1.13,6.0,3-6 7-6 6-4 6-4
64273,Australian Open,2025-01-12,Grand Slam,Outdoor,Hard,1st Round,5,Monteiro T.,Nishikori K.,Nishikori K.,105,76,566,743,4.5,1.2,6-4 7-6 5-7 2-6 3-6
64274,Australian Open,2025-01-12,Grand Slam,Outdoor,Hard,1st Round,5,Ruud C.,Munar J.,Ruud C.,6,62,4210,922,1.17,5.0,6-3 1-6 7-5 2-6 6-1
64275,Australian Open,2025-01-12,Grand Slam,Outdoor,Hard,1st Round,5,Bu Y.,Habib H.,Habib H.,67,219,784,264,1.36,3.2,6-7 4-6 6-7


In [13]:
# Persist the Australian Open 2025 subset to its own CSV.
# index=False keeps the DataFrame's row index out of the written file.
csv_path = Path('../../Datasets/aus_open_2025.csv')
df_aus_open_2025.to_csv(csv_path, index=False)

---

## Name matching (player labels → ATP IDs)

In [29]:
import pandas as pd
import json
from pathlib import Path

def name_label_to_id(label: str, csv_path: Path) -> int | None:
    """
    Convert a label "Last F." (e.g., "Monfils G.") to an ATP player ID.
    If no candidate or multiple candidates are found, return None and print warnings.
    """
    parts = label.strip().split()
    if len(parts) < 2:
        raise ValueError(f"Invalid label format: {label!r}")
    last_name = parts[0]
    first_initial = parts[1].replace('.', '')[0].lower()

    # Read ATP players CSV with low_memory=False to suppress dtype warnings
    players = pd.read_csv(
        csv_path,
        names=["id", "first", "last", "hand", "dob", "ioc", "height", "wikidata"],
        header=None,
        low_memory=False
    )

    # 1) Filter by last name
    mask_last = players["last"].str.lower().str.strip() == last_name.lower().strip()
    candidates_last = players[mask_last]
    if candidates_last.empty:
        print(f"WARNING: No players found with last name '{last_name}' for label '{label}'.")
        return None

    # 2) Filter by first name initial (na=False handles missing values)
    mask_first = candidates_last["first"].str.lower().str.startswith(first_initial, na=False)
    candidates = candidates_last[mask_first]
    if candidates.empty:
        print(f"WARNING: No players with initial '{first_initial.upper()}' for label '{label}'.")
        return None

    # 3) Handle multiple matches
    if len(candidates) > 1:
        print(f"WARNING: Multiple candidates for label '{label}':")
        print(candidates[["id", "first", "last"]].to_string(index=False))
        return None

    # 4) Exactly one match
    return int(candidates.iloc[0]["id"])

# Paths (ensure these are correct on your filesystem)
aus_csv = Path('../../Datasets/aus_open_2025.csv')
atp_csv = Path('../../Data/players/atp_players.csv')
output_json = Path('../../Datasets/aus_open_2025_matches.json')

# Load and filter Australian Open data
df = pd.read_csv(aus_csv)
df = df[df['Tournament'] == 'Australian Open']
if df.empty:
    # Fail with a clear message instead of an IndexError on .iloc[0] below.
    raise ValueError(f"No 'Australian Open' rows found in {aus_csv}")

# Resolve every distinct player label exactly once. Players appear in several
# rounds, so this avoids repeated lookups and repeated identical warnings.
# Row-major order keeps warnings in order of first appearance.
player_labels = pd.unique(df[['Player_1', 'Player_2']].to_numpy().ravel())
player_ids = {label: name_label_to_id(label, atp_csv) for label in player_labels}

# Prepare JSON structure
tournament = {
    "tournament": "Australian Open 2025",
    "surface":    df['Surface'].iloc[0].upper(),
    # Date is read back from CSV as text; min() returns the earliest value
    # provided the dates are ISO-formatted strings.
    "start_date": df['Date'].min(),
    "matches":    []
}

# Build each match entry; match_id is a 1-based running number in file order
for match_id, (_, row) in enumerate(df.iterrows(), start=1):
    p1_label, p2_label = row['Player_1'], row['Player_2']

    # Determine outcome (stays None when the winner matches neither label)
    winner = row.get('Winner')
    outcome = None
    if winner == p1_label:
        outcome = "player1"
    elif winner == p2_label:
        outcome = "player2"

    tournament["matches"].append({
        "round":    row['Round'],
        "match_id": match_id,
        "player1":  {"name": p1_label, "id": player_ids[p1_label]},
        "player2":  {"name": p2_label, "id": player_ids[p2_label]},
        "outcome":  outcome
    })

# Write JSON to disk
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(tournament, f, ensure_ascii=False, indent=2)

print(f"JSON file generated at: {output_json}")

    id     first last
102104 Christian Ruud
134770    Casper Ruud
    id  first    last
209261 Matteo Gigante
209263 Matteo Gigante
    id first   last
210084 Jakub Mensik
210150 Jakub Mensik
    id     first   last
124186 Alexandre Muller
142378    Arthur Muller
    id  first      last
211776 Martin Landaluce
212021 Martin Landaluce
    id first   last
147511   Jim Mccabe
210317 James Mccabe
212324  John Mccabe
    id    first     last
107270    James Thompson
111388    Jason Thompson
111442   Jordan Thompson
111683    Jamin Thompson
128290 Jonathan Thompson
132208     John Thompson
147490    Jerry Thompson
    id      first   last
117272 Jose Maria Draper
145281        J K Draper
207733       Jack Draper
    id     first     last
106415 Yoshihito Nishioka
132399     Yasuo Nishioka
    id    first  last
127760    Lukas Klein
140284  Laurent Klein
147680 Ladislav Klein
    id first last
104652  Todd Paul
126205 Tommy Paul
    id             first  last
105585                Ze Zhang
11

---

## Testing IDs

In [33]:
import json
from pathlib import Path

# Matches file to validate.
# NOTE(review): no cell visible here writes this "_all_ids" file — presumably a
# manually completed copy of aus_open_2025_matches.json; confirm its provenance.
json_path = Path('../../Datasets/aus_open_2025_matches_all_ids.json')
with json_path.open('r', encoding='utf-8') as f:
    tournament = json.load(f)

# Collect (match_id, role, name) for every player slot whose ID is still null
null_entries = [
    (match.get("match_id"), role, match[role].get("name"))
    for match in tournament.get("matches", [])
    for role in ("player1", "player2")
    if match[role].get("id") is None
]

# Report the total followed by one line per unresolved slot
total_nulls = len(null_entries)
print(f"Total null IDs: {total_nulls}")
print("List of null ID entries (match_id, player_role, player_name):")
for match_id, role, name in null_entries:
    print(f"  Match {match_id}: {role} → {name}")


Total null IDs: 0
List of null ID entries (match_id, player_role, player_name):
