## AUSTRALIAN OPEN 2025 - PREDICTION

In [26]:
import json
import pandas as pd
from pathlib import Path
import sys
from importlib.machinery import SourceFileLoader

# === Load utils.py dynamically ===
# Loader.load_module() is deprecated, so the module object is built from a spec
# and executed with exec_module(), which is the supported replacement.
import importlib.util

utils_file = Path("../0.Utils/utils.py").resolve()
_utils_loader = SourceFileLoader("utils", str(utils_file))
_utils_spec = importlib.util.spec_from_loader("utils", _utils_loader)
utils = importlib.util.module_from_spec(_utils_spec)
# load_module() registered the module in sys.modules; keep doing so, and undo
# the registration if executing utils.py fails so no half-initialised module is left.
sys.modules["utils"] = utils
try:
    _utils_loader.exec_module(utils)
except BaseException:
    sys.modules.pop("utils", None)
    raise

In [27]:
# 1) Configuration — input locations and the cutoff passed to the feature loader
JSON_PATH    = Path('../../Datasets/aus_open_2025_matches_all_ids.json')
PARQUET_PATH = Path('../../Datasets/final_tennis_dataset_symmetric.parquet')
MODEL_PATH   = Path('../../Models/xgb_model.json')
CUTOFF_DATE  = '2025-01-01'

# 2) Read the tournament description; it provides 'surface' and 'matches'
with JSON_PATH.open(mode='r', encoding='utf-8') as f:
    tournament = json.load(f)
surface = tournament['surface']

# 3) Feature tables (global + per surface) and the trained model, both via utils
global_df, surface_dfs = utils.get_latest_features_by_surface(
    PARQUET_PATH,
    CUTOFF_DATE,
)
model = utils.load_trained_model(MODEL_PATH)

# 4) Match-by-match evaluation
# Besides the running accuracy counters, every evaluated match is stored as a
# record and collected into `df_results`, the frame the per-round detail cell
# reads (no cell defined it before, so a fresh-kernel run failed there).
total_evaluated = 0
total_correct   = 0
accuracy_by_round = {}
match_records = []

for match in tournament['matches']:
    m_id    = match['match_id']
    rnd     = match['round']
    p1_id   = match['player1']['id']
    p2_id   = match['player2']['id']
    p1_name = match['player1']['name']
    p2_name = match['player2']['name']
    actual  = match['outcome']

    # a) Skip if IDs or real outcome are missing
    if p1_id is None or p2_id is None:
        print(f"Skipping {m_id} ({rnd}): missing ID — {p1_name}={p1_id}, {p2_name}={p2_id}")
        continue
    if actual is None:
        print(f"Skipping {m_id} ({rnd}): actual outcome missing")
        continue

    # b) Try to build features and predict; predict_match raises KeyError when
    #    a player has no feature row, in which case the match is skipped
    try:
        prob = utils.predict_match(p1_id, p2_id, surface, model, global_df, surface_dfs)
    except KeyError as e:
        print(f"Skipping {m_id} ({rnd}): missing features — {e}")
        continue

    # prob is treated as player 1's win probability; ties at 0.5 go to player 1
    pred = 'player1' if prob >= 0.5 else 'player2'
    is_correct = (pred == actual)

    # c) Record the result (bool adds as 0/1 to the counters)
    total_evaluated += 1
    total_correct   += is_correct
    stats = accuracy_by_round.setdefault(rnd, {'total': 0, 'correct': 0})
    stats['total']   += 1
    stats['correct'] += is_correct

    match_records.append({
        'match_id': m_id,
        'round':    rnd,
        'player1':  p1_name,
        'player2':  p2_name,
        'prob_p1':  float(prob),
        'pred':     pred,
        'actual':   actual,
        'correct':  int(is_correct),
    })

# d) One row per evaluated match. Columns are given explicitly so the frame
#    can still be filtered by 'round' when no match could be evaluated.
df_results = pd.DataFrame(
    match_records,
    columns=['match_id', 'round', 'player1', 'player2',
             'prob_p1', 'pred', 'actual', 'correct'],
)

# 5) Reporting — overall accuracy first, then one line per round
if not total_evaluated:
    print("\nNo matches could be evaluated.\n")
else:
    overall_acc = total_correct / total_evaluated
    print(f"\nEvaluated {total_evaluated} matches; overall accuracy: {overall_acc:.2%}\n")

print("Accuracy by round:")
# Rounds appear in the order they were first seen during evaluation
for rnd in accuracy_by_round:
    stats = accuracy_by_round[rnd]
    acc = stats['correct'] / stats['total']
    print(f"  {rnd}: {acc:.2%} ({stats['correct']}/{stats['total']})")

Skipping 9 (1st Round): missing features — 'Player 206904 not found for surface HARD'
Skipping 19 (1st Round): missing features — 'Player 211776 not found for surface HARD'
Skipping 24 (1st Round): missing features — 'Player 113451 not found for surface HARD'
Skipping 27 (1st Round): missing features — 'Player 132399 not found for surface HARD'
Skipping 51 (1st Round): missing features — 'Player 207729 not found for surface HARD'
Skipping 55 (1st Round): missing features — 'Player 120759 not found for surface HARD'
Skipping 61 (2nd Round): missing features — 'Player 132399 not found for surface HARD'
Skipping 70 (2nd Round): missing features — 'Player 113451 not found for surface HARD'
Skipping 80 (2nd Round): missing features — 'Player 207729 not found for surface HARD'
Skipping 96 (3rd Round): missing features — 'Player 113451 not found for surface HARD'

Evaluated 107 matches; overall accuracy: 67.29%

Accuracy by round:
  1st Round: 66.04% (35/53)
  2nd Round: 59.26% (16/27)
  3rd 

In [28]:
# Rounds whose individual matches are printed in full
detail_rounds = ['Semifinals', 'The Final']

# For each round, filter the evaluated matches and print them as a plain-text table
for rnd in detail_rounds:
    df_r = df_results.loc[df_results['round'] == rnd]
    if not df_r.empty:
        print(f"=== {rnd} ===")
        print(
            df_r[['match_id','player1','player2','prob_p1','pred','actual','correct']]
            .to_string(index=False)
        )
        print()
    else:
        print(f"No matches evaluated for {rnd}\n")

=== Semifinals ===
 match_id     player1    player2  prob_p1    pred  actual  correct
      115  Shelton B.  Sinner J. 0.287103 player2 player2        1
      116 Djokovic N. Shelton B. 0.528279 player1 player2        0

=== The Final ===
 match_id   player1   player2  prob_p1    pred  actual  correct
      117 Sinner J. Zverev A. 0.762118 player1 player1        1



---

## Monte Carlo simulation

In [29]:
import json
import random
from pathlib import Path
from importlib.machinery import SourceFileLoader

# 1) Load your utils module
# Loader.load_module() is deprecated, so the module object is built from a spec
# and executed with exec_module(), which is the supported replacement.
import importlib.util
import sys

utils_file = Path("../0.Utils/utils.py").resolve()
_utils_loader = SourceFileLoader("utils", str(utils_file))
_utils_spec = importlib.util.spec_from_loader("utils", _utils_loader)
utils = importlib.util.module_from_spec(_utils_spec)
# load_module() registered the module in sys.modules; keep doing so, and undo
# the registration if executing utils.py fails so no half-initialised module is left.
sys.modules["utils"] = utils
try:
    _utils_loader.exec_module(utils)
except BaseException:
    sys.modules.pop("utils", None)
    raise

# 2) Configuration
JSON_PATH    = Path('../../Datasets/aus_open_2025_matches_all_ids.json')
PARQUET_PATH = Path('../../Datasets/final_tennis_dataset_symmetric.parquet')
MODEL_PATH   = Path('../../Models/xgb_model.json')
CUTOFF_DATE  = '2025-01-01'
MC_RUNS      = 30  # number of Monte Carlo tournament simulations
RANDOM_SEED  = 42  # fixed seed so every run of this cell simulates the same tournaments

# 3) Seed Python's RNG: match winners are drawn with random.random(), so
#    without a seed the simulation output changes on every execution
random.seed(RANDOM_SEED)

# 4) Load the tournament draw from JSON
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    tournament = json.load(f)
surface = tournament['surface']

# 5) Build an ID-to-name map for readable output.
#    Slots without an ID are left out; if an ID occurs in several matches the
#    name from the last occurrence is kept.
id_to_name = {
    match[side]["id"]: match[side]["name"]
    for match in tournament['matches']
    for side in ("player1", "player2")
    if match[side]["id"] is not None
}

# 6) Load pre-match feature snapshots and the trained model
global_df, surface_dfs = utils.get_latest_features_by_surface(PARQUET_PATH, CUTOFF_DATE)
model = utils.load_trained_model(MODEL_PATH)

# 7) Prepare the first-round bracket as a list of (player1_id, player2_id).
#    Matches are ordered by match_id; consecutive winners are paired in later
#    rounds, so this assumes match_id follows the draw order — TODO confirm.
first_round = sorted(
    (m for m in tournament['matches'] if m['round'] == '1st Round'),
    key=lambda m: m['match_id']
)
bracket_init = [(m['player1']['id'], m['player2']['id']) for m in first_round]

# 8) Single-tournament simulation, returning champion, finalists, and final win probability
def simulate_tournament_once():
    """Simulate one knockout run of the bracket and return its outcome.

    Reads the module-level ``bracket_init`` (first-round pairs), ``utils``,
    ``surface``, ``model``, ``global_df`` and ``surface_dfs``. Rounds are played
    until a single pair — the final — remains, so the number of rounds follows
    the bracket size rather than a fixed list of round names.

    Every match, the final included, is resolved the same way:
      * a ``None`` slot is a bye: the other player advances;
      * if ``utils.predict_match`` raises ``KeyError``, the player named in the
        error message is eliminated (player 2 advances when the message names
        neither player);
      * otherwise player 1 wins when ``random.random() < prob_p1``.

    Returns:
        tuple: ``(final_winner, (finalist1, finalist2), final_prob)``.
        ``final_prob`` is the probability the model assigned to the eventual
        winner of the final, or ``1.0`` when the final was decided by a bye or
        by the ``KeyError`` fallback.

    Raises:
        IndexError: if ``bracket_init`` is empty.
    """
    import re  # local import: the cell's import list does not provide `re`

    def _names_player(msg, pid):
        """True when `msg` contains 'Player <pid>' as a whole token."""
        # Word boundaries stop a short id (1134) from matching inside a longer
        # one ("Player 113451"), which a plain substring test would accept.
        return re.search(rf"\bPlayer {re.escape(str(pid))}\b", msg) is not None

    def _play(p1, p2):
        """Resolve one match; return (winner, probability credited to the winner)."""
        if p1 is None:
            return p2, 1.0
        if p2 is None:
            return p1, 1.0
        try:
            prob_p1 = utils.predict_match(p1, p2, surface, model, global_df, surface_dfs)
        except KeyError as e:
            msg = str(e)
            if _names_player(msg, p1):
                return p2, 1.0
            if _names_player(msg, p2):
                return p1, 1.0
            return p2, 1.0  # message names neither player: keep the original default
        if random.random() < prob_p1:
            return p1, prob_p1
        return p2, 1 - prob_p1

    pairs = list(bracket_init)  # copy so the shared bracket is never mutated

    # play rounds until only the final pairing is left
    while len(pairs) > 1:
        winners = [_play(p1, p2)[0] for p1, p2 in pairs]
        # pair consecutive winners; an odd winner out gets a bye (None opponent)
        pairs = [(winners[i], winners[i + 1] if i + 1 < len(winners) else None)
                 for i in range(0, len(winners), 2)]

    # pairs now holds exactly one pair: the finalists
    finalists = pairs[0]
    final_winner, final_prob = _play(*finalists)
    return final_winner, finalists, final_prob

# 9) Monte Carlo loop: tally champions and keep each run's final-match probability
champion_counts = {}
final_probs = {}  # champion_id -> win probabilities of the finals that champion won

for i in range(1, MC_RUNS + 1):
    champion, (f1, f2), prob = simulate_tournament_once()

    # first title for this champion: create both tallies before updating them
    if champion not in champion_counts:
        champion_counts[champion] = 0
    if champion not in final_probs:
        final_probs[champion] = []
    champion_counts[champion] += 1
    final_probs[champion].append(prob)

    # fall back to the raw id when no name is known for a player
    name_champ = id_to_name.get(champion, champion)
    name_f1 = id_to_name.get(f1, f1)
    name_f2 = id_to_name.get(f2, f2)
    print(f"Simulation {i}: Finalists = {name_f1} vs {name_f2}, "
          f"Winner = {name_champ} (win prob {prob:.2%})")

# 10) Aggregated results, most frequent champion first (ties keep first-seen order)
print(f"\nAfter {MC_RUNS} simulations, estimated champion probabilities:")
for pid, count in sorted(champion_counts.items(), key=lambda item: item[1], reverse=True):
    name = id_to_name.get(pid, pid)
    # share of simulations won, and mean probability over the finals this player won
    probability = count / MC_RUNS
    avg_final_prob = sum(final_probs[pid]) / len(final_probs[pid])
    print(f"{name}: {probability:.2%} as champion, "
          f"average final win prob {avg_final_prob:.2%}")

Simulation 1: Finalists = Alcaraz C. vs Norrie C., Winner = Alcaraz C. (win prob 73.71%)
Simulation 2: Finalists = Zverev A. vs Diallo G., Winner = Zverev A. (win prob 60.29%)
Simulation 3: Finalists = Sinner J. vs Shelton B., Winner = Shelton B. (win prob 28.97%)
Simulation 4: Finalists = Michelsen A. vs Cobolli F., Winner = Cobolli F. (win prob 46.49%)
Simulation 5: Finalists = Zverev A. vs Shelton B., Winner = Zverev A. (win prob 49.21%)
Simulation 6: Finalists = Sinner J. vs Etcheverry T., Winner = Etcheverry T. (win prob 20.20%)
Simulation 7: Finalists = Alcaraz C. vs Krueger M., Winner = Krueger M. (win prob 19.85%)
Simulation 8: Finalists = Mensik J. vs Popyrin A., Winner = Popyrin A. (win prob 43.77%)
Simulation 9: Finalists = Mensik J. vs Bublik A., Winner = Bublik A. (win prob 33.99%)
Simulation 10: Finalists = Fils A. vs Rublev A., Winner = Rublev A. (win prob 51.82%)
Simulation 11: Finalists = Kokkinakis T. vs Carreno Busta P., Winner = Kokkinakis T. (win prob 56.35%)
Simul