## AUSTRALIAN OPEN 2025 - PREDICTION

In [10]:
import json
import pandas as pd
from pathlib import Path
import sys
from importlib.machinery import SourceFileLoader

# Make the project's helper module importable: "0.Utils" is looked up inside the
# grandparent (parents[1]) of the current working directory.
utils_folder = Path.cwd().parents[1] / "0.Utils"
# Prepend the folder to sys.path; the import below must run after this line.
sys.path.insert(0, str(utils_folder))
import utils
# NOTE(review): the star import also exposes utils' public names unqualified;
# the cells visible in this notebook call the helpers as utils.<name>.
from utils import *

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report, confusion_matrix

In [9]:
# Path bootstrap: make Code/0.Utils importable, then check which utils module was loaded.

import sys
from pathlib import Path

# 1) Compute the absolute path to Code/0.Utils
#    Path.cwd() is …/Code/3.Prediction/AUS_OPEN_2025
#    .parents[1] steps up two levels to …/Code (parents[0] is …/Code/3.Prediction)
utils_folder = Path.cwd().parents[1] / "0.Utils"

print("→ Inserting into sys.path:", utils_folder)
# Prepend (index 0) so this folder is searched before the other sys.path entries
sys.path.insert(0, str(utils_folder))

# 2) Now import normally
import utils

# 3) (Optional) inspect that you got the right module:
print("utils.py location:", utils.__file__)
# Despite the label, this lists every name in utils that does not start with an
# underscore (constants and imported modules included), not only functions.
print("Available functions:", [fn for fn in dir(utils) if not fn.startswith("_")])


→ Inserting into sys.path: /home/cytech/Desktop/Data Analytics with HPC/Code/0.Utils
utils.py location: /home/cytech/Desktop/Data Analytics with HPC/Code/0.Utils/utils.py
Available functions: ['COLS_TO_EXCLUDE', 'Path', 'SURFACES', 'build_match_row', 'get_latest_features_by_player', 'get_latest_features_by_surface', 'get_player_stats', 'load_trained_model', 'np', 'pd', 'player_name_to_id', 'predict_match', 'random', 're', 'run_monte_carlo', 'xgb']


In [11]:
import shutil
import subprocess

# Report the NVIDIA GPUs visible through nvidia-smi, if that command is on PATH
if shutil.which("nvidia-smi") is not None:
    try:
        # Query one GPU name per line (CSV, no header); nvidia-smi's stderr is discarded
        raw_listing = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            stderr=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError:
        # nvidia-smi exists but exited with a non-zero status
        print("Erreur lors de l'appel à nvidia-smi.")
    else:
        gpus = raw_listing.decode().strip().split("\n")
        print(f"{len(gpus)} GPU(s) détectée(s) :")
        for gpu_index, gpu_name in enumerate(gpus):
            print(f"  GPU {gpu_index} : {gpu_name}")
else:
    # The executable was not found on PATH
    print("nvidia-smi non trouvé : pas de GPU NVIDIA détectée ou pilote non installé.")


nvidia-smi non trouvé : pas de GPU NVIDIA détectée ou pilote non installé.


In [16]:
import json
import pandas as pd
from pathlib import Path
import utils  # ensure utils.py is on your PYTHONPATH

# 1) Configuration: project root (three levels above the working directory),
#    input files, and the cutoff date handed to the feature loader
HERE = Path.cwd()
PROJECT_ROOT = HERE.parents[2]

JSON_PATH    = PROJECT_ROOT.joinpath("Datasets", "aus_open_2025_matches_all_ids.json")
PARQUET_PATH = PROJECT_ROOT.joinpath("Datasets", "final_tennis_dataset_symmetric.parquet")
MODEL_PATH   = PROJECT_ROOT.joinpath("Models", "xgb_model.json")
CUTOFF_DATE  = '2025-01-01'

# 2) Read the tournament description (surface + list of matches) from JSON
with open(JSON_PATH, 'r', encoding='utf-8') as draw_file:
    tournament = json.load(draw_file)
surface = tournament['surface']

# 3) Load the feature snapshots for the cutoff date and the trained model
global_df, surface_dfs = utils.get_latest_features_by_surface(PARQUET_PATH, CUTOFF_DATE)
model = utils.load_trained_model(MODEL_PATH)

# 4) Score every match that can be evaluated
records = []              # one dict per evaluated match (becomes a DataFrame row)
accuracy_by_round = {}    # round name -> {'total': evaluated matches, 'correct': correct picks}

for match in tournament['matches']:
    match_id = match['match_id']
    round_name = match['round']
    side1 = match['player1']
    side2 = match['player2']
    p1_id, p2_id = side1['id'], side2['id']
    p1_name, p2_name = side1['name'], side2['name']
    actual_outcome = match['outcome']

    # a) Nothing to evaluate when either player ID or the real outcome is missing
    if any(value is None for value in (p1_id, p2_id, actual_outcome)):
        continue

    # b) A KeyError from predict_match means the match is skipped
    try:
        prob_p1 = utils.predict_match(p1_id, p2_id, surface, model, global_df, surface_dfs)
    except KeyError:
        continue

    # c) The model picks player1 when its probability is at least 0.5
    if prob_p1 >= 0.5:
        predicted = 'player1'
    else:
        predicted = 'player2'
    is_correct = int(predicted == actual_outcome)

    # d) Keep the per-match row
    records.append({
        'match_id': match_id,
        'round': round_name,
        'player1': p1_name,
        'player2': p2_name,
        'prob_p1': prob_p1,
        'pred': predicted,
        'actual': actual_outcome,
        'correct': is_correct,
    })

    # e) Update the running tally of this round
    if round_name not in accuracy_by_round:
        accuracy_by_round[round_name] = {'total': 0, 'correct': 0}
    accuracy_by_round[round_name]['total'] += 1
    accuracy_by_round[round_name]['correct'] += is_correct

# 5) One row per evaluated match
df_results = pd.DataFrame(records)

# 6) Print the overall accuracy, then the accuracy of each round
total_matches = len(df_results)
if total_matches == 0:
    print("\nNo matches could be evaluated.\n")
else:
    overall_accuracy = df_results['correct'].mean()
    print(f"\nEvaluated {total_matches} matches; overall accuracy: {overall_accuracy:.2%}\n")

print("Accuracy by round:")
for round_name, stats in accuracy_by_round.items():
    n_correct = stats['correct']
    n_total = stats['total']
    print(f"  {round_name}: {n_correct / n_total:.2%} ({n_correct}/{n_total})")


Evaluated 107 matches; overall accuracy: 70.09%

Accuracy by round:
  1st Round: 69.81% (37/53)
  2nd Round: 62.96% (17/27)
  3rd Round: 78.57% (11/14)
  4th Round: 83.33% (5/6)
  Quarterfinals: 75.00% (3/4)
  Semifinals: 50.00% (1/2)
  The Final: 100.00% (1/1)


## Details for Quarterfinals, Semifinals and Final

In [17]:
# Rounds whose evaluated matches are printed one by one
detail_rounds = ['Quarterfinals', 'Semifinals', 'The Final']
detail_columns = ['match_id','player1','player2','prob_p1','pred','actual','correct']

for round_name in detail_rounds:
    in_round = df_results['round'] == round_name
    if in_round.any():
        # Plain-text table of this round's rows, without the DataFrame index
        print(f"\n=== {round_name} ===")
        round_table = df_results.loc[in_round, detail_columns]
        print(round_table.to_string(index=False))
    else:
        print(f"\nNo matches evaluated for {round_name}\n")


=== Quarterfinals ===
 match_id     player1      player2  prob_p1    pred  actual  correct
      111     Paul T.    Zverev A. 0.383772 player2 player2        1
      112 Djokovic N.   Alcaraz C. 0.467081 player2 player1        0
      113   Sonego L.   Shelton B. 0.311989 player2 player2        1
      114   Sinner J. De Minaur A. 0.714063 player1 player1        1

=== Semifinals ===
 match_id     player1   player2  prob_p1    pred  actual  correct
      115  Shelton B. Sinner J. 0.327199 player2 player2        1
      116 Djokovic N. Zverev A. 0.563557 player1 player2        0

=== The Final ===
 match_id   player1   player2  prob_p1    pred  actual  correct
      117 Sinner J. Zverev A. 0.742085 player1 player1        1


---

## Monte Carlo simulation

In [None]:
import json
import random
from pathlib import Path
from importlib.machinery import SourceFileLoader


# 2) Configuration: project root (three levels above the working directory) and inputs
HERE = Path.cwd()
PROJECT_ROOT = HERE.parents[2]

JSON_PATH    = PROJECT_ROOT.joinpath("Datasets", "aus_open_2025_matches_all_ids.json")
PARQUET_PATH = PROJECT_ROOT.joinpath("Datasets", "final_tennis_dataset_symmetric.parquet")
MODEL_PATH   = PROJECT_ROOT.joinpath("Models", "xgb_model.json")
CUTOFF_DATE  = '2025-01-01'

MC_RUNS      = 50  # how many full tournaments the Monte Carlo loop simulates

# 4) Read the tournament description (surface + list of matches) from JSON
tournament = json.loads(JSON_PATH.read_text(encoding='utf-8'))
surface = tournament['surface']

# 5) Map every known player ID to a display name (a later match overwrites an earlier entry)
id_to_name = {}
for match in tournament['matches']:
    for side in ("player1", "player2"):
        player = match[side]
        pid, name = player["id"], player["name"]
        if pid is None:
            continue
        id_to_name[pid] = name

# 6) Load the feature snapshots for the cutoff date and the trained model
global_df, surface_dfs = utils.get_latest_features_by_surface(PARQUET_PATH, CUTOFF_DATE)
model = utils.load_trained_model(MODEL_PATH)

# 7) First-round bracket: (player1_id, player2_id) tuples ordered by match_id
first_round = [m for m in tournament['matches'] if m['round'] == '1st Round']
first_round.sort(key=lambda m: m['match_id'])
bracket_init = []
for m in first_round:
    bracket_init.append((m['player1']['id'], m['player2']['id']))

# 8) Single-tournament simulation, returning champion, finalists, and final win probability
def simulate_tournament_once():
    """Simulate one complete knockout run of the bracket stored in ``bracket_init``.

    Every pair of a round is decided by ``_play_match``. The winners are then
    paired in order (1st with 2nd, 3rd with 4th, ...); when the number of
    winners is odd the last one is paired with ``None`` (a bye). Rounds are
    played until a single pair - the final - remains, so the result no longer
    depends on the bracket holding a fixed number of first-round matches.

    Reads the module-level ``bracket_init``, ``surface``, ``model``,
    ``global_df`` and ``surface_dfs``, and draws from the global ``random``
    state (one draw per match that gets a model prediction).

    Returns:
        tuple: ``(final_winner, finalists, final_prob)``. ``finalists`` is the
        ``(p1, p2)`` pair of the final. ``final_prob`` is the probability the
        model gave to the player who won the final, or ``1.0`` when the final
        was decided without a prediction (bye or KeyError fallback).

    Raises:
        IndexError: if ``bracket_init`` is empty.
    """
    pairs = list(bracket_init)  # work on a copy of the initial bracket

    # Play rounds until only the final pairing is left.
    while len(pairs) > 1:
        winners = [_play_match(p1, p2)[0] for p1, p2 in pairs]
        # pair winners for the next round; an odd winner out gets a bye
        pairs = [(winners[i], winners[i + 1] if i + 1 < len(winners) else None)
                 for i in range(0, len(winners), 2)]

    # pairs now holds exactly one pair: the final
    p1, p2 = pairs[0]
    finalists = (p1, p2)
    # The final uses the same rules (bye / KeyError fallback) as every other round.
    final_winner, final_prob = _play_match(p1, p2)
    return final_winner, finalists, final_prob


def _play_match(p1, p2):
    """Decide one match; return ``(winner, probability credited to the winner)``.

    A ``None`` opponent is a bye: the other side advances with probability 1.0.
    Otherwise the winner is drawn from ``utils.predict_match``'s probability
    for ``p1``. If that call raises ``KeyError`` the winner comes from
    ``_winner_without_features`` and the probability is reported as 1.0.
    """
    if p1 is None:
        return p2, 1.0
    if p2 is None:
        return p1, 1.0
    try:
        prob_p1 = utils.predict_match(p1, p2, surface, model, global_df, surface_dfs)
    except KeyError as err:
        return _winner_without_features(err, p1, p2), 1.0
    if random.random() < prob_p1:
        return p1, prob_p1
    return p2, 1 - prob_p1


def _winner_without_features(err, p1, p2):
    """Pick the winner when the prediction failed: the player named in the error loses.

    Assumes the KeyError text contains "Player <id>" for the player that could
    not be looked up - TODO confirm against utils.predict_match. If neither ID
    is recognised, ``p2`` advances.
    """
    msg = str(err)
    if _mentions_player(msg, p1):
        return p2
    if _mentions_player(msg, p2):
        return p1
    return p2


def _mentions_player(msg, pid):
    """Return True when ``msg`` contains "Player <pid>" as a complete ID.

    The negative lookahead rejects a longer ID that merely starts with
    ``pid`` (e.g. "Player 123" does not count as a mention of player 12).
    """
    import re  # local import: only this helper needs it

    return re.search(rf"Player {re.escape(str(pid))}(?!\w)", msg) is not None

# 9) Monte Carlo loop: tally champions and keep the final-match probability of each title
champion_counts = {}
final_probs = {}  # champion_id -> list of win probabilities from the finals that player won

for i in range(1, MC_RUNS + 1):
    champion, finalists, prob = simulate_tournament_once()
    f1, f2 = finalists

    if champion in champion_counts:
        champion_counts[champion] += 1
    else:
        champion_counts[champion] = 1
    if champion not in final_probs:
        final_probs[champion] = []
    final_probs[champion].append(prob)

    # Fall back to the raw ID when no display name is known
    name_champ = id_to_name.get(champion, champion)
    name_f1 = id_to_name.get(f1, f1)
    name_f2 = id_to_name.get(f2, f2)
    print(f"Simulation {i}: Finalists = {name_f1} vs {name_f2}, "
          f"Winner = {name_champ} (win prob {prob:.2%})")

# 10) Summary, most frequent champion first (ties keep first-seen order)
print(f"\nAfter {MC_RUNS} simulations, estimated champion probabilities:")
ranking = sorted(champion_counts.items(), key=lambda item: item[1], reverse=True)
for pid, count in ranking:
    name = id_to_name.get(pid, pid)
    probability = count / MC_RUNS
    won_finals = final_probs[pid]
    avg_final_prob = sum(won_finals) / len(won_finals)
    print(f"{name}: {probability:.2%} as champion, "
          f"average final win prob {avg_final_prob:.2%}")

Simulation 1: Finalists = Mensik J. vs Nakashima B., Winner = Mensik J. (win prob 52.91%)
Simulation 2: Finalists = Sinner J. vs Rublev A., Winner = Sinner J. (win prob 70.21%)
Simulation 3: Finalists = Sinner J. vs Monfils G., Winner = Sinner J. (win prob 80.06%)
Simulation 4: Finalists = Fils A. vs Norrie C., Winner = Fils A. (win prob 52.94%)
Simulation 5: Finalists = Alcaraz C. vs Musetti L., Winner = Musetti L. (win prob 29.99%)
Simulation 6: Finalists = Sinner J. vs Shapovalov D., Winner = Sinner J. (win prob 79.89%)
Simulation 7: Finalists = Auger-Aliassime F. vs Sonego L., Winner = Auger-Aliassime F. (win prob 58.09%)
Simulation 8: Finalists = Lehecka J. vs Popyrin A., Winner = Popyrin A. (win prob 52.83%)
Simulation 9: Finalists = Sinner J. vs Rublev A., Winner = Sinner J. (win prob 70.21%)
Simulation 10: Finalists = Auger-Aliassime F. vs Rublev A., Winner = Auger-Aliassime F. (win prob 40.29%)
Simulation 11: Finalists = Korda S. vs Popyrin A., Winner = Popyrin A. (win prob 46

## PARALLELIZATION