## Roland-Garros Prediction

In [8]:
import json
import pandas as pd
from pathlib import Path
import sys
from importlib.machinery import SourceFileLoader

# === CONFIGURATION ===
# Every path is resolved relative to the project root, taken to be three
# directory levels above the current working directory.
HERE = Path.cwd()
PROJECT_ROOT = HERE.parents[2]

JSON_PATH    = PROJECT_ROOT.joinpath("Code", "4.Prediction", "RG_2025", "roland_garros_2025_complete_final.json")
PARQUET_PATH = PROJECT_ROOT.joinpath("Datasets", "final_tennis_dataset_symmetric.parquet")
MODEL_PATH   = PROJECT_ROOT.joinpath("Models", "xgb_model.json")
CUTOFF_DATE  = '2025-05-01'

# === LOAD UTILS ===
# The helper module lives outside any package, so its folder is pushed to the
# front of sys.path before the import.
utils_folder = PROJECT_ROOT.joinpath("Code", "0.Utils")
sys.path.insert(0, str(utils_folder))
import utils

# === LOAD DATA ===
# Tournament description (surface + list of matches) comes from the JSON file.
with JSON_PATH.open(encoding='utf-8') as f:
    tournament = json.load(f)
surface = tournament['surface']

# Feature tables and the trained model are provided by the project helpers;
# presumably CUTOFF_DATE limits the features to data before that date
# (TODO confirm against utils.get_latest_features_by_surface).
global_df, surface_dfs = utils.get_latest_features_by_surface(PARQUET_PATH, CUTOFF_DATE)
model = utils.load_trained_model(MODEL_PATH)

# === EVALUATE MATCHES ===
# `records` holds one dict per match the model actually scored;
# `accuracy_by_round` maps round name -> {'total': n, 'correct': k}.
records = []
accuracy_by_round = {}
# IDs of matches for which no prediction could be produced. They are kept out
# of every accuracy figure so that failures never count as hits.
unpredicted_matches = []

for match in tournament['matches']:
    match_id = match['match_id']
    round_name = match.get('round', 'Unknown')
    p1_id = match['player1']['id']
    p2_id = match['player2']['id']
    p1_name = match['player1']['name']
    p2_name = match['player2']['name']
    actual_outcome = match['outcome']

    # A match without both player ids and a known outcome cannot be evaluated.
    if p1_id is None or p2_id is None or actual_outcome is None:
        print(f"Skipping match {match_id} due to missing id or outcome")
        continue

    # Only the prediction call is guarded: a KeyError here (presumably a
    # player absent from the feature tables -- TODO confirm in utils) means
    # the model has no opinion on this match.
    try:
        prob_p1 = utils.predict_match(p1_id, p2_id, surface, model, global_df, surface_dfs)
    except KeyError as e:
        # Treating the actual winner as the prediction would inflate the
        # reported accuracy, so the match is excluded instead.
        print(f"[Warning] Match {match_id}: no prediction available, excluded from accuracy (KeyError: {e})")
        unpredicted_matches.append(match_id)
        continue

    # Ties at exactly 0.5 go to player1.
    predicted = 'player1' if prob_p1 >= 0.5 else 'player2'
    is_correct = int(predicted == actual_outcome)

    records.append({
        'match_id': match_id,
        'round': round_name,
        'player1': p1_name,
        'player2': p2_name,
        'prob_p1': prob_p1,
        'pred': predicted,
        'actual': actual_outcome,
        'correct': is_correct
    })

    stats = accuracy_by_round.setdefault(round_name, {'total': 0, 'correct': 0})
    stats['total'] += 1
    stats['correct'] += is_correct

if unpredicted_matches:
    print(f"{len(unpredicted_matches)} match(es) could not be predicted and were excluded from the accuracy figures.")

# === REPORT ===
# Tabulate the per-match records so the overall accuracy is a simple column mean.
df_results = pd.DataFrame(records)

if not df_results.empty:
    overall_accuracy = df_results['correct'].mean()
    print(f"\nRoland-Garros 2025 – Evaluated {len(df_results)} matches; overall accuracy: {overall_accuracy:.2%}\n")
else:
    print("\nNo matches could be evaluated.\n")

print("Accuracy by round:")
for round_name, stats in accuracy_by_round.items():
    accuracy = stats['correct'] / stats['total']
    print(f"  {round_name}: {accuracy:.2%} ({stats['correct']}/{stats['total']})")

# === DETAIL: from Quarterfinals on ===
print("\nDetailed predictions from Quarterfinals onwards:")
important_rounds = {"Quarterfinal", "Semifinal", "Final"}

for _, row in df_results.iterrows():
    if row['round'] in important_rounds:
        print(f"\nMatch ID {row['match_id']} – {row['round']}")
        print(f"  {row['player1']} vs {row['player2']}")
        print(f"  Predicted winner: {row['player1'] if row['pred']=='player1' else row['player2']}")
        print(f"  Actual winner:    {row['player1'] if row['actual']=='player1' else row['player2']}")
        print(f"  Correct: {bool(row['correct'])}")
        # A missing probability stored as None comes back from a numeric
        # DataFrame column as NaN, so an `is not None` test would let it
        # through and print "nan%". pd.notna rejects both None and NaN.
        if pd.notna(row['prob_p1']):
            print(f"  Prob Player 1 wins: {row['prob_p1']:.2%}")


Roland-Garros 2025 – Evaluated 127 matches; overall accuracy: 73.23%

Accuracy by round:
  1st Round: 75.00% (48/64)
  2nd Round: 59.38% (19/32)
  3rd Round: 87.50% (14/16)
  4th Round: 75.00% (6/8)
  Quarterfinal: 100.00% (4/4)
  Semifinal: 50.00% (1/2)
  Final: 100.00% (1/1)

Detailed predictions from Quarterfinals onwards:

Match ID 121 – Quarterfinal
  SINNER Jannik vs BUBLIK Alexander
  Predicted winner: SINNER Jannik
  Actual winner:    SINNER Jannik
  Correct: True
  Prob Player 1 wins: 61.04%

Match ID 122 – Quarterfinal
  ZVEREV Alexander vs DJOKOVIC Novak
  Predicted winner: DJOKOVIC Novak
  Actual winner:    DJOKOVIC Novak
  Correct: True
  Prob Player 1 wins: 39.11%

Match ID 123 – Quarterfinal
  MUSETTI Lorenzo vs TIAFOE Frances
  Predicted winner: TIAFOE Frances
  Actual winner:    TIAFOE Frances
  Correct: True
  Prob Player 1 wins: 43.99%

Match ID 124 – Quarterfinal
  PAUL Tommy vs ALCARAZ Carlos
  Predicted winner: ALCARAZ Carlos
  Actual winner:    ALCARAZ Carlos
  