## Roland-Garros Prediction

In [2]:
import json
import pandas as pd
from pathlib import Path
import sys
from importlib.machinery import SourceFileLoader

# === CONFIGURATION ===
# The project root is resolved as the third ancestor of the current working
# directory, so this cell only works when run from a folder at that depth
# (presumably Code/4.Prediction/RG_2025 -- TODO confirm).
HERE = Path.cwd()
PROJECT_ROOT = HERE.parents[2]

JSON_PATH    = PROJECT_ROOT / "Code" / "4.Prediction" / "RG_2025" / "roland_garros_2025_complete_final.json"
PARQUET_PATH = PROJECT_ROOT / "Datasets" / "final_tennis_dataset_symmetric.parquet"
MODEL_PATH   = PROJECT_ROOT / "Models"   / "xgb_model.json"
# Passed straight to utils.get_latest_features_by_surface; presumably only
# data up to this date feeds the player features -- verify in utils.
CUTOFF_DATE  = '2025-05-01'

# === LOAD UTILS ===
# The project helpers live outside any installed package, so their folder is
# put at the front of sys.path before importing them.
utils_folder = PROJECT_ROOT / "Code" / "0.Utils"
sys.path.insert(0, str(utils_folder))
import utils

# === LOAD DATA ===
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    tournament = json.load(f)
surface = tournament['surface']

global_df, surface_dfs = utils.get_latest_features_by_surface(PARQUET_PATH, CUTOFF_DATE)
model = utils.load_trained_model(MODEL_PATH)

# === EVALUATE MATCHES ===
records = []                 # one dict per evaluated match
accuracy_by_round = {}       # round name -> {'total': int, 'correct': int}
skipped_incomplete = 0       # matches lacking a player id or an outcome
skipped_no_prediction = []   # match ids for which predict_match raised KeyError

for match in tournament['matches']:
    match_id = match['match_id']
    round_name = match.get('round', 'Unknown')
    p1_id = match['player1']['id']
    p2_id = match['player2']['id']
    p1_name = match['player1']['name']
    p2_name = match['player2']['name']
    actual_outcome = match['outcome']

    # A match without both player ids and a known result cannot be scored.
    if p1_id is None or p2_id is None or actual_outcome is None:
        skipped_incomplete += 1
        continue

    try:
        prob_p1 = utils.predict_match(p1_id, p2_id, surface, model, global_df, surface_dfs)
    except KeyError:
        # Presumably a player has no feature row in the dataset -- TODO
        # confirm. The match is excluded from the accuracy figures but is
        # remembered so the exclusion shows up in the report.
        skipped_no_prediction.append(match_id)
        continue

    # Ties (exactly 0.5) are resolved in favour of player1.
    predicted = 'player1' if prob_p1 >= 0.5 else 'player2'
    is_correct = int(predicted == actual_outcome)

    records.append({
        'match_id': match_id,
        'round': round_name,
        'player1': p1_name,
        'player2': p2_name,
        'prob_p1': prob_p1,
        'pred': predicted,
        'actual': actual_outcome,
        'correct': is_correct
    })

    stats = accuracy_by_round.setdefault(round_name, {'total': 0, 'correct': 0})
    stats['total'] += 1
    stats['correct'] += is_correct

# === REPORT ===
df_results = pd.DataFrame(records)

if not df_results.empty:
    overall_accuracy = df_results['correct'].mean()
    print(f"\nRoland-Garros 2025 – Evaluated {len(df_results)} matches; overall accuracy: {overall_accuracy:.2%}\n")

    # The per-round breakdown is only meaningful when something was evaluated,
    # so the header is no longer printed on its own for an empty result set.
    print("Accuracy by round:")
    for round_name, stats in accuracy_by_round.items():
        accuracy = stats['correct'] / stats['total']
        print(f"  {round_name}: {accuracy:.2%} ({stats['correct']}/{stats['total']})")
else:
    print("\nNo matches could be evaluated.\n")

# Surface how many matches were left out so the accuracy above is not read as
# covering the whole draw.
skipped_total = skipped_incomplete + len(skipped_no_prediction)
if skipped_total:
    print(
        f"\nSkipped {skipped_total} matches: "
        f"{skipped_incomplete} with a missing player id or outcome, "
        f"{len(skipped_no_prediction)} without a prediction (KeyError)."
    )



Roland-Garros 2025 – Evaluated 108 matches; overall accuracy: 71.30%

Accuracy by round:
  1st Round: 73.77% (45/61)
  2nd Round: 58.06% (18/31)
  3rd Round: 87.50% (14/16)
