In [1]:
import json
import pandas as pd
from pathlib import Path

# 0) Root folder to scan for results: the notebook's current working directory.
#    The commented-out line below is a hard-coded alternative.
# result_root = Path("Code/3.Prediction")
result_root = Path.cwd()

# 1) Collect every immediate sub-directory whose name starts with
#    "mc_rg_results", then sort the paths so the listing order is stable.
result_dirs = []
for entry in result_root.iterdir():
    if entry.is_dir() and entry.name.startswith("mc_rg_results"):
        result_dirs.append(entry)
result_dirs.sort()

# Fail fast: the loading step has nothing to read without a result directory.
if len(result_dirs) == 0:
    raise FileNotFoundError(f"No mc_rg_results* directories found under {result_root}")

print("Found result directories:")
for found_dir in result_dirs:
    print(" ", found_dir)

# 2) Read every partial_*.json in each result directory and flatten its
#    "stats" list into one row per entry. The file-level "runs" value is
#    copied onto every row that comes from that file.
records = []
for result_dir in result_dirs:
    for json_path in sorted(result_dir.glob("partial_*.json")):
        payload = json.loads(json_path.read_text(encoding="utf-8"))
        runs_in_file = payload.get("runs", 0)
        records.extend(
            {
                "id": entry.get("id"),
                "name": entry.get("name"),
                # Missing counters fall back to 0 so they can still be summed.
                "wins": entry.get("wins", 0),
                "quarterfinals": entry.get("quarterfinals", 0),
                "semifinals": entry.get("semifinals", 0),
                "runs": runs_in_file,
            }
            for entry in payload.get("stats", [])
        )

# 3) Turn the flat list of rows into a DataFrame and report its shape
df = pd.DataFrame(records)
column_names = df.columns.tolist()
print("\nConstructed DataFrame with columns:\n", column_names)
print(f"Total rows: {len(df)}")

# Columns of the final summary table, in display order.
summary_columns = [
    "name",
    "wins",
    "runs",
    "champion_probability",
    "semifinals",
    "sf_rate",
    "quarterfinals",
    "qf_rate",
]

if df.empty:
    print("No records found. Check that your partial_*.json files actually contain a 'stats' list.")
    # Keep `agg` defined (as an empty table with the expected columns) so that
    # code reading `agg` after this point does not fail with a NameError.
    agg = pd.DataFrame(columns=summary_columns)
else:
    # 4) Aggregate: sum the counters (and the run counts) per (id, name) pair
    agg = (
        df.groupby(["id", "name"], as_index=False)
          .agg({
              "wins": "sum",
              "quarterfinals": "sum",
              "semifinals": "sum",
              "runs": "sum"
          })
    )

    # 5) Compute probabilities as counter / summed runs for that (id, name)
    agg["champion_probability"] = agg["wins"] / agg["runs"]
    agg["qf_rate"] = agg["quarterfinals"] / agg["runs"]
    agg["sf_rate"] = agg["semifinals"] / agg["runs"]

    # 6) Sort: most wins first
    agg = agg.sort_values("wins", ascending=False)

    # 7) Keep only the display columns, in display order (drops "id")
    agg = agg[summary_columns]

    # 8) Display the table, rendering the three rate columns as percentages
    display(agg.style.format({
        "champion_probability": "{:.2%}",
        "sf_rate": "{:.2%}",
        "qf_rate": "{:.2%}"
    }))

Found result directories:
  /mnt/netapp2/Store_uni/home/ulc/cursos/curso363/TFM/Data-Analytics-with-HPC/Code/4.Prediction/RG_2025/mc_rg_results

Constructed DataFrame with columns:
 ['id', 'name', 'wins', 'quarterfinals', 'semifinals', 'runs']
Total rows: 86


Unnamed: 0,name,wins,runs,champion_probability,semifinals,sf_rate,quarterfinals,qf_rate
3,DJOKOVIC Novak,16,200,8.00%,35,17.50%,53,26.50%
39,SINNER Jannik,16,200,8.00%,41,20.50%,57,28.50%
46,ALCARAZ Carlos,16,200,8.00%,39,19.50%,59,29.50%
0,ZVEREV Alexander,13,200,6.50%,28,14.00%,47,23.50%
23,TSITSIPAS Stefanos,12,200,6.00%,32,16.00%,41,20.50%
27,RUUD Casper,11,200,5.50%,25,12.50%,40,20.00%
47,RUNE Holger,7,200,3.50%,22,11.00%,39,19.50%
18,RUBLEV Andrey,7,200,3.50%,27,13.50%,41,20.50%
43,MUSETTI Lorenzo,6,200,3.00%,22,11.00%,34,17.00%
34,DE MINAUR Alex,6,200,3.00%,16,8.00%,31,15.50%


In [2]:
# Sanity check: print the total number of wins summed over every row of `agg`.
# NOTE(review): presumably this should equal the number of tournaments run
# (compare against the `runs` column); nothing is asserted here, the value is
# only printed for inspection.
wins_column = agg["wins"]
total_wins = wins_column.sum()
print(f"Sum of wins across all players: {total_wins}")

Sum of wins across all players: 200
