In [None]:
import json
import pandas as pd
from pathlib import Path

# 0) Root folder that holds the mc_aus_results* directories.
#    Uses the current working directory; switch to the commented-out path
#    below if the notebook is launched from a different location.
# result_root = Path("Code/3.Prediction")
result_root = Path.cwd()

# 1) Discover all result dirs whose name starts with "mc_aus_results"
result_dirs = sorted(
    d for d in result_root.iterdir()
    if d.is_dir() and d.name.startswith("mc_aus_results")
)

if not result_dirs:
    raise FileNotFoundError(f"No mc_aus_results* directories found under {result_root}")

print("Found result directories:")
for d in result_dirs:
    print(" ", d)

# 2) Load all partial_*.json files: one record per (file, player) pair.
#    Every record carries the `runs` value of the file it came from.
records = []
for d in result_dirs:
    for fn in sorted(d.glob("partial_*.json")):
        data = json.loads(fn.read_text(encoding="utf-8"))
        runs = data.get("runs", 0)
        for stat in data.get("stats", []):
            records.append({
                "id": stat.get("id"),
                "name": stat.get("name"),
                "wins": stat.get("wins", 0),
                "quarterfinals": stat.get("quarterfinals", 0),
                "semifinals": stat.get("semifinals", 0),
                "runs": runs
            })

# 3) Build DataFrame
df = pd.DataFrame(records)
print("\nConstructed DataFrame with columns:\n", df.columns.tolist())
print(f"Total rows: {len(df)}")

# Column order of the final summary table and the percent formatting of its rates.
SUMMARY_COLUMNS = [
    "name",
    "wins",
    "runs",
    "champion_probability",
    "semifinals",
    "sf_rate",
    "quarterfinals",
    "qf_rate"
]
RATE_FORMATS = {
    "champion_probability": "{:.2%}",
    "sf_rate": "{:.2%}",
    "qf_rate": "{:.2%}"
}

if df.empty:
    print("No records found. Check that your partial_*.json files actually contain a 'stats' list.")
    # Keep `agg` bound (as an empty summary) so that neither this cell nor
    # later cells that read `agg` fail with a NameError.
    agg = pd.DataFrame(columns=SUMMARY_COLUMNS)
else:
    # 4) Aggregate the per-file counters into one row per player
    agg = (
        df.groupby(["id", "name"], as_index=False)
          .agg({
              "wins": "sum",
              "quarterfinals": "sum",
              "semifinals": "sum",
              "runs": "sum"
          })
    )

    # 5) Compute probabilities
    # NOTE(review): `runs` is summed per player, i.e. only over the files whose
    # "stats" list contains that player. A player missing from some file gets a
    # smaller denominator than the others — confirm this is the intended
    # definition rather than dividing by the total number of runs.
    agg["champion_probability"] = agg["wins"] / agg["runs"]
    agg["qf_rate"] = agg["quarterfinals"] / agg["runs"]
    agg["sf_rate"] = agg["semifinals"] / agg["runs"]

    # 6) Sort: most titles first
    agg = agg.sort_values("wins", ascending=False)

    # 7) Keep only the reporting columns, in display order
    agg = agg[SUMMARY_COLUMNS]

    # 8) Display the summary table with rates rendered as percentages
    display(agg.style.format(RATE_FORMATS))

Found result directories:
  /mnt/netapp2/Store_uni/home/ulc/cursos/curso363/TFM/Data-Analytics-with-HPC/Code/3.Prediction/mc_aus_results
  /mnt/netapp2/Store_uni/home/ulc/cursos/curso363/TFM/Data-Analytics-with-HPC/Code/3.Prediction/mc_aus_results2
  /mnt/netapp2/Store_uni/home/ulc/cursos/curso363/TFM/Data-Analytics-with-HPC/Code/3.Prediction/mc_aus_results3

Constructed DataFrame with columns:
 ['id', 'name', 'wins', 'quarterfinals', 'semifinals', 'runs']
Total rows: 1044


Unnamed: 0,name,wins,runs,champion_probability,semifinals,sf_rate,quarterfinals,qf_rate
66,Sinner J.,843,5000,16.86%,1562,31.24%,2410,48.20%
75,Alcaraz C.,468,5000,9.36%,900,18.00%,1385,27.70%
30,Rublev A.,338,5000,6.76%,1260,25.20%,2047,40.94%
15,Medvedev D.,233,5000,4.66%,753,15.06%,1126,22.52%
56,De Minaur A.,218,5000,4.36%,881,17.62%,1522,30.44%
61,Korda S.,181,5000,3.62%,583,11.66%,1059,21.18%
34,Tiafoe F.,162,4875,3.32%,483,9.91%,716,14.69%
3,Djokovic N.,150,5000,3.00%,378,7.56%,616,12.32%
89,Shelton B.,146,5000,2.92%,518,10.36%,886,17.72%
77,Rune H.,131,5000,2.62%,491,9.82%,767,15.34%


In [5]:
# Sanity check: the wins summed over every player should equal the total
# number of tournaments that were run. Nothing is asserted here; the value is
# only printed for comparison by eye.
total_wins = agg.loc[:, "wins"].sum()
print(f"Sum of wins across all players: {total_wins}")

Sum of wins across all players: 5000
