## Preprocessing RG 2025 PLAYERS and IDs

In [3]:
import pandas as pd
import json
import unicodedata

# Step 1: Normalize names (remove accents, upper case)
def normalize_name(name):
    if not isinstance(name, str):
        return ""
    name = unicodedata.normalize("NFKD", name).encode("ASCII", "ignore").decode("utf-8")
    return name.upper().strip()

# Step 2: Load match data
with open("./roland_garros_2025_round1.json", "r") as f:
    data = json.load(f)
matches = data["matches"]

# Step 3: Extract all unique player names from matches
player_names = set()
for match in matches:
    player_names.add(normalize_name(match["player1"]["name"]))
    player_names.add(normalize_name(match["player2"]["name"]))
players_df = pd.DataFrame(sorted(player_names), columns=["name"])

# Step 4: Load ATP base
atp_players = pd.read_csv("../../../Data/players/atp_players.csv")
atp_players["name_first"] = atp_players["name_first"].fillna("")
atp_players["name_last"] = atp_players["name_last"].fillna("")
atp_players["full_name"] = (atp_players["name_last"] + " " + atp_players["name_first"])
atp_players["full_name"] = atp_players["full_name"].apply(normalize_name)

# Step 5: Manual players to add (with consistent formatting)
manual_ids = {
    "Auger-Aliassime Felix": (200000, "Felix", "Auger-Aliassime"),
    "Bu Yunchaokete": (207352, "Yunchaokete", "Bu"),
    "Herbert Pierre-Hugues": (105732, "Pierre-Hugues", "Herbert"),
    "O'Connell Christopher": (106331, "Christopher", "O'Connell"),
    "Gomez Federico": (666, "Federico", "Gomez"),
    "Ramos-Vinolas Albert": (105077, "Albert", "Ramos-Vinolas"),
    "Struff Jan-Lennard": (105526, "Jan-Lennard", "Struff"),
}

manual_df = pd.DataFrame([
    {"player_id": id_, "name_first": first, "name_last": last}
    for _, (id_, first, last) in manual_ids.items()
])
manual_df["full_name"] = (manual_df["name_last"] + " " + manual_df["name_first"]).apply(normalize_name)

# Step 6: Combine ATP and manual players
all_players = pd.concat([atp_players[["player_id", "name_first", "name_last", "full_name"]],
                         manual_df[["player_id", "name_first", "name_last", "full_name"]]],
                        ignore_index=True)

# Step 7: Merge with extracted names
merged_df = players_df.merge(
    all_players,
    left_on="name",
    right_on="full_name",
    how="left"
).drop(columns=["full_name"]).rename(columns={"player_id": "id"})

# Step 8: Final cleanup
merged_df["id"] = merged_df["id"].astype("Int64")
merged_df = merged_df[["name_last", "name_first", "id"]].rename(columns={
    "name_last": "last_name",
    "name_first": "first_name"
})

# Step 9: Save
output_path = "./PLAYERS_ID.csv"
merged_df.to_csv(output_path, index=False)
print(f"File successfully saved to: {output_path}")


File successfully saved to: ./PLAYERS_ID.csv


  atp_players = pd.read_csv("../../../Data/players/atp_players.csv")
