## Preprocessing Roland-Garros 2025 players and their ATP IDs

In [12]:
import pandas as pd
import json
import unicodedata

# 🔹 Helper to normalize names (drop accents, upper-case, trim whitespace)
def normalize_name(name):
    """Return an accent-free, upper-cased, trimmed version of ``name``.

    Non-string input (e.g. ``None`` or a float NaN) yields an empty string.
    Characters that have no ASCII form after NFKD decomposition are dropped.
    """
    if isinstance(name, str):
        # NFKD splits an accented letter into base letter + combining mark;
        # encoding to ASCII with "ignore" then discards the marks.
        decomposed = unicodedata.normalize("NFKD", name)
        ascii_only = decomposed.encode("ASCII", "ignore").decode("utf-8")
        return ascii_only.upper().strip()
    return ""

# 🔹 Load the match JSON.
# The encoding is explicit so accented player names decode the same way on
# every platform (open() otherwise falls back to the locale's default encoding).
with open("./roland_garros_2025_round1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 🔹 Extract and normalize the player names; the set removes repeated names
matches = data["matches"]
player_names = {
    normalize_name(match[side]["name"])
    for match in matches
    for side in ("player1", "player2")
}

# Sorted so the resulting frame has a deterministic, alphabetical row order
players_df = pd.DataFrame(sorted(player_names), columns=["name"])

# 🔹 Load the ATP reference data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded output shows a warning raised on this
# read_csv call (presumably a mixed-type DtypeWarning — confirm).
atp_players = pd.read_csv("../../../Data/players/atp_players.csv", low_memory=False)
atp_players["name_first"] = atp_players["name_first"].fillna("")
atp_players["name_last"] = atp_players["name_last"].fillna("")
# Join key in "LAST FIRST" order, normalized the same way as the names in players_df
atp_players["full_name"] = (atp_players["name_last"] + " " + atp_players["name_first"]).apply(normalize_name)

# 🔹 Left join so every player from the draw is kept, matched or not.
# NOTE(review): if several ATP entries share the same normalized full name, the
# left join yields one row per entry for that player — verify the result has
# no duplicated names.
merged_df = players_df.merge(
    atp_players[["player_id", "full_name"]],
    left_on="name",
    right_on="full_name",
    how="left"
)

merged_df = merged_df[["name", "player_id"]].rename(columns={"player_id": "id"})

# 🔹 Fill in the IDs that the name join could not find.
# Keys are normalized "LAST FIRST" names, as stored in merged_df["name"].
manual_ids = {
    "AUGER-ALIASSIME FELIX": 200000,
    "BU YUNCHAOKETE": 207352,
    "HERBERT PIERRE-HUGUES": 105732,
    "O'CONNELL CHRISTOPHER": 106331,
    # NOTE(review): 666 looks like a placeholder rather than a real player_id — confirm
    "GOMEZ FEDERICO": 666,
    "RAMOS-VINOLAS ALBERT": 105077,
    "STRUFF JAN-LENNARD": 105526,
}
# Vectorized lookup: map each name to its manual ID (NaN when absent) and use
# it only where the join left the id missing, so IDs found by the join are
# never overwritten.
merged_df["id"] = merged_df["id"].fillna(merged_df["name"].map(manual_ids))

# 🔹 Cast to nullable integer so players still lacking an ID remain <NA>
merged_df["id"] = merged_df["id"].astype("Int64")

# 🔹 Final result: print the full name/ID table without the index column
final_table = merged_df.to_string(index=False)
print("\n🎾 DataFrame final avec noms et IDs :")
print(final_table)

# 🔹 Summary counts: total players, how many have an ID, how many do not
total_players = len(merged_df)
players_with_id = merged_df["id"].count()  # count() ignores missing values
players_without_id = total_players - players_with_id
print("\n📊 Vérification finale :")
print(f"- Total joueurs       : {total_players}")
print(f"- Joueurs avec ID     : {players_with_id}")
print(f"- Joueurs sans ID     : {players_without_id}")



🎾 DataFrame final avec noms et IDs :
                       name     id
             ALCARAZ CARLOS 207989
            ALTMAIER DANIEL 127157
             ARNALDI MATTEO 208286
             ATMANE TERENCE 209279
      AUGER-ALIASSIME FELIX 200000
             BAEZ SEBASTIAN 202104
       BASILASHVILI NIKOLOZ 105932
      BAUTISTA AGUT ROBERTO 105138
            BELLUCCI MATTIA 208233
                BERGS ZIZOU 200267
               BLANCHET UGO 200259
             BONZI BENJAMIN 126127
                BORGES NUNO 132686
            BROOKSBY JENSON 202385
             BU YUNCHAOKETE 207352
           BUBLIK ALEXANDER 122330
        CARRENO BUSTA PABLO 105807
              CAZAUX ARTHUR 209070
        CERUNDOLO FRANCISCO 202103
      CERUNDOLO JUAN MANUEL 207678
                CILIC MARIN 105227
             COBOLLI FLAVIO 207925
         COMESANA FRANCISCO 207681
            DARDERI LUCIANO 209260
DAVIDOVICH FOKINA ALEJANDRO 200221
             DE JONG JESPER 207411
             DE M

  atp_players = pd.read_csv("../../../Data/players/atp_players.csv")


In [14]:
import pandas as pd
import json
import unicodedata

# Name normalizer: strips accents, upper-cases, and trims surrounding whitespace.
# A helper with the same name and behavior is also defined in the earlier cell;
# it is repeated here so this cell can run on its own.
def normalize_name(name):
    """Return ``name`` without accents, in upper case, with outer whitespace removed.

    Anything that is not a ``str`` (``None``, NaN, numbers) maps to ``""``.
    """
    if not isinstance(name, str):
        return ""
    # Decompose accented characters, then keep only the bytes that are ASCII
    ascii_bytes = unicodedata.normalize("NFKD", name).encode("ASCII", "ignore")
    cleaned = ascii_bytes.decode("utf-8")
    return cleaned.upper().strip()

# Load the JSON file containing the match data.
# The encoding is explicit so accented player names decode the same way on
# every platform (open() otherwise falls back to the locale's default encoding).
with open("./roland_garros_2025_round1.json", "r", encoding="utf-8") as f:
    data = json.load(f)
matches = data["matches"]

# Collect the normalized name of both players of every match; the set removes
# repeated names
player_names = {
    normalize_name(match[side]["name"])
    for match in matches
    for side in ("player1", "player2")
}

# One row per distinct player, sorted for a deterministic row order
players_df = pd.DataFrame(sorted(player_names), columns=["name"])

# Load the ATP players file and prepare a normalized full-name column.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded output shows a warning raised on this
# read_csv call (presumably a mixed-type DtypeWarning — confirm).
atp_players = pd.read_csv("../../../Data/players/atp_players.csv", low_memory=False)
atp_players["name_first"] = atp_players["name_first"].fillna("")
atp_players["name_last"] = atp_players["name_last"].fillna("")
# Join key in "LAST FIRST" order, normalized the same way as the names in players_df
atp_players["full_name"] = (atp_players["name_last"] + " " + atp_players["name_first"]).apply(normalize_name)

# Left-join the draw names with the ATP IDs and raw first/last names; the
# helper key "full_name" is dropped afterwards and player_id becomes "id".
# NOTE(review): if several ATP entries share the same normalized full name, the
# left join yields one row per entry for that player — verify the result has
# no duplicated names.
merged_df = players_df.merge(
    atp_players[["player_id", "name_first", "name_last", "full_name"]],
    left_on="name",
    right_on="full_name",
    how="left"
).drop(columns=["full_name"]).rename(columns={"player_id": "id"})

# Manual overrides, keyed by normalized "LAST FIRST" name.
# Each value is a tuple: (player id, first name, last name).
manual_ids = {
    "AUGER-ALIASSIME FELIX": (200000, "FELIX", "AUGER-ALIASSIME"),
    "BU YUNCHAOKETE": (207352, "YUNCHAOKETE", "BU"),
    "HERBERT PIERRE-HUGUES": (105732, "PIERRE-HUGUES", "HERBERT"),
    "O'CONNELL CHRISTOPHER": (106331, "CHRISTOPHER", "O'CONNELL"),
    # NOTE(review): 666 looks like a placeholder rather than a real player_id — confirm
    "GOMEZ FEDERICO": (666, "FEDERICO", "GOMEZ"),
    "RAMOS-VINOLAS ALBERT": (105077, "ALBERT", "RAMOS-VINOLAS"),
    "STRUFF JAN-LENNARD": (105526, "JAN-LENNARD", "STRUFF"),
}

# Write each override into every row whose name equals the key. The values are
# set unconditionally, i.e. they also replace an id/name the merge had found.
# NOTE(review): the override names are upper-case; check that this matches the
# casing of name_first/name_last coming from the ATP file.
for name in manual_ids:
    id_value, first, last = manual_ids[name]
    idx = merged_df["name"].eq(name)
    merged_df.loc[idx, "id"] = id_value
    merged_df.loc[idx, "name_first"] = first
    merged_df.loc[idx, "name_last"] = last

# Final cleanup: store IDs as nullable integers so missing IDs stay <NA>
merged_df["id"] = merged_df["id"].astype("Int64")

# A player matched neither by the merge nor by manual_ids has no id and no
# first/last name. The "name" column is dropped just below, so such rows would
# be exported with nothing identifying them — report them before that happens.
unmatched_names = merged_df.loc[merged_df["id"].isna(), "name"].tolist()
if unmatched_names:
    print(f"Warning: {len(unmatched_names)} player(s) without an ID: {unmatched_names}")

# Keep only the exported columns, under their final names
merged_df = merged_df[["name_last", "name_first", "id"]].rename(columns={
    "name_last": "last_name",
    "name_first": "first_name"
})

# Export the cleaned player data to a CSV file (without the index column)
output_path = "./PLAYERS_ID.csv"
merged_df.to_csv(output_path, index=False)
print(f"File successfully saved to: {output_path}")


File successfully saved to: ./PLAYERS_ID.csv


  atp_players = pd.read_csv("../../../Data/players/atp_players.csv")
