In [3]:
import ast
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [4]:
# Season identifier; it is embedded in the CSV folder name ("csv" + season_code).
season_code = "25_26"

# Input and output folders live four levels above the working directory, under
# csv/csv<season_code>/players/.
current_dir = Path.cwd()
path_folder = current_dir.parent.parent.parent.parent.joinpath(
    "csv", f"csv{season_code}", "players", "clean"
)
# Sibling of the "clean" folder that receives the aggregated files.
path_folder_end = path_folder.parent / "centiles"

In [5]:
def _most_frequent(values):
    """Return the most frequent non-null value of ``values``, or None if there is none."""
    modes = values.mode()  # computed once per group instead of twice
    return modes.iloc[0] if not modes.empty else None


def custom_agg(df):
    """Aggregate rows into one row per (Player, Team) pair.

    Args:
        df: DataFrame with at least the columns "Player", "Team", "League",
            "Position", "General Position", "Age" and "Nationality". A
            "Game Week" column, if present, is ignored. The input frame is
            NOT modified.

    Returns:
        A new DataFrame with one row per (Player, Team) where:
          * "League" is the sorted list of distinct non-null leagues,
          * "Position" / "General Position" are the most frequent value,
          * "Age" is the maximum age (nullable Int64),
          * "Nationality" is the first value of the group,
          * every other numeric column is summed and rounded to 2 decimals,
          * "Minutes", when present, is cast to nullable Int64.

    Note:
        ``groupby`` is used with its default ``dropna=True``, so rows whose
        "Player" or "Team" is missing are excluded from the result.
    """
    # Drop on a copy: inplace=True would silently alter the caller's frame
    # and make the notebook cell non-idempotent.
    data = df.drop(columns="Game Week", errors="ignore")

    # Every numeric column except "Age" is treated as an additive statistic.
    stat_columns = [
        col for col in data.select_dtypes(include="number").columns if col != "Age"
    ]

    df_grouped = data.groupby(["Player", "Team"], as_index=False).agg({
        "League": lambda x: sorted(set(x.dropna())),
        "Position": _most_frequent,
        "General Position": _most_frequent,
        "Age": "max",
        "Nationality": "first",
        **{col: "sum" for col in stat_columns},
    })

    df_grouped = df_grouped.round(2)
    df_grouped["Age"] = df_grouped["Age"].astype("Int64")
    # Guarded like in merge_similar_players: not every stats file has "Minutes".
    if "Minutes" in df_grouped.columns:
        df_grouped["Minutes"] = df_grouped["Minutes"].astype("Int64")

    return df_grouped

In [6]:
def _same_or_both_missing(column, value):
    """Mask of rows where ``column`` equals ``value``; a missing value matches missing cells."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return column.isna()
    return (column == value).fillna(False)


def _collect_leagues(entries):
    """Flatten League cells (lists, stringified lists or plain values) into a set of strings."""
    leagues = set()
    for entry in entries:
        if isinstance(entry, list):
            items = entry
        elif isinstance(entry, str) and entry.startswith("["):
            try:
                items = ast.literal_eval(entry)
            except (ValueError, SyntaxError, TypeError):
                # Not a valid Python literal: keep the raw text as a league name.
                items = [entry]
        else:
            items = [entry]
        for item in items:
            # Missing values would break sorting/joining further down.
            if pd.api.types.is_scalar(item) and pd.isna(item):
                continue
            leagues.add(str(item))
    return leagues


def merge_similar_players(df, max_age_diff=1):
    """Merge rows that look like the same player into a single row.

    Rows are considered the same player when they share "Player" and
    "Nationality" and their "Age" differs by at most ``max_age_diff`` from the
    youngest not-yet-merged row of that player. Missing values only match
    other missing values. Each input row contributes to exactly one output row.

    Args:
        df: DataFrame with at least "Player", "Team", "League", "Age" and
            "Nationality" columns. "League" cells may be lists, stringified
            lists or plain values.
        max_age_diff: Maximum age gap tolerated inside one merged group.

    Returns:
        A new DataFrame whose first columns are "Player", "Team" (comma-joined),
        "League" (comma-joined), "Position", "General Position", "Nationality"
        (most frequent value, or None) and "Age" (maximum of the group),
        followed by the remaining input columns: numeric ones are summed,
        others take the first value of the group. Numbers are rounded to
        2 decimals; "Minutes" and "Age" are nullable Int64. An empty input
        yields an empty frame that still carries these columns.
    """
    df = df.sort_values(by=["Player", "Age"]).reset_index(drop=True)

    lead_columns = [
        "Player", "Team", "League", "Position", "General Position", "Nationality", "Age",
    ]
    other_columns = [col for col in df.columns if col not in lead_columns]
    # Loop-invariant: decide once which of the remaining columns are summed.
    numeric_columns = {
        col for col in other_columns if pd.api.types.is_numeric_dtype(df[col])
    }

    # True while a row has not been absorbed into a merged group yet.
    available = pd.Series(True, index=df.index)
    records = []

    for i, row_i in df.iterrows():
        if not available.at[i]:
            continue

        age = row_i["Age"]
        if pd.isna(age):
            age_close = df["Age"].isna()
        else:
            age_close = ((df["Age"] - age).abs() <= max_age_diff).fillna(False)

        # "& available" keeps a row from being summed into two different groups
        # (e.g. ages 20/21/22 previously counted the age-21 row twice).
        mask = (
            _same_or_both_missing(df["Player"], row_i["Player"])
            & _same_or_both_missing(df["Nationality"], row_i["Nationality"])
            & age_close
            & available
        ).astype(bool)
        mask.at[i] = True  # the seed row always belongs to its own group
        available[mask] = False
        subset = df.loc[mask]

        new_row = {
            "Player": row_i["Player"],
            "Team": ", ".join(sorted(set(subset["Team"].dropna()))),
            "League": ", ".join(sorted(_collect_leagues(subset["League"]))),
        }

        for col in ["Position", "General Position", "Nationality"]:
            modes = subset[col].mode() if col in subset.columns else None
            new_row[col] = modes.iloc[0] if modes is not None and not modes.empty else None

        oldest = subset["Age"].max()
        new_row["Age"] = int(oldest) if pd.notna(oldest) else pd.NA

        for col in other_columns:
            if col in numeric_columns:
                new_row[col] = subset[col].sum()
            else:
                new_row[col] = subset[col].iloc[0]

        records.append(new_row)

    # Explicit columns keep the header even when there is nothing to merge.
    df_merged = pd.DataFrame(records, columns=lead_columns + other_columns)
    df_merged = df_merged.round(2)

    if "Minutes" in df_merged.columns:
        df_merged["Minutes"] = df_merged["Minutes"].astype("Int64")
    if "Age" in df_merged.columns:
        df_merged["Age"] = df_merged["Age"].astype("Int64")

    return df_merged


In [7]:
# Aggregate every CSV found in path_folder and write the result, under the same
# file name, into path_folder_end.
os.makedirs(path_folder_end, exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order and the log below reproducible.
for filename in sorted(os.listdir(path_folder)):
    # Filter before touching the disk: non-CSV entries and data_teams.csv are
    # skipped without being read.
    if not filename.endswith(".csv") or filename == "data_teams.csv":
        continue

    path_file = os.path.join(path_folder, filename)
    df = pd.read_csv(path_file)

    # Step 1: one row per (Player, Team); step 2: merge rows of the same player.
    df_grouped = custom_agg(df)
    df_final = merge_similar_players(df_grouped)

    output_path = os.path.join(path_folder_end, filename)
    df_final.to_csv(output_path, index=False)

    print(f"Fichier traité et sauvegardé : {filename}")

Fichier traité et sauvegardé : data_goals.csv
Fichier traité et sauvegardé : data_players.csv
