In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Season identifier; it is embedded in the name of the per-season CSV folder.
season_code = "25_26"

# Current working directory; every path below is derived from it.
current_dir = Path.cwd()

# <four levels above the working directory>/csv/csv<season_code>/players/centiles
path_folder = current_dir.parent.parent.parent.parent.joinpath(
    "csv", f"csv{season_code}", "players", "centiles"
)

In [3]:
def add_derived_columns(df):
    """Return an independent copy of ``df``.

    This is currently a placeholder: no derived column is added. The caller's
    frame is never modified, so anything done to the result stays local.

    Args:
        df: input DataFrame.

    Returns:
        A copy of ``df`` with the same columns and values.
    """
    return df.copy()

In [4]:
def add_parameters(df):
    """Append four ratio columns computed from existing stat columns.

    Columns written (overwritten if they already exist):
      * ``% Saves``           = 100 * Saves / Shots on Target Against
      * ``% Long Passes``     = 100 * Completed Long Passes / Attempted Long Passes
      * ``% Crosses Stopped`` = 100 * Crosses Stopped / Crosses Faced
      * ``PSxG/Save``         = % Saves * Post-Shot Expected Goals (PSxG)
                                / Shots on Target Against

    A missing numerator column is treated as 0 and a missing denominator
    column as 1. A denominator column that is present but holds 0 is NOT
    guarded: the result for that row is NaN or +/-inf.

    Args:
        df: DataFrame with the stat columns above. Not modified.

    Returns:
        A copy of ``df`` with the four ratio columns set.
    """
    enriched = df.copy()

    # target column -> (numerator column, denominator column)
    percentage_specs = {
        "% Saves": ("Saves", "Shots on Target Against"),
        "% Long Passes": ("Completed Long Passes", "Attempted Long Passes"),
        "% Crosses Stopped": ("Crosses Stopped", "Crosses Faced"),
    }
    for target, (numerator, denominator) in percentage_specs.items():
        enriched[target] = 100 * enriched.get(numerator, 0) / enriched.get(denominator, 1)

    # Uses the "% Saves" column written just above.
    save_pct = enriched.get("% Saves")
    psxg = enriched.get("Post-Shot Expected Goals (PSxG)", 0)
    shots_faced = enriched.get("Shots on Target Against", 1)
    enriched["PSxG/Save"] = save_pct * psxg / shots_faced

    return enriched

In [5]:
def aggregated_data(data):
    """Collapse the input rows into one row per (Player, Age) pair.

    For each group:
      * descriptive columns (those of ``static_cols`` present in the data)
        keep their first value,
      * every other numeric column is summed,
      * ``Matches`` holds the number of rows in the group (each input row is
        counted as one match).

    Args:
        data: DataFrame that must contain ``Player`` and ``Age`` columns.
            Not modified.

    Returns:
        The aggregated DataFrame with a default integer index.
    """
    matches = add_derived_columns(data)

    static_cols = ['Nationality', 'Team', 'League', 'Position', 'General Position', 'Age']

    # Collected before 'Matches' is (re)written so a fresh counter column is
    # not picked up as a stat to sum.
    numeric_cols = [
        name
        for name in matches.columns
        if name not in static_cols and pd.api.types.is_numeric_dtype(matches[name])
    ]

    matches['Matches'] = 1

    # Insertion order defines the output column order:
    # descriptive columns, summed stats, then the match counter.
    agg_plan = {}
    for name in static_cols:
        if name in matches.columns:
            agg_plan[name] = 'first'
    for name in numeric_cols:
        agg_plan[name] = 'sum'
    agg_plan['Matches'] = 'count'

    return matches.groupby(['Player', 'Age'], as_index=False).agg(agg_plan)

In [6]:
def adjusted_data(df_aggregated):
    """Scale aggregated totals to per-90-minute values and append ratio columns.

    Every numeric column except those listed in ``exclude_cols`` is multiplied
    by ``90 / Minutes``. Rows with 0 minutes cannot be scaled, so their per-90
    values end up as 0. The ratio columns are then added by ``add_parameters``
    and the whole frame is rounded to 2 decimals.

    Rounding is applied only once, at the very end. Rounding the per-90 values
    before deriving ratios from them would feed truncated inputs into the
    ratios (e.g. 3.33 / 4.67 instead of 3.3333 / 4.6667) and distort them.
    The per-90 columns themselves are unaffected by this ordering.

    Args:
        df_aggregated: DataFrame with a numeric ``Minutes`` column plus the
            stat totals. Not modified.

    Returns:
        A new DataFrame with per-90 stats, integer ``Minutes`` and the columns
        added by ``add_parameters``, rounded to 2 decimals.

    Raises:
        KeyError: if ``df_aggregated`` has no ``Minutes`` column.
    """
    df_per90 = df_aggregated.copy()

    # 0 minutes -> NaN so the division yields NaN instead of +/-inf.
    minutes = df_per90['Minutes'].replace(0, np.nan)

    exclude_cols = ['Nationality', 'Team', 'League', 'Position', 'General Position', 'Age', 'Minutes', 'Matches']

    per90_cols = [
        col
        for col in df_per90.columns
        if col not in exclude_cols and pd.api.types.is_numeric_dtype(df_per90[col])
    ]

    if per90_cols:
        # Keep full precision here; rounding is deferred to the final step.
        df_per90[per90_cols] = df_per90[per90_cols].mul(90).div(minutes, axis=0)

    df_per90['Minutes'] = minutes.fillna(0).round(0).astype(int)

    # NaN produced by the 0-minute rows (and any pre-existing NaN) become 0.
    df_per90 = df_per90.fillna(0)

    return add_parameters(df_per90).round(2)

In [7]:
def centiles_data(df_aggregated):
    """Replace every numeric stat column by its integer percentile rank (0-100).

    Each stat column is ranked across all rows (ties share the average rank),
    expressed as a percentage and truncated to an integer. Columns listed in
    ``negative_stats`` are negated before ranking so that a lower raw value
    yields a higher percentile.

    Values that cannot be ranked meaningfully -- NaN and +/-inf, e.g. ratios
    with a zero denominator -- get percentile 0. Infinite values are
    neutralised BEFORE ranking: ranking treats +inf as the largest value, so
    cleaning up afterwards would leave such rows at the top percentile.

    Args:
        df_aggregated: DataFrame of stats, one row per player. Not modified.

    Returns:
        A new DataFrame with the same columns; stat columns hold integer
        percentiles, the columns in ``static_cols`` and all non-numeric
        columns are left as they are.
    """
    working = df_aggregated.copy()

    static_cols = ['Nationality', 'Team', 'League', 'Position', 'General Position', 'Age', 'Minutes', 'Matches']

    stat_cols = [
        col
        for col in working.columns
        if col not in static_cols and pd.api.types.is_numeric_dtype(working[col])
    ]

    # Nothing to rank: hand back the untouched copy.
    if not stat_cols:
        return working

    # Stats where a smaller raw value is better.
    negative_stats = [
        'Goals Against'
    ]

    for col in negative_stats:
        if col in stat_cols:
            working[col] = -working[col]

    # +/-inf -> NaN so these rows are left out of the ranking (and end at 0).
    rankable = working[stat_cols].replace([np.inf, -np.inf], np.nan)
    percentiles = rankable.rank(pct=True) * 100

    df_centiles = working.copy()
    df_centiles[stat_cols] = percentiles.fillna(0).astype(int)

    return df_centiles

In [8]:
# Create the folder if needed so the directory scan below cannot fail.
os.makedirs(path_folder, exist_ok=True)

for filename in os.listdir(path_folder):
    # Only files whose name ends with "data_goals.csv" are processed.
    if not filename.endswith("data_goals.csv"):
        continue

    path_file = os.path.join(path_folder, filename)

    # Output files are written next to the input file.
    path_end_aggregated_data = path_folder / "data_goals_aggregated.csv"
    path_end_adjusted_data = path_folder / "data_goals_adjusted.csv"
    path_end_centiles = path_folder / "data_goals_centiles.csv"

    # input rows -> per-player totals -> per-90 values -> percentile ranks
    data = pd.read_csv(path_file)
    df_aggregated = aggregated_data(data)
    df_adjusted = adjusted_data(df_aggregated)
    df_centiles = centiles_data(df_adjusted)

    for frame, destination in (
        (df_aggregated, path_end_aggregated_data),
        (df_adjusted, path_end_adjusted_data),
        (df_centiles, path_end_centiles),
    ):
        frame.to_csv(destination, index=False)

    print(f"Fichier traité et sauvegardé : {filename}")

Fichier traité et sauvegardé : data_goals.csv
