In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

In [10]:
# --- Run configuration -------------------------------------------------
# Season identifier; it is embedded in the name of the csv sub-folder.
season_code = "25_26"
# Base name (without extension) of the team-level CSV files.
leagues_folder = "Teams"

# The data folder is resolved relative to the directory the notebook runs in:
# <cwd>/../../csv/csv<season_code>/teams
current_dir = Path.cwd()
path_folder = current_dir.parent.parent.joinpath("csv", f"csv{season_code}", "teams")

In [11]:
def add_derived_columns(df):
    """Return a copy of ``df`` extended with composite (summed) columns.

    Every derived column is the element-wise sum of its source columns.
    A source column that is absent from ``df`` contributes ``0`` to the sum,
    so all derived columns are always created. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw statistic columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the derived columns appended (or overwritten if
        a column of the same name already existed).
    """
    # target column -> source columns, summed left to right.
    composites = {
        "Total Aerial Duels": ("Aerial Duels Won", "Aerial Duels Lost"),
        "Ball Losses": ("Dispossessed", "Miscontrols"),
        "Progressive Actions (Total)": (
            "Progressive Passes",
            "Progressive Runs",
            "Progressive Carries",
            "Progressive Passes Received",
        ),
        "Actions created": ("Shot Creating Actions", "Goal Creating Actions"),
        "Actions in the Penalty Area": (
            "Crosses into Penalty Area",
            "Passes into Penalty Area",
            "Touches Attacking Penalty Area",
            "Carries into Penalty Area",
        ),
        "Total Duels won": ("Tackles Won", "Challenges Tackled"),
        # NOTE(review): the name suggests penalties won, but the sources are
        # missed + saved penalty kicks - confirm the intended meaning.
        "Penaltys Winner": ("Penalty Kicks Missed", "Penalty Kicks Saved"),
    }

    result = df.copy()
    for target, sources in composites.items():
        first, *remaining = sources
        total = result.get(first, 0)
        for source in remaining:
            total = total + result.get(source, 0)
        result[target] = total

    return result

In [12]:
def add_parameters(df):
    """Return a copy of ``df`` with efficiency and percentage metrics added.

    The added columns are ratios of existing columns:

    * ``Efficiency``: ``(Goals - xG) / xG``
    * ``Efficiency GK``: ``(Post-Shot Expected Goals - Goals Against) /
      Post-Shot Expected Goals``
    * ``% ...`` columns: ``100 * successes / attempts`` for aerial duels,
      passes (total/short/medium/long), tackles+challenges, take-ons, saves,
      launched passes and crosses stopped.

    A source column that is absent from ``df`` is treated as all zeros.
    Wherever a denominator is ``0`` (including the missing-column case) the
    metric is ``NaN`` instead of ``+/-inf``, and no ``ZeroDivisionError`` is
    raised. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns; ``Total Aerial Duels`` is read
        from it as the denominator of ``% Aerial Duels``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the metric columns appended.
    """
    df = df.copy()

    def _col(name):
        # ``df.get(name, 0)`` would hand back a plain int for a missing
        # column, and int / int(0) raises ZeroDivisionError. A zero Series
        # keeps every operation element-wise.
        if name in df.columns:
            return df[name]
        return pd.Series(0.0, index=df.index)

    def _safe_div(numerator, denominator):
        # Mask zero denominators so the quotient is NaN rather than +/-inf.
        return numerator / denominator.where(denominator != 0)

    df["Efficiency"] = _safe_div(_col("Goals") - _col("xG"), _col("xG"))

    df["% Aerial Duels"] = _safe_div(
        100 * _col("Aerial Duels Won"), _col("Total Aerial Duels")
    )

    # Pass completion percentages share one naming pattern per distance band.
    for band in ("Total", "Short", "Medium", "Long"):
        df[f"% Passes ({band})"] = _safe_div(
            100 * _col(f"Passes Completed ({band})"),
            _col(f"Passes Attempted ({band})"),
        )

    df["% Tackles/Duels"] = _safe_div(
        100 * (_col("Tackles Won") + _col("Challenges Tackled")),
        _col("Tackles") + _col("Challenges Attempted"),
    )

    df["% Take-Ons"] = _safe_div(
        100 * _col("Successful Take-Ons"), _col("Take-Ons Attempted")
    )

    df["Efficiency GK"] = _safe_div(
        _col("Post-Shot Expected Goals") - _col("Goals Against"),
        _col("Post-Shot Expected Goals"),
    )

    df["% Saves"] = _safe_div(100 * _col("Saves"), _col("Shots on Target Against"))

    df["% Long Passes GK"] = _safe_div(
        100 * _col("Launched Passes Completed"), _col("Launched Passes Attempted")
    )

    df["% Crosses Stopped"] = _safe_div(
        100 * _col("Crosses Stopped"), _col("Crosses Opposed")
    )

    return df

In [13]:
def aggregated_data(data):
    """Collapse the rows of ``data`` to one row per ``Team``.

    Derived columns are added first via ``add_derived_columns``. Then, per
    team, ``Average Age`` keeps its first value and every other numeric
    column is summed. When both ``Possession`` and ``Matches Played`` are
    present, the summed ``Possession`` is replaced by the mean possession
    weighted by matches played; that column then sits last in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Input rows; must contain a ``Team`` column.

    Returns
    -------
    pandas.DataFrame
        One row per team with ``Team`` as a regular column.
    """
    enriched = add_derived_columns(data)
    keep_first = ['Average Age']

    # Build the aggregation spec: "first" entries go in before "sum" entries.
    agg_spec = {}
    for name in keep_first:
        if name in enriched.columns:
            agg_spec[name] = 'first'
    for name in enriched.columns:
        if name not in keep_first and pd.api.types.is_numeric_dtype(enriched[name]):
            agg_spec[name] = 'sum'

    totals = enriched.groupby('Team', as_index=False).agg(agg_spec)

    if 'Possession' not in enriched.columns or 'Matches Played' not in enriched.columns:
        return totals

    # A percentage cannot be summed: weight it by the matches of each row.
    weighted = enriched[['Team', 'Possession', 'Matches Played']].copy()
    weighted['Possession_x_MP'] = enriched['Possession'] * enriched['Matches Played']

    per_team = weighted.groupby('Team', as_index=False).agg({
        'Possession_x_MP': 'sum',
        'Matches Played': 'sum'
    })
    per_team['Possession'] = per_team['Possession_x_MP'] / per_team['Matches Played']
    possession_weighted = per_team[['Team', 'Possession']]

    totals = totals.drop(columns='Possession', errors='ignore')
    return totals.merge(possession_weighted, on='Team', how='left')


In [14]:
def adjusted_data(df_aggregated):
    """Convert aggregated totals to per-match values and add ratio metrics.

    Every numeric column except ``Matches Played``, ``Average Age`` and
    ``Possession`` is divided by ``Matches Played`` and rounded to two
    decimals. ``add_parameters`` is then applied to the per-match values and
    the result is rounded to two decimals. Missing and infinite values are
    written out as ``0``. The input frame is not modified.

    Parameters
    ----------
    df_aggregated : pandas.DataFrame
        Aggregated frame; must contain a ``Matches Played`` column.

    Returns
    -------
    pandas.DataFrame
        Per-match frame with the ratio metrics appended and no NaN/inf.
    """
    df_aggregated = df_aggregated.copy()

    # Zero matches would turn every per-match value into a division by zero;
    # NaN propagates instead and is filled at the end.
    matches = df_aggregated['Matches Played'].replace(0, np.nan)

    exclude_cols = ['Matches Played', 'Average Age', 'Possession']

    per90_cols = [
        col for col in df_aggregated.columns
        if col not in exclude_cols and pd.api.types.is_numeric_dtype(df_aggregated[col])
    ]

    for col in per90_cols:
        df_aggregated[col] = (df_aggregated[col] / matches).round(2)

    # Restore the match count as an integer column (0 where it was 0/NaN).
    df_aggregated['Matches Played'] = matches.fillna(0).round(0).astype(int)
    df_adjusted = add_parameters(df_aggregated).round(2)

    # Ratio metrics with a zero denominator can come back as +/-inf, which
    # fillna does not touch; treat them as missing so they also become 0.
    df_adjusted = df_adjusted.replace([np.inf, -np.inf], np.nan).fillna(0)

    return df_adjusted

In [15]:
def centiles_data(df_adjusted, negative_stats=None):
    """Replace every statistic by its percentile rank (0-100) across rows.

    All numeric columns except ``Matches Played`` and ``Average Age`` are
    ranked with ``rank(pct=True) * 100`` and rounded to two decimals. For
    columns listed in ``negative_stats`` the values are negated before
    ranking, so the lowest raw value receives the highest percentile.
    Non-numeric and static columns are returned unchanged and the input
    frame is not modified.

    Parameters
    ----------
    df_adjusted : pandas.DataFrame
        Frame with one row per team.
    negative_stats : iterable of str, optional
        Column names for which a lower value is better. Defaults to the
        built-in list below. Names not present in the frame are ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_adjusted`` with the statistic columns replaced by
        percentile ranks.
    """
    df_adjusted = df_adjusted.copy()

    static_cols = ['Matches Played', 'Average Age']

    stat_cols = [
        col for col in df_adjusted.columns
        if col not in static_cols and pd.api.types.is_numeric_dtype(df_adjusted[col])
    ]

    if negative_stats is None:
        # 'Clean Sheets' is intentionally NOT listed: more clean sheets is
        # better, consistent with 'Goals Against' being lower-is-better.
        negative_stats = [
            'Yellow Cards', 'Red Cards', 'Second Yellow Cards',
            'Fouls Committed', 'Offsides', 'Miscontrols', 'Dispossessed',
            'Errors', 'Own Goals', 'Penalties Conceded', 'Challenges Lost',
            'Ball Losses', 'Aerial Duels Lost', 'Goals Against'
        ]
    lower_is_better = set(negative_stats)

    if not stat_cols:
        return df_adjusted

    # Flip the sign of lower-is-better stats on a scratch frame so that an
    # ascending rank rewards low raw values.
    oriented = df_adjusted[stat_cols].copy()
    for col in stat_cols:
        if col in lower_is_better:
            oriented[col] = -oriented[col]

    df_centiles = df_adjusted.copy()
    df_centiles[stat_cols] = (oriented.rank(pct=True) * 100).round(2)

    return df_centiles

In [16]:
# Input CSV and the three output CSVs live side by side in ``path_folder``
# and share the ``leagues_folder`` base name.
path = path_folder.joinpath(f"{leagues_folder}.csv")
path_end_aggregated_data = path_folder.joinpath(f"{leagues_folder}_aggregated.csv")
path_end_adjusted_data = path_folder.joinpath(f"{leagues_folder}_adjusted.csv")
path_end_centiles = path_folder.joinpath(f"{leagues_folder}_centiles.csv")

# Pipeline: raw rows -> per-team totals -> per-match values -> percentiles.
data = pd.read_csv(path)
df_aggregated = aggregated_data(data)
df_adjusted = adjusted_data(df_aggregated)
df_centiles = centiles_data(df_adjusted)

# Write the outputs only after every stage has succeeded.
for frame, destination in (
    (df_aggregated, path_end_aggregated_data),
    (df_adjusted, path_end_adjusted_data),
    (df_centiles, path_end_centiles),
):
    frame.to_csv(destination, index=False)