In [13]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [14]:
# Season identifier embedded in the csv folder names below.
season_code = "25_26"

# Climb four directory levels from the working directory to reach the base
# folder that both csv paths are built from.
current_dir = Path.cwd()
path_folder = current_dir
for _ in range(4):
    path_folder = path_folder.parent

# path_start: raw player file read by this notebook.
# path_end: cleaned player file written by this notebook.
path_start = os.path.join(path_folder, f"analysis/csv/csv{season_code}/raw data/data_players.csv")
path_end = os.path.join(path_folder, f"csv/csv{season_code}/players/clean/data_players.csv")

In [15]:
def clean_age(age_str):
    """Convert a raw age value to a whole number.

    Accepted inputs:
      * a string such as "25-123": only the text before the first "-" is used;
      * a plain numeric string such as "25";
      * an int or float, which is truncated with int().

    Returns:
        The age as an int, or None when the value cannot be converted
        (NaN, infinity, empty or non-numeric text, or any other type).
    """
    if isinstance(age_str, str):
        # "25-123".split("-")[0] -> "25"; a string without "-" is kept whole.
        candidate = age_str.split("-")[0]
    elif isinstance(age_str, (int, float)):
        candidate = age_str
    else:
        return None

    try:
        return int(candidate)
    except (ValueError, OverflowError):
        # ValueError: NaN or non-numeric text; OverflowError: +/- infinity.
        return None

In [16]:
def clean_nationality(nat):
    """Return the last whitespace-separated token of `nat`, upper-cased.

    Example: "uy URU" -> "URU".

    Missing values (None, NaN, pd.NA) and blank strings yield np.nan instead
    of being stringified into bogus codes such as "NAN" or "NONE".
    """
    # Check for missing scalars before str() turns them into ordinary text.
    if pd.api.types.is_scalar(nat) and pd.isna(nat):
        return np.nan

    tokens = str(nat).split()
    if not tokens:
        # Empty or whitespace-only input has no token to extract.
        return np.nan
    return tokens[-1].upper()

In [17]:
# Load the raw player file; index_col=False tells pandas not to use the first column as the index.
df_old = pd.read_csv(path_start, index_col=False) 

In [18]:
# Rename specific players, matched on their name plus a second column
# (team or nationality).
# Each rule: (current name, column to check, required value, new name).
player_renames = [
    ("Vitinha", "Team", "Genoa", "Vítor Vitinha"),
    ("Vitinha", "Team", "Marseille", "Vítor Vitinha"),
    ("Emiliano Martínez", "Nationality", "uy URU", "Emiliano Martínez (URU)"),
    ("Nicolás González", "Team", "Manchester City", "Nico González"),
]

for old_name, key_column, key_value, new_name in player_renames:
    mask = (df_old["Player"] == old_name) & (df_old[key_column] == key_value)
    df_old.loc[mask, "Player"] = new_name

In [19]:
def new_position(position_bis):
    """Reduce a raw position value to a single specific position code.

    Only the first comma-separated entry is used, with surrounding whitespace
    removed. The generic codes 'FW' and 'MF' are mapped to 'AM' and 'CM'.

    Returns:
        The position code, or None (after printing an error line) when the
        input is not a string or the resulting code is not recognised.
    """
    valid_positions = ('AM', 'LW', 'RW', 'LM', 'RM', 'CM', 'DM', 'WB', 'LB', 'RB', 'CB', 'GK')
    generic_to_specific = {'FW': 'AM', 'MF': 'CM'}

    # Non-string input (e.g. a 0 left by an earlier fillna) has no .split();
    # route it to the same error path instead of raising AttributeError.
    if isinstance(position_bis, str):
        position = position_bis.split(",")[0].strip()
        position = generic_to_specific.get(position, position)
        if position in valid_positions:
            return position

    print(f"Erreur: '{position_bis}'")
    return None

In [20]:
def gen_position(position):
    """Return the broad role for a specific position code.

    Gives 'Goalkeeper', 'Defender', 'Midfielder' or 'Forward', or an empty
    string when the code belongs to none of these groups.
    """
    groups = (
        ('Goalkeeper', ('GK',)),
        ('Defender', ('LB', 'RB', 'CB')),
        ('Midfielder', ('LM', 'RM', 'CM', 'DM', 'WB')),
        ('Forward', ('AM', 'LW', 'RW')),
    )
    # First group containing the code wins; '' is the fallback.
    return next((label for label, codes in groups if position in codes), '')

In [21]:
# Build the cleaned frame from the raw columns. Most columns are copied
# unchanged; a few are renamed and three are derived
# ("Ball Losses", "Total Aerials", and later the cleaned Age / Nationality).
new_data = pd.DataFrame({
    # Identity and match context
    "Player": df_old["Player"],
    "Game Week": df_old["Game Week"],
    "Team": df_old["Team"],
    "League": df_old["League"],
    "Nationality": df_old["Nationality"],
    "Position": df_old["Position"],
    "Age": df_old["Age"],
    "Minutes": df_old["Minutes Played"],

    # Goals, shots and chance creation
    "Goals": df_old["Goals"],
    "Assists": df_old["Assists"],
    "Shots Total": df_old["Shots Total"],
    "Shots on Target": df_old["Shots on Target"],
    "Expected Goals (xG)": df_old["Expected Goals (xG)"],
    "Non-Penalty Expected Goals (npxG)": df_old["Non-Penalty Expected Goals (npxG)"],
    "Penalty Kicks Made": df_old["Penalty Kicks Made"],
    "Penalty Kicks Attempted": df_old["Penalty Kicks Attempted"],
    "Shot-Creating Actions (SCA)": df_old["Shot-Creating Actions (SCA)"],
    "Goal-Creating Actions (GCA)": df_old["Goal-Creating Actions (GCA)"],
    "Key Passes": df_old["Key Passes"],
    "Passes into Final Third": df_old["Passes into Final Third"],
    "Passes into Penalty Area": df_old["Passes into Penalty Area"],
    "Crosses into Penalty Area": df_old["Crosses into Penalty Area"],
    "Crosses": df_old["Crosses"],

    "Expected Assists (xA)": df_old["Expected Assists (xA)"],
    "Expected Assisted Goals (xAG)": df_old["Expected Assisted Goals (xAG)"],

    # Passing
    "Passes Completed": df_old["Passes Completed"],
    "Passes Attempted": df_old["Passes Attempted"],
    "Progressive Passes": df_old["Progressive Passes"],
    "Through Balls": df_old["Through Balls"],
    "Switches": df_old["Switches"],
    "Passes Blocked": df_old["Passes Blocked"],
    "Passes Offside": df_old["Passes Offside"],
    "Live-ball Passes": df_old["Live-ball Passes"],
    "Dead-ball Passes": df_old["Dead-ball Passes"],
    "Passes from Free Kicks": df_old["Passes from Free Kicks"],

    "Passes Completed (Short)": df_old["Passes Completed (Short)"],
    "Passes Attempted (Short)": df_old["Passes Attempted (Short)"],
    "Passes Completed (Medium)": df_old["Passes Completed (Medium)"],
    "Passes Attempted (Medium)": df_old["Passes Attempted (Medium)"],
    "Passes Completed (Long)": df_old["Passes Completed (Long)"],
    "Passes Attempted (Long)": df_old["Passes Attempted (Long)"],

    "Passing Distance (Total)": df_old["Total Passing Distance"],
    "Progressive Passing Distance (Total)": df_old["Progressive Passing Distance"],

    # Carries
    "Carries": df_old["Carries"],
    "Progressive Carries": df_old["Progressive Carries"],
    "Carrying Distance (Total)": df_old["Total Carrying Distance"],
    "Progressive Carrying Distance (Total)": df_old["Progressive Carrying Distance"],
    "Carries into Final Third": df_old["Carries into Final Third"],
    "Carries into Penalty Area": df_old["Carries into Penalty Area"],

    # Take-ons
    "Take-Ons Attempted": df_old["Take-Ons Attempted"],
    "Successful Take-Ons": df_old["Successful Take-Ons"],
    "Times Tackled During Take-On": df_old["Times Tackled During Take-On"],

    # Tackles and challenges
    "Tackles": df_old["Tackles"],
    "Tackles Won": df_old["Tackles Won"],
    "Tackles in Defensive Third": df_old["Tackles in Defensive Third"],
    "Tackles in Middle Third": df_old["Tackles in Middle Third"],
    "Tackles in Attacking Third": df_old["Tackles in Attacking Third"],

    "Dribblers Tackled": df_old["Dribblers Tackled"],
    "Dribbles Challenged": df_old["Dribbles Challenged"],
    "Challenges Lost": df_old["Challenges Lost"],

    # Interceptions, blocks and clearances.
    # "Passes Blocked" is already listed with the passing columns above; it is
    # not repeated here because a duplicate dict key silently overwrites the
    # earlier entry.
    "Interceptions": df_old["Interceptions"],
    "Blocks": df_old["Blocks"],
    "Shots Blocked": df_old["Shots Blocked"],
    "Clearances": df_old["Clearances"],
    "Errors Leading to Shot": df_old["Errors Leading to Shot"],

    # Possession
    "Ball Recoveries": df_old["Ball Recoveries"],
    "Miscontrols": df_old["Miscontrols"],
    "Dispossessed": df_old["Dispossessed"],
    # Derived: dispossessions plus miscontrols.
    "Ball Losses": df_old["Dispossessed"] + df_old["Miscontrols"],
    "Passes Received": df_old["Passes Received"],
    "Progressive Passes Received": df_old["Progressive Passes Received"],

    # Touches
    "Touches": df_old["Touches"],
    "Touches in Defensive Penalty Area": df_old["Touches in Defensive Penalty Area"],
    "Touches in Defensive Third": df_old["Touches in Defensive Third"],
    "Touches in Middle Third": df_old["Touches in Middle Third"],
    "Touches in Attacking Third": df_old["Touches in Attacking Third"],
    "Touches in Attacking Penalty Area": df_old["Touches in Attacking Penalty Area"],
    "Live-Ball Touches": df_old["Live-Ball Touches"],

    # Discipline and miscellaneous
    "Yellow Cards": df_old["Yellow Cards"],
    "Red Cards": df_old["Red Cards"],
    "Second Yellow Card": df_old["Second Yellow Card"],
    "Fouls Committed": df_old["Fouls Committed"],
    "Fouls Drawn": df_old["Fouls Drawn"],
    "Offsides": df_old["Offsides"],
    "Penalties Kicks Conceded": df_old["Penalty Kicks Conceded"],
    "Penalties Kicks Won": df_old["Penalty Kicks Won"],
    "Own Goals": df_old["Own Goals"],

    # Aerial duels
    "Aerials Won": df_old["Aerials Won"],
    "Aerials Lost": df_old["Aerials Lost"],
    # Derived: aerial duels won plus lost.
    "Total Aerials": df_old["Aerials Won"] + df_old["Aerials Lost"]
})

# Clean the age and nationality columns; Int64 is the nullable integer dtype,
# so ages that clean_age could not parse stay missing at this point.
new_data["Age"] = new_data["Age"].apply(clean_age).astype("Int64")
new_data["Nationality"] = new_data["Nationality"].apply(clean_nationality)

# Replace infinities and missing values with 0, producing a new frame rather
# than mutating in place.
# NOTE(review): fillna(0) applies to every column, so a missing Age, Position
# or Nationality also becomes 0 - confirm this is intended.
new_data = new_data.replace([np.inf, -np.inf], 0).fillna(0)

# Round the plain float64 / int64 columns to one decimal place.
for col in new_data.select_dtypes(include=['float64', 'int64']).columns:
    new_data[col] = new_data[col].round(1)

In [22]:
# Map each raw position to a specific code, then derive the broad role from it.
new_data['Position'] = new_data['Position'].apply(new_position)
new_data['General Position'] = new_data['Position'].apply(gen_position)

# Drop goalkeepers and rows whose position is an empty string.
new_data = new_data[(new_data["Position"] != "GK") & (new_data["Position"] != "")]

# to_csv does not create missing folders, so make sure the output directory
# exists before writing.
Path(path_end).parent.mkdir(parents=True, exist_ok=True)
new_data.to_csv(path_end, index=False)