In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Season identifier embedded in the csv folder names (e.g. "csv25_26").
season_code = "25_26"

# Paths are resolved relative to the working directory: the base folder is
# taken to be four levels above wherever the notebook is launched from.
current_dir = Path.cwd()
path_folder = current_dir.parent.parent.parent.parent

# Locations of the raw input file and the cleaned output file, relative to
# the base folder.
raw_relative_path = f"csv/csv{season_code}/raw data/data_goals.csv"
clean_relative_path = f"Github/csv/csv{season_code}/players/clean/data_goals.csv"

path_start = os.path.join(path_folder, raw_relative_path)
path_end = os.path.join(path_folder, clean_relative_path)

In [3]:
def clean_age(age_str):
    """Return an age as a whole number, or None when it cannot be determined.

    Parameters
    ----------
    age_str : str, number or None
        Either a dash-separated string such as "25-123" (only the part before
        the first dash is used; presumably "years-days" -- confirm against the
        data source), a plain numeric string such as "25", or a numeric value.

    Returns
    -------
    int or None
        The integer age. None is returned for missing values (None, NaN),
        non-finite numbers, and strings whose leading part is not an integer.
    """
    if isinstance(age_str, str):
        # "25-123" -> "25"; a string without a dash is used as a whole.
        leading_part = age_str.split("-")[0].strip()
        try:
            return int(leading_part)
        except ValueError:
            return None

    try:
        # Covers Python and numpy numerics alike; floats are truncated.
        return int(age_str)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects, ValueError: NaN,
        # OverflowError: +/-inf.
        return None

In [4]:
def clean_nationality(nat):
    """Return the upper-cased last whitespace-separated token of ``nat``.

    For example ``"eng ENG"`` becomes ``"ENG"`` and ``"bra"`` becomes ``"BRA"``.

    Parameters
    ----------
    nat : object
        Raw nationality value; it is converted with ``str()`` before splitting.

    Returns
    -------
    str or None
        The upper-cased last token, or None when ``nat`` is None, a NaN float,
        or contains no non-whitespace characters. Returning None (rather than
        the literal text "NAN"/"NONE") lets the caller apply its own
        missing-value handling.
    """
    # NaN is the only float that is not equal to itself.
    if nat is None or (isinstance(nat, float) and nat != nat):
        return None

    tokens = str(nat).split()
    if not tokens:
        # Empty or whitespace-only input has no last token to return.
        return None
    return tokens[-1].upper()

In [5]:
# Load the raw data; index_col=False tells pandas not to use the first
# column as the index.
df_old = pd.read_csv(path_start, index_col=False)

In [6]:
# Build the cleaned frame: identity columns, raw counts copied
# under their output names, and a few derived metrics.
new_data = pd.DataFrame({
    # --- identity -----------------------------------------------------------
    "Player": df_old["Player"],
    "Game Week": df_old["Game Week"],
    "Team": df_old["Team"],
    "League": df_old["League"],
    "Nationality": df_old["Nationality"],
    # Every row is labelled as a goalkeeper (constant columns).
    "Position": "GK",
    "General Position": "Goalkeeper",
    "Age": df_old["Age"],
    "Minutes": df_old["Minutes"],

    # --- shot stopping ------------------------------------------------------
    "Shots on Target Against": df_old["Shots on Target Against"],
    "Goals Against": df_old["Goals Against"],
    "Saves": df_old["Saves"],
    "Post-Shot Expected Goals (PSxG)": df_old["Post-Shot Expected Goals (PSxG)"],

    # PSxG minus goals conceded, to one decimal; positive when fewer goals
    # were conceded than the PSxG value.
    "Save Efficiency": (
        df_old["Post-Shot Expected Goals (PSxG)"] - df_old["Goals Against"]
    ).round(1),
    # 1 when no goal was conceded in the row, otherwise 0.
    "Clean Sheets": (df_old["Goals Against"] == 0).astype(int),

    # --- distribution -------------------------------------------------------
    "Completed Long Passes": df_old["Passes Completed (Launched)"],
    "Attempted Long Passes": df_old["Passes Attempted (Launched)"],
    "Attempted Passes (excluding GK)": df_old["Passes Attempted (GK)"],
    "Attempted Throws": df_old["Throws Attempted"],
    # Totals are stored as average x attempts (presumably so they can be
    # summed later and the average recomputed -- confirm with consumers).
    "Pass Length (Total)": (
        df_old["Average Pass Length"] * df_old["Passes Attempted (GK)"]
    ).round(1),

    "Attempted Goal Kicks": df_old["Goal Kicks Attempted"],
    "Goal Kick Length (Total)": (
        df_old["Average Goal Kick Length"] * df_old["Goal Kicks Attempted"]
    ).round(1),

    # --- crosses ------------------------------------------------------------
    "Crosses Faced": df_old["Crosses Faced"],
    "Crosses Stopped": df_old["Crosses Stopped"],

    # --- sweeping -----------------------------------------------------------
    "Defensive Actions Outside Penalty Area": df_old["Def. Actions Outside Penalty Area"],
    "Distance of Defensive Actions (Total)": (
        df_old["Average Distance of Def. Actions"]
        * df_old["Def. Actions Outside Penalty Area"]
    ).round(1),
})

# Normalise age to a nullable integer and nationality to its upper-cased code.
new_data["Age"] = new_data["Age"].apply(clean_age).astype("Int64")
new_data["Nationality"] = new_data["Nationality"].apply(clean_nationality)

# Replace infinities and then any remaining missing values with 0. This is
# applied to the whole frame (text columns included) and rebinds new_data
# instead of mutating it in place.
new_data = new_data.replace([np.inf, -np.inf], 0).fillna(0)


In [7]:
# to_csv does not create missing folders, so make sure the destination
# directory exists before writing the cleaned data (without the index column).
Path(path_end).parent.mkdir(parents=True, exist_ok=True)
new_data.to_csv(path_end, index=False)