In [3]:
import pandas as pd
import os

# Notebook cell: clean the Understat per-game stats for the Premier League.
#
# Reads the raw Understat CSV, keeps only rows whose `league` is "EPL",
# parses `date`, narrows the frame to a curated column list, drops rows that
# lack essential values, and writes one combined CSV plus one CSV per season
# into OUTPUT_DIR. The cell's last expression previews the cleaned frame.

UNDERSTAT_PATH = "../raw/Understat-data/game_stats.csv"
OUTPUT_DIR = "../cleaned/"

# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Load Understat game stats
us = pd.read_csv(UNDERSTAT_PATH)

# 2. Filter to Premier League only (.copy() so later assignments do not
#    write into a slice of `us`)
us_epl = us[us["league"] == "EPL"].copy()

# 3. Convert date column; errors="coerce" turns unparseable values into NaT
#    so they can be dropped in step 5 instead of raising here.
us_epl["date"] = pd.to_datetime(us_epl["date"], errors="coerce")

# 4. Keep only useful columns (adjust if you want more/less)
keep_cols = [
    "league",
    "season",
    "date",
    "club_name",
    "home_away",
    "xG", "xGA",
    "npxG", "npxGA",
    "ppda", "ppda_allowed",
    "deep", "deep_allowed",
    "scored", "missed",
    "xpts",
    "result",
    "wins", "draws", "loses",
    "pts",
]

# Tolerate columns that are absent from the input file.
keep_cols = [c for c in keep_cols if c in us_epl.columns]
us_epl = us_epl[keep_cols].copy()

# 5. Basic cleaning: drop rows without date or xG.
#    The subset is restricted to columns that survived step 4, so editing
#    `keep_cols` (or an input file without xG) does not raise a KeyError here.
required_cols = [c for c in ("date", "xG") if c in us_epl.columns]
if required_cols:
    us_epl = us_epl.dropna(subset=required_cols)

# 6. Save ONE combined cleaned file
combined_out = os.path.join(OUTPUT_DIR, "cleaned_understat_epl_game_stats.csv")
us_epl.to_csv(combined_out, index=False)
print(f"✅ Saved combined Understat EPL file: {combined_out}")

# 7. (Optional) Save ONE FILE PER SEASON
#    Files are named after the raw `season` value,
#    e.g. cleaned_understat_2014.csv, cleaned_understat_2015.csv, etc.
#    Skipped when `season` is not among the kept columns, since this step is
#    optional and should not crash the cell after the combined file is saved.
if "season" in us_epl.columns:
    for season, df_season in us_epl.groupby("season"):
        season_str = str(season)
        out_file = os.path.join(OUTPUT_DIR, f"cleaned_understat_{season_str}.csv")
        df_season.to_csv(out_file, index=False)
        print(f"✅ Saved season file: {out_file}")
else:
    print("Skipped per-season files: no 'season' column in the cleaned data")

# Preview the first rows as the cell's displayed result.
us_epl.head()



✅ Saved combined Understat EPL file: ../cleaned/cleaned_understat_epl_game_stats.csv
✅ Saved season file: ../cleaned/cleaned_understat_2014.csv
✅ Saved season file: ../cleaned/cleaned_understat_2015.csv
✅ Saved season file: ../cleaned/cleaned_understat_2016.csv
✅ Saved season file: ../cleaned/cleaned_understat_2017.csv
✅ Saved season file: ../cleaned/cleaned_understat_2018.csv
✅ Saved season file: ../cleaned/cleaned_understat_2019.csv
✅ Saved season file: ../cleaned/cleaned_understat_2020.csv
✅ Saved season file: ../cleaned/cleaned_understat_2021.csv
✅ Saved season file: ../cleaned/cleaned_understat_2022.csv
✅ Saved season file: ../cleaned/cleaned_understat_2023.csv
✅ Saved season file: ../cleaned/cleaned_understat_2024.csv


Unnamed: 0,league,season,date,club_name,home_away,xG,xGA,npxG,npxGA,ppda,...,deep,deep_allowed,scored,missed,xpts,result,wins,draws,loses,pts
0,EPL,2014,2014-08-16 15:00:00,Aston Villa,a,0.909774,0.423368,0.909774,0.423368,14.043478,...,4,3,1,0,1.8322,w,1,0,0,3
1,EPL,2014,2014-08-23 12:45:00,Aston Villa,h,0.507525,0.699295,0.507525,0.699295,15.52381,...,4,7,0,0,1.1057,d,0,1,0,1
2,EPL,2014,2014-08-31 13:30:00,Aston Villa,h,0.639316,0.28888,0.639316,0.28888,28.153846,...,6,7,2,1,1.6075,w,1,0,0,3
3,EPL,2014,2014-09-13 17:30:00,Aston Villa,a,0.701676,0.728097,0.701676,0.728097,54.0,...,1,5,1,0,1.3252,w,1,0,0,3
4,EPL,2014,2014-09-20 15:00:00,Aston Villa,h,0.649013,1.36224,0.649013,1.36224,44.25,...,0,7,0,3,0.6912,l,0,0,1,0
