In [1]:
import pandas as pd
import os
from pathlib import Path

RAW_DIR = Path("../raw")
OUTPUT_DIR = Path("../cleaned")
OUTPUT_DIR.mkdir(exist_ok=True)

# Base columns from football-data.co.uk
base_columns = [
    "Div", "Date", "Time", "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR",
    "HTHG", "HTAG", "HTR",
    "Referee",
    "HS", "AS", "HST", "AST", "HF", "AF", "HC", "AC",
    "HY", "AY", "HR", "AR"
]

# These may exist only in Understat:
optional_xg_columns = ["xG", "xGA", "home_xG", "away_xG"]

def clean_file(input_path):
    filename = input_path.name
    print(f"\nðŸ“„ Cleaning: {filename}")

    # Load with wide columns â€” some files have more than 24 columns
    df_raw = pd.read_csv(input_path, encoding="latin1", low_memory=False)

    # Find which xG columns exist
    detected_xg_cols = [col for col in optional_xg_columns if col in df_raw.columns]

    # Select the columns we want to keep
    keep_columns = [
        "Date", "HomeTeam", "AwayTeam",
        "FTHG", "FTAG", "FTR",
        "HS", "AS", "HST", "AST",
        "HF", "AF", "HC", "AC",
        "HY", "AY", "HR", "AR"
    ]

    # Append any detected xG columns
    keep_columns += detected_xg_cols

    # Keep only the columns that exist in the file
    keep_columns = [col for col in keep_columns if col in df_raw.columns]

    df = df_raw[keep_columns].copy()

    # Clean date
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")
        df = df.dropna(subset=["Date"])

    # Build output filename
    output_name = filename.replace(".csv", " (cleaned).csv")
    output_file = OUTPUT_DIR / output_name

    df.to_csv(output_file, index=False)
    print(f"âœ… Saved: {output_file.name}")
    return df


# Loop through all CSVs in /raw
csv_files = list(RAW_DIR.glob("*.csv"))
print(f"Found {len(csv_files)} raw files.")

for file in csv_files:
    clean_file(file)

print("\nðŸŽ‰ All files cleaned successfully!")



Found 6 raw files.

ðŸ“„ Cleaning: Prem 2021:2022.csv
âœ… Saved: Prem 2021:2022 (cleaned).csv

ðŸ“„ Cleaning: Prem 2020:2021.csv
âœ… Saved: Prem 2020:2021 (cleaned).csv

ðŸ“„ Cleaning: Prem 2023:2024.csv
âœ… Saved: Prem 2023:2024 (cleaned).csv

ðŸ“„ Cleaning: Prem 2022:2023.csv
âœ… Saved: Prem 2022:2023 (cleaned).csv

ðŸ“„ Cleaning: Prem 2025:2026.csv
âœ… Saved: Prem 2025:2026 (cleaned).csv

ðŸ“„ Cleaning: Prem 2024:2025.csv
âœ… Saved: Prem 2024:2025 (cleaned).csv

ðŸŽ‰ All files cleaned successfully!
