## Load Packages & Set Paths

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Paths are derived from the current working directory: REPO_ROOT is the
# parent of the directory the kernel was started in.
# NOTE(review): this presumably expects the notebook to be run from a folder
# one level below the repository root (e.g. `notebooks/`) — confirm.
REPO_ROOT = Path().resolve().parents[0]
# Application package directory inside the repository.
APP_ROOT = REPO_ROOT / "app"
# Directory holding the raw Parquet tables and `table_metadata.json` read below.
DATA_DIR = APP_ROOT / "data" / "raw"

## Transforms

In [2]:
def get_clean_label_mapping(table_name: str) -> dict:
    """Get clean label mapping for given table.

    Reads ``table_metadata.json`` from ``DATA_DIR`` and returns a mapping of
    raw column name -> user-friendly label for ``table_name``.

    Parameters
    ----------
    table_name : str
        Key of the table under the ``"models"`` section of the metadata file.

    Returns
    -------
    dict
        ``{column_name: label}``. Columns whose label is missing or null map
        to an empty string. An unknown table yields an empty dict.

    Raises
    ------
    KeyError
        If the metadata file has no top-level ``"models"`` section (treated
        as a malformed file rather than silently ignored).
    """
    # Explicit UTF-8: labels can contain non-ASCII characters, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(DATA_DIR / 'table_metadata.json', 'r', encoding='utf-8') as file:
        table_metadata = json.load(file)

    # `or {}` also covers entries that are present but null in the JSON.
    table_dict = table_metadata["models"].get(table_name) or {}
    columns_dict = table_dict.get("columns") or {}

    # `.get("label")` tolerates columns documented without a label at all,
    # the same way a null label is already tolerated.
    return {
        col: ((col_meta or {}).get("label") or "")
        for col, col_meta in columns_dict.items()
    }


def clean_numeric_columns(df: pd.DataFrame, decimals: int = 1) -> pd.DataFrame:
    """General-purpose step to clean numeric columns.

    For every numeric column:

    * if all non-missing values are (within 1e-8 of) whole numbers, the
      column is converted to pandas' nullable ``Int64`` dtype;
    * otherwise the column is rounded to ``decimals`` decimal places.

    Boolean and complex columns are left untouched, as are non-numeric
    columns and columns that contain only missing values.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified; a copy is returned.
    decimals : int, default 1
        Number of decimal places kept for non-integer columns.

    Returns
    -------
    pd.DataFrame
        Cleaned copy of ``df``.
    """
    cleaned = df.copy()

    for col in cleaned.columns:
        column = cleaned[col]
        dtype = column.dtype

        if not pd.api.types.is_numeric_dtype(dtype):
            continue

        # pandas counts bool as numeric; converting flags to Int64 would
        # silently change their meaning. Complex values cannot be rounded to
        # integers either, so both are skipped.
        if pd.api.types.is_bool_dtype(dtype) or pd.api.types.is_complex_dtype(dtype):
            continue

        present = column.dropna()
        if present.empty:
            continue

        values = present.to_numpy(dtype="float64")

        # Compare against the nearest integer rather than `% 1`, so values a
        # hair *below* an integer (fractional part ~1) are also recognised.
        # rtol=0 keeps the tolerance absolute; infinities can never be cast
        # to an integer dtype, so they force the rounding branch.
        is_integer_like = bool(
            np.isfinite(values).all()
            and np.isclose(values, np.round(values), rtol=0.0, atol=1e-8).all()
        )

        if is_integer_like:
            # Nullable integer keeps missing values as <NA>
            cleaned[col] = column.round(0).astype("Int64")
        else:
            cleaned[col] = column.round(decimals)

    return cleaned


def title_case_columns(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Convert lower-case, underscore-separated text columns to title case.

    Underscores are replaced by spaces and every word is capitalised, e.g.
    ``"david_raya"`` -> ``"David Raya"``. Missing values stay missing instead
    of being turned into the text ``"Nan"`` / ``"None"``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified; a copy is returned.
    columns : list[str]
        Names of the columns to convert. A single column name given as a
        plain string is accepted as well.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the selected columns title-cased.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    df = df.copy()
    for col in columns:
        original = df[col]
        titled = (
            original
            .astype(str)
            .str.replace("_", " ", regex=False)
            .str.title()
        )
        # `astype(str)` stringifies NaN/None; restore the original missing
        # markers so downstream null checks keep working.
        df[col] = titled.where(original.notna(), original)
    return df

## Data Stores

In [3]:
def get_parquet_table(pq_table: str, params: list, clean_labels: bool) -> pd.DataFrame:
    """Get DataFrame for given Parquet table.

    Parameters
    ----------
    pq_table : str
        Table name; the file ``DATA_DIR / f"{pq_table}.parquet"`` is read.
    params : list
        Row filters forwarded to ``pd.read_parquet(..., filters=params)``.
        ``None`` reads the table unfiltered.
    clean_labels : bool
        When true, rename columns to the user-friendly labels returned by
        ``get_clean_label_mapping(pq_table)``.

    Returns
    -------
    pd.DataFrame
        The (optionally filtered and relabelled) table.
    """
    pq_path = DATA_DIR / f"{pq_table}.parquet"

    # Read table & apply any filtering
    df = pd.read_parquet(pq_path, filters=params)

    # Rename headers to "nice" user-friendly labels
    if clean_labels:
        # Look the labels up by this function's own argument, not by a
        # notebook-level `table_name` variable that may be undefined or stale.
        # NOTE(review): assumes the metadata key equals the Parquet file stem,
        # as in the example usage — confirm.
        label_mapping = get_clean_label_mapping(pq_table)

        # Columns without a label map to "", which would blank out the header
        # (and can create duplicate column names); keep their original names.
        label_mapping = {col: label for col, label in label_mapping.items() if label}

        df = df.rename(columns=label_mapping)

    return df


# Example usage: load the goalkeeper table, tidy the numeric columns and
# title-case the goalkeeper names, then preview the first rows.
table_name = "fct_goalkeeper_performance"
pq_table = table_name
params = None
clean_labels = True

df = (
    get_parquet_table(pq_table, params, clean_labels)
    .pipe(clean_numeric_columns)
    .pipe(title_case_columns, ["Goalkeeper"])
)
df.head()

Unnamed: 0,Goalkeeper,Team,Matches Played,Clean Sheets,Goals Against,Saves,Shots on Target Against,Save %,PSxG − GA,Crosses Faced (per 90),Crosses Stopped %,Pass Attempts (per 90),Long Kick Pass Completion %,Def. Actions OPA (per 90),Avg. Distance of Def. Actions
0,David Raya,Arsenal,19,9,12,31,43,72.1,-1.1,9.8,11.3,35.7,33.2,2.0,21.5
1,Jordan Pickford,Everton,19,8,20,50,71,70.4,2.8,16.2,3.2,38.9,36.8,2.1,18.2
2,Robert Sanchez,Chelsea,19,8,19,44,63,69.8,1.2,12.9,13.8,42.9,30.0,1.3,17.2
3,Djordje Petrovic,Bournemouth,19,5,35,53,87,60.9,-5.1,14.0,8.3,25.6,31.1,1.9,16.4
4,Bart Verbruggen,Brighton,19,4,27,54,79,68.4,0.0,14.0,4.6,45.4,25.0,1.4,16.8
