In [1]:
import pandas as pd
from pathlib import Path
import re


In [6]:
# Directory that unify_table() globs for input CSV files.
DATA_DIR = Path("nba_boxscores")

# Directory that receives the unified per-table CSV outputs.
OUTPUT_DIR = Path("nba_boxscores/unified")

# Make sure the output location exists before anything is written to it;
# missing parents are created and an existing directory is left alone.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)


def extract_home_team_from_filename(filename: str) -> str:
    """
    Return the three-letter team code that precedes the ".csv" suffix.

    Example:
        team_basic_202504080CLE.csv -> CLE

    Raises:
        ValueError: if the filename does not end with three uppercase
            letters (A-Z) followed by ".csv".
    """
    match = re.search(r"([A-Z]{3})\.csv$", filename)
    if match is not None:
        return match.group(1)
    raise ValueError(f"Cannot infer home team from {filename}")


def build_game_key(game_date: str, game_id: str, home_team: str) -> "tuple[str, str]":
    """
    Return the ``(season_year, home_team)`` components of a game key.

    A complete key has the form ``YYYY-MM-DD:YYYYAWAY@HOME``, but the away
    team cannot be derived from these arguments alone, so this helper returns
    only the parts it can determine; the caller, which has the rows grouped
    by game_id, is expected to work out the away team and assemble the key.

    Args:
        game_date: Date string; its first four characters are used as the
            season year (e.g. "2025-04-08" -> "2025").
        game_id: Game identifier (format: YYYYMMDD0HOME). Kept in the
            signature for compatibility but not consulted: the supplied
            ``home_team`` (filename-derived) is authoritative even if the
            trailing team code of the id disagrees with it.
        home_team: Home-team code, returned unchanged.

    Returns:
        ``(season_year, home_team)`` where ``season_year`` is
        ``game_date[:4]``.
    """
    season_year = game_date[:4]
    return season_year, home_team


In [5]:
def unify_table(file_pattern: str, table_name: str):
    """
    Combine every CSV in DATA_DIR matching ``file_pattern`` into one table.

    Each file's home team is taken from its filename. Rows are grouped by
    ``game_id``; every group with exactly two distinct ``team`` values gets
    three extra columns and is appended to the result:

    * ``game_key``  -- ``"{game_date}:{season_year}{away_team}@{home_team}"``,
      where ``season_year`` is the first four characters of the game date
    * ``home_team`` -- the code parsed from the filename
    * ``away_team`` -- the first team in the group that is not the home team

    The combined frame is written to ``OUTPUT_DIR / f"{table_name}.csv"``.

    Args:
        file_pattern: Glob pattern, relative to DATA_DIR, selecting the inputs.
        table_name: Stem of the output CSV and label used in messages.

    Returns:
        None. Progress and data-quality summaries are printed.

    Raises:
        ValueError: if a file lacks a ``game_id``, ``team`` or ``game_date``
            column, or its filename does not yield a home team.
    """
    required_columns = ("game_id", "team", "game_date")

    # Sort so the output row order is reproducible from run to run; glob()
    # yields paths in whatever order the filesystem reports them.
    files = sorted(DATA_DIR.glob(file_pattern))
    if not files:
        print(f"No files found for {table_name}")
        return

    all_games = []
    skipped = 0      # games without exactly two teams (corrupted/partial scrape)
    mismatched = 0   # games whose rows never mention the filename's home team

    for file in files:
        df = pd.read_csv(file)
        home_team = extract_home_team_from_filename(file.name)

        # Validate everything that is read below, so a missing game_date is
        # reported with the filename instead of surfacing as a bare KeyError.
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            raise ValueError(f"{file.name} missing required columns: {missing}")

        for _, g in df.groupby("game_id"):
            teams = g["team"].unique().tolist()
            if len(teams) != 2:
                skipped += 1
                continue

            # The filename stays authoritative for the home team, but a
            # disagreement with the rows is counted so it can be reported.
            if home_team not in teams:
                mismatched += 1

            # Two distinct teams guarantee at least one differs from home_team.
            away_team = next(t for t in teams if t != home_team)

            # str() keeps the slice valid if pandas parsed the date column
            # as a number (e.g. 20250408) rather than as text.
            game_date = str(g["game_date"].iloc[0])
            season_year = game_date[:4]

            # assign() returns a new frame, leaving the groupby slice untouched.
            all_games.append(
                g.assign(
                    game_key=f"{game_date}:{season_year}{away_team}@{home_team}",
                    home_team=home_team,
                    away_team=away_team,
                )
            )

    if skipped:
        print(f"Skipped {skipped:,} games without exactly two teams for {table_name}")
    if mismatched:
        print(
            f"Warning: {mismatched:,} games for {table_name} "
            "do not list the home team from the filename"
        )

    # pd.concat raises ValueError on an empty list, so stop before calling it.
    if not all_games:
        print(f"No valid games found for {table_name}")
        return

    unified = pd.concat(all_games, ignore_index=True)
    unified.to_csv(OUTPUT_DIR / f"{table_name}.csv", index=False)
    print(f"Written {table_name}.csv ({len(unified):,} rows)")


In [7]:
# (glob pattern, output table name) pairs, processed in this order.
TABLE_SOURCES = [
    ("four_factors_*.csv", "four_factors"),
    ("line_scores_*.csv", "line_scores"),
    ("quarters_*.csv", "quarters"),
    ("team_basic_*.csv", "basic_boxscore"),
    ("team_advanced_*.csv", "advanced_boxscore"),
]

# Build one unified CSV per table.
for file_pattern, table_name in TABLE_SOURCES:
    unify_table(file_pattern, table_name)


Written four_factors.csv (15,192 rows)
Written line_scores.csv (15,192 rows)
Written quarters.csv (554,053 rows)
Written basic_boxscore.csv (161,443 rows)
