In [193]:
import pandas as pd
import numpy as np

def load_and_prepare_data(
    player_stats_path='/Users/reececalvin/heat/PlayerStatistics.csv',
    team_stats_path='/Users/reececalvin/heat/TeamStatistics.csv',
):
    """
    Loads and aggregates player and team stats to the season level.

    Only games dated 2015-01-01 or later whose gameType is "Regular Season",
    "NBA Cup" or "NBA Emirates Cup" are kept. A "season" here is the calendar
    year of the game date.
    NOTE(review): an NBA season spans two calendar years, so a calendar-year
    "season" mixes the end of one season with the start of the next -- confirm
    this is the intended grouping.

    Args:
        player_stats_path: Path to the per-game player statistics CSV.
        team_stats_path: Path to the per-game team statistics CSV.

    Returns:
        (player_season_stats, team_season_records):
        * player_season_stats has one row per (personId, season) with summed
          box-score totals, a game count in 'gameId', shooting percentages,
          and 'team_full_name' formatted as "<city> <name> (<season>)".
          The team is the one on the player's first row of that season.
        * team_season_records has one row per (team, season) with 'wins',
          'total_games', 'win_pct' and a 'team_full_name' in the same
          "<city> <name> (<season>)" format, so both frames can be joined
          on that column.
    """
    # Load player box scores; .copy() so the column assignment below operates
    # on an independent frame rather than a filtered view.
    player_stats = (
        pd.read_csv(player_stats_path, low_memory=False)
        .query('gameDate >= "2015-01-01" and gameType in ["Regular Season", "NBA Cup", "NBA Emirates Cup"]')
        .copy()
    )
    player_stats['season'] = pd.to_datetime(player_stats['gameDate']).dt.year

    # Restrict team rows to the games that survived the player-level filter.
    reg_season_ids = player_stats['gameId'].unique()
    team_stats = (
        pd.read_csv(team_stats_path, low_memory=False)
        .query('gameDate >= "2015-01-01" and gameId in @reg_season_ids')
        .copy()
    )

    # Give every team game the same season label as the player rows of that
    # game, so team and player season keys are guaranteed to agree.
    season_by_game = player_stats.drop_duplicates('gameId').set_index('gameId')['season']
    team_stats['season'] = team_stats['gameId'].map(season_by_game).astype(int)

    # Aggregate player data to season level
    identity_cols = ['firstName', 'lastName', 'playerteamCity', 'playerteamName']
    summed_cols = [
        'numMinutes', 'points', 'assists',
        'reboundsTotal', 'reboundsOffensive', 'reboundsDefensive',
        'steals', 'blocks', 'fieldGoalsAttempted', 'fieldGoalsMade',
        'threePointersAttempted', 'threePointersMade', 'freeThrowsAttempted',
        'freeThrowsMade', 'turnovers', 'foulsPersonal', 'plusMinusPoints',
    ]
    aggregations = {col: 'first' for col in identity_cols}
    aggregations['gameId'] = 'count'  # number of games played in the season
    aggregations.update({col: 'sum' for col in summed_cols})

    player_season_stats = (
        player_stats
        .groupby(['personId', 'season'])
        .agg(aggregations)
        .reset_index()
    )

    # Shooting percentages: a zero-attempt denominator is replaced by 1 so the
    # result is 0 (made is also 0) instead of a division by zero.
    for pct_col, made_col, attempted_col in (
        ('fg_percentage', 'fieldGoalsMade', 'fieldGoalsAttempted'),
        ('three_pt_percentage', 'threePointersMade', 'threePointersAttempted'),
        ('ft_percentage', 'freeThrowsMade', 'freeThrowsAttempted'),
    ):
        player_season_stats[pct_col] = (
            player_season_stats[made_col] / player_season_stats[attempted_col].replace(0, 1)
        )

    # Team identifier logic
    player_season_stats['team_full_name'] = (
        player_season_stats['playerteamCity'] + ' ' + player_season_stats['playerteamName']
        + ' (' + player_season_stats['season'].astype(str) + ')'
    )
    player_season_stats['team_id'] = pd.Categorical(player_season_stats['team_full_name']).codes + 1

    # Team data aggregation, per team AND season (previously wins were summed
    # over every year and the name lacked the season suffix, so it could never
    # be matched against the player frame's team_full_name).
    team_season_records = (
        team_stats
        .groupby(['teamCity', 'teamName', 'season'])
        .agg(wins=('win', 'sum'), total_games=('gameId', 'count'))
        .reset_index()
        .rename(columns={'teamCity': 'team_city', 'teamName': 'team_name'})
    )
    team_season_records['team_full_name'] = (
        team_season_records['team_city'] + ' ' + team_season_records['team_name']
        + ' (' + team_season_records['season'].astype(str) + ')'
    )
    team_season_records['team_id'] = pd.Categorical(team_season_records['team_full_name']).codes + 1
    team_season_records['win_pct'] = team_season_records['wins'] / team_season_records['total_games']

    return player_season_stats, team_season_records


def prepare_stan_data(player_data, team_data, min_minutes=0):
    """
    Prepares data for a hierarchical Stan model that predicts team wins.

    Args:
        player_data: Season-level player frame. Must contain 'team_full_name',
            'numMinutes' and the raw totals listed in ``stats_to_normalize``.
            It is not modified; a filtered copy is returned instead.
        team_data: Team frame with 'team_full_name' and 'wins'. Every team that
            appears in the filtered player data must have exactly one row here.
        min_minutes: Players with ``numMinutes <= min_minutes`` are dropped.
            The default of 0 removes players with no minutes, whose per-36
            rates would be a division by zero (inf/NaN).

    Returns:
        (stan_data, player_data_filtered): the dictionary of arrays/scalars to
        pass to Stan, and the filtered player frame with the added 'team_idx'
        and '*_p36' columns.

    Raises:
        ValueError: if team_data has duplicate 'team_full_name' values, or if a
            team present in the player data has no (non-null) 'wins' value.
    """
    # Work on a filtered copy of the argument. The previous version read a
    # global named `player_data_filtered`, which only existed through leftover
    # kernel state and ignored the `player_data` parameter altogether.
    keep = (player_data['numMinutes'] > min_minutes) & player_data['team_full_name'].notna()
    player_data_filtered = player_data.loc[keep].copy()

    # Create a consistent 1-based team index (order of first appearance)
    all_teams = player_data_filtered['team_full_name'].unique()
    team_map = {name: i + 1 for i, name in enumerate(all_teams)}
    player_data_filtered['team_idx'] = player_data_filtered['team_full_name'].map(team_map)

    # Team wins must be in the same order as the team index.
    wins_by_team = team_data.set_index('team_full_name')['wins']
    if not wins_by_team.index.is_unique:
        raise ValueError("team_data contains duplicate 'team_full_name' values")
    team_wins = wins_by_team.reindex(all_teams)
    missing_teams = [name for name, absent in zip(all_teams, team_wins.isna()) if absent]
    if missing_teams:
        # Casting NaN to int would silently hand garbage win counts to Stan.
        raise ValueError(
            "No wins found in team_data for teams: " + ", ".join(map(str, missing_teams))
        )

    # 1. Define which columns to convert into per-36-minute rates.
    stats_to_normalize = {
        'plusMinusPoints': 'plus_minus_p36',
        'points': 'points_p36',
        'assists': 'assists_p36',
        'reboundsOffensive': 'oreb_p36',
        'reboundsDefensive': 'dreb_p36',
        'steals': 'steals_p36',
        'blocks': 'blocks_p36',
        'turnovers': 'turnovers_p36',
        'fieldGoalsAttempted': 'fga_p36',
        'freeThrowsAttempted': 'fta_p36'
    }

    # 2. Calculate the rate for each stat.
    minutes = player_data_filtered['numMinutes']
    for original_col, new_col in stats_to_normalize.items():
        player_data_filtered[new_col] = (player_data_filtered[original_col] / minutes) * 36

    # 3. Create the final dictionary for Stan.
    stan_data = {
        'N_players': len(player_data_filtered),
        'N_teams': len(team_map),

        'player_team_id': player_data_filtered['team_idx'].values,

        # Target Variable
        'plus_minus_p36': player_data_filtered['plus_minus_p36'].values,
        'total_minutes': player_data_filtered['numMinutes'].values,

        # Predictor Variables
        'points_p36': player_data_filtered['points_p36'].values,
        'assists_p36': player_data_filtered['assists_p36'].values,
        'oreb_p36': player_data_filtered['oreb_p36'].values,
        'dreb_p36': player_data_filtered['dreb_p36'].values,
        'steals_p36': player_data_filtered['steals_p36'].values,
        'blocks_p36': player_data_filtered['blocks_p36'].values,
        'turnovers_p36': player_data_filtered['turnovers_p36'].values,
        'fga_p36': player_data_filtered['fga_p36'].values,
        'fta_p36': player_data_filtered['fta_p36'].values,
        'team_wins': team_wins.to_numpy().astype(int)
    }

    return stan_data, player_data_filtered


# Build the season-level player/team frames, then assemble the Stan inputs.
player_df, team_df = load_and_prepare_data()
stan_data_final, player_data_filtered = prepare_stan_data(
    player_data=player_df,
    team_data=team_df,
)

# Confirm completion and list the entries that will be handed to Stan.
print("Stan data prepared successfully for Per-36 model!")
print("Keys in dictionary:", stan_data_final.keys())

  player_stats = pd.read_csv('/Users/reececalvin/heat/PlayerStatistics.csv').query('gameDate >= "2015-01-01" and gameType in ["Regular Season", "NBA Cup", "NBA Emirates Cup"]')


Stan data prepared successfully for Per-36 model!
Keys in dictionary: dict_keys(['N_players', 'N_teams', 'player_team_id', 'plus_minus_p36', 'total_minutes', 'points_p36', 'assists_p36', 'oreb_p36', 'dreb_p36', 'steals_p36', 'blocks_p36', 'turnovers_p36', 'fga_p36', 'fta_p36', 'team_wins'])


In [194]:
# Persist the filtered player-season frame (without the index column).
player_data_filtered.to_csv(
    path_or_buf='player_season_stats_filtered.csv',
    index=False,
)

In [195]:
import json

# Write the Stan inputs as JSON; NumPy arrays are not JSON-serializable, so
# they are converted to plain lists while other values pass through as-is.
with open('stan_data_final.json', 'w') as f:
    stan_json_payload = {}
    for key, value in stan_data_final.items():
        if isinstance(value, np.ndarray):
            stan_json_payload[key] = value.tolist()
        else:
            stan_json_payload[key] = value
    json.dump(stan_json_payload, f)