In [None]:
import pandas as pd
import numpy as np
import json

def convert_height(h):
    """Convert a height value in one of several formats to inches.

    Accepted inputs:
      * feet-inches strings such as ``'6-10'`` -> ``82`` (int)
      * centimetre strings such as ``'208cm'`` -> inches, rounded to 0.1
      * anything else ``float()`` accepts (numbers, numeric strings); these
        are returned as a float with NO unit conversion applied

    String inputs are matched case-insensitively and surrounding whitespace
    is ignored.

    Returns:
        The height as a number, or ``np.nan`` when the value cannot be
        parsed (including ``None``).
    """
    try:
        if isinstance(h, str):
            text = h.strip().lower()
            if '-' in text:  # '6-10' format
                ft, inch = text.split('-')
                return int(ft) * 12 + int(inch)
            if 'cm' in text:  # Metric system
                return round(float(text.replace('cm', '')) / 2.54, 1)
            return float(text)
        return float(h)
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures map to NaN; interrupts and other unexpected
        # errors are allowed to propagate instead of being silently hidden.
        return np.nan

def nba_data_pipeline():
    """Build ``nba_processed1.json`` from the raw NBA CSV exports.

    Reads ``Seasons_Stats.csv``, ``Players.csv``, ``shot_logs.csv`` and
    ``teams.csv`` from the current working directory, cleans each table,
    left-joins player attributes and per-player shot records onto the
    season stats (``Year >= 2000``, standard positions only) and writes the
    result to ``nba_processed1.json`` as a list of records.

    Returns:
        None. Progress is reported via ``print``. If any CSV cannot be
        loaded the error is printed and the function returns early without
        writing any output.
    """
    # Rows whose position is not one of these are dropped from the output.
    valid_positions = ['G', 'PG', 'SG', 'F', 'PF', 'SF', 'C']

    # ----------------------------
    # 1. Load Data with Validation
    # ----------------------------
    try:
        stats = pd.read_csv('Seasons_Stats.csv')
        players = pd.read_csv('Players.csv')
        shot_logs = pd.read_csv('shot_logs.csv')
        teams = pd.read_csv('teams.csv')  # Conference data
        print(f"Loaded {len(stats)} stats, {len(players)} players, {len(shot_logs)} shots, {len(teams)} teams")
    except Exception as e:
        # Top-level boundary: report the load failure and stop.
        print(f"Error: {str(e)}")
        return

    # ----------------------------
    # 2. Create Conference Mapping
    # ----------------------------
    team_to_conf = teams.set_index('abbreviation')['conference'].to_dict()

    # ----------------------------
    # 3. Clean Players Data
    # ----------------------------
    players_clean = (
        players.rename(columns={'collage': 'college'})  # fix misspelled source column
        .assign(
            height=lambda df: df['height'].apply(convert_height),
            weight=lambda df: pd.to_numeric(df['weight'], errors='coerce'),
            born=lambda df: pd.to_numeric(df['born'], errors='coerce')
        )
        .dropna(subset=['height', 'weight'])  # Remove players missing vital stats
        [['Player', 'height', 'weight', 'college', 'born']]
        .drop_duplicates('Player')  # one row per name keeps the later merge 1:1
    )
    print(f"Cleaned players: {len(players_clean)}")

    # ----------------------------
    # 4. Process Season Stats (with Conference)
    # ----------------------------
    stats['Year'] = pd.to_numeric(stats['Year'], errors='coerce')

    stats_clean = (
        stats.dropna(subset=['Player', 'Year'])
        .query("Year >= 2000")
        .rename(columns={
            'G': 'Games',
            'MP': 'Minutes',
            'TRB': 'Rebounds'
        })
        .assign(
            # replace(0, ...) guards the divisions against zero denominators.
            PPG=lambda df: df['PTS'] / df['Games'].replace(0, 1),
            Efficiency=lambda df: (df['PTS'] + df['AST'] + df['Rebounds']) /
                                (df['Minutes'].replace(0, 0.1) + 0.1),
            # Team codes missing from teams.csv map to 'Unknown'.
            Conference=lambda df: df['Tm'].map(team_to_conf).fillna('Unknown')
        )
        [['Player', 'Pos', 'Age', 'Tm', 'Conference', 'Games', 'Minutes',
          'PPG', 'AST', 'Rebounds', 'STL', 'BLK', 'Efficiency']]
    )
    print(f"Cleaned stats: {len(stats_clean)}")

    # ----------------------------
    # 5. Process Shot Logs
    # ----------------------------
    shots_by_player = {}
    if not shot_logs.empty:
        shot_mapping = {
            'player_name': 'Player',
            'SHOT_RESULT': 'made',
            'SHOT_DIST': 'distance',
            'CLOSE_DEF_DIST': 'defender_dist'
        }
        shots_renamed = (
            shot_logs.rename(columns=shot_mapping)
            .assign(made=lambda df: df['made'] == 'made')
        )
        # Iterate the groups explicitly rather than using
        # DataFrameGroupBy.apply: applying over the grouping column is
        # deprecated in pandas, and explicit iteration keeps the 'Player'
        # key inside every shot record regardless of pandas version.
        shots_by_player = {
            player: group.to_dict('records')
            for player, group in shots_renamed.groupby('Player')
        }

    if shots_by_player:
        shots_clean = pd.DataFrame({
            'Player': list(shots_by_player.keys()),
            'shots': list(shots_by_player.values()),
        })
    else:
        shots_clean = pd.DataFrame(columns=['Player', 'shots'])
    print(f"Cleaned shots: {len(shots_clean)}")

    # ----------------------------
    # 6. Merge Data
    # ----------------------------
    # Filter positions before merging: same resulting rows, less merge work.
    eligible = stats_clean[stats_clean['Pos'].isin(valid_positions)]

    # NOTE(review): bare numeric heights pass through convert_height without
    # unit conversion. If Players.csv stores centimetres / kilograms (the
    # clamped sample values 96.0 / 150.0 suggest so -- confirm), every real
    # value is pinned to the clip bounds below; convert units before clipping.
    merged = (
        eligible.merge(players_clean, on='Player', how='left')
        .merge(shots_clean, on='Player', how='left')
        .assign(
            height=lambda df: df['height'].fillna(78).clip(60, 96),
            weight=lambda df: df['weight'].fillna(220).clip(150, 350),
            born=lambda df: df['born'].fillna(1990).astype(int).clip(1950, 2010),
            # Unmatched players get NaN from the left merge; normalise to [].
            shots=lambda df: df['shots'].apply(lambda x: x if isinstance(x, list) else [])
        )
    )
    print(f"Final merged data: {len(merged)}")

    # ----------------------------
    # 7. Export to JSON
    # ----------------------------
    merged.to_json('nba_processed1.json', orient='records', indent=2)
    if merged.empty:
        # iloc[0] would raise IndexError on an empty result.
        print("Processing complete. No rows matched the filters.")
    else:
        print("Processing complete. Sample data:", merged.iloc[0].to_dict())

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    nba_data_pipeline()

Loaded 24691 stats, 3922 players, 128069 shots, 30 teams
Cleaned players: 3921
Cleaned stats: 10204
Cleaned shots: 281
Final merged data: 10049
Processing complete. Sample data: {'Player': 'Tariq Abdul-Wahad', 'Pos': 'SG', 'Age': 25.0, 'Tm': 'TOT', 'Conference': 'Unknown', 'Games': 61.0, 'Minutes': 1578.0, 'PPG': 11.426229508196721, 'AST': 98.0, 'Rebounds': 291.0, 'STL': 59.0, 'BLK': 28.0, 'Efficiency': 0.6881693175337431, 'height': 96.0, 'weight': 150.0, 'college': 'San Jose State University', 'born': 1974, 'shots': []}
