In [None]:
# Import necessary libraries
import pandas as pd
from nba_api.stats.endpoints import leaguedashplayerstats
import time

# Seasons to analyze, written in the "YYYY-YY" form passed to the endpoint.
seasons = ['2022-23', '2023-24', '2024-25']

# One DataFrame per successfully fetched season. They are concatenated once
# after the loop, because calling pd.concat inside the loop re-copies every
# previously fetched row on each iteration.
season_frames = []
# Seasons whose request raised, reported at the end so that a partial
# dataset is not mistaken for a complete one.
failed_seasons = []

print("Fetching data for seasons...")
for season in seasons:
    try:
        print(f"-> Processing {season}...")
        # Per-player stats for the season (default measure type), scaled
        # per 100 possessions.
        player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
            season=season,
            per_mode_detailed='Per100Possessions', # We want possession-adjusted stats!
            season_type_all_star='Regular Season',
            # Restrict the request to the NBA ('00'). With no league given,
            # the recorded output of this cell contained non-NBA rows
            # (e.g. team abbreviations LVA and SCW).
            league_id_nullable='00',
        )

        # Convert the result to a pandas DataFrame
        season_df = player_stats.get_data_frames()[0]
        season_df['SEASON'] = season # Add a season column for reference
        season_frames.append(season_df)

    except Exception as e:
        # Best effort: keep going with the remaining seasons, but remember
        # which ones are missing.
        failed_seasons.append(season)
        print(f"Error fetching data for {season}: {e}")

    finally:
        # Be a good API citizen, also after a failed request
        time.sleep(1)

# pd.concat raises on an empty list, so fall back to an empty frame when
# every season failed.
all_players_df = (
    pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
)

print("Data fetching complete.")
if failed_seasons:
    print(f"WARNING: no data was fetched for: {', '.join(failed_seasons)}")
print(f"Shape of the raw data: {all_players_df.shape}")
all_players_df.head()

Fetching data for seasons...
-> Processing 2022-23...
-> Processing 2023-24...
-> Processing 2024-25...
Data fetching complete.
Shape of the raw data: (4623, 68)


Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,SEASON
0,1962938000.0,,,1610613000.0,DEN,,3,1,2,0.333,...,736,557,1326,597,1188,568,64,1354,1,2022-23
1,196294100.0,,,1610613000.0,DAL,,2,0,2,0.0,...,30,1442,1462,56,451,568,64,898,1,2022-23
2,1628932.0,A'ja Wilson,A'ja,1611661000.0,LVA,26.0,36,26,10,0.722,...,1047,74,93,81,14,42,64,27,1,2022-23
3,1629467.0,A.J. Hess,A.J.,1612710000.0,SCW,29.0,1,1,0,1.0,...,1221,1442,1479,661,1412,568,64,1465,1,2022-23
4,1630639.0,A.J. Lawson,A.J.,1610613000.0,DAL,22.0,20,5,15,0.25,...,557,577,432,1388,880,404,64,647,2,2022-23


In [None]:
# Convert relevant columns to numeric, errors='coerce' will turn non-numeric values into NaN
numeric_cols = ['MIN', 'GP', 'PTS', 'AST', 'REB', 'STL', 'BLK', 'FGA', 'FG3A']
all_players_df[numeric_cols] = all_players_df[numeric_cols].apply(
    pd.to_numeric, errors='coerce'
)

# all_players_df was requested with per_mode_detailed='Per100Possessions', so
# its MIN column is not a season total: thresholding it at 800 kept 0 rows in
# the recorded run of this cell. Fetch season-total minutes from a 'Totals'
# request and filter on those instead.
MINUTES_KEYS = ['PLAYER_ID', 'TEAM_ID', 'SEASON']
total_minutes_frames = []
# Only ask for seasons that are actually present in the per-100 data.
for season in all_players_df['SEASON'].unique():
    season_totals = leaguedashplayerstats.LeagueDashPlayerStats(
        season=season,
        per_mode_detailed='Totals',
        season_type_all_star='Regular Season',
        league_id_nullable='00',  # NBA only
    ).get_data_frames()[0]
    season_totals['SEASON'] = season
    total_minutes_frames.append(
        season_totals[MINUTES_KEYS + ['MIN']].rename(columns={'MIN': 'MIN_TOTAL'})
    )
    time.sleep(1)  # Be a good API citizen

total_minutes_df = pd.concat(total_minutes_frames, ignore_index=True)
total_minutes_df['MIN_TOTAL'] = pd.to_numeric(total_minutes_df['MIN_TOTAL'], errors='coerce')

# Left join so every per-100 row is kept; rows with no matching NBA totals
# row get MIN_TOTAL = NaN and therefore fail the threshold below.
# validate= makes the merge raise instead of silently duplicating rows if the
# totals turn out not to be unique per key.
players_with_minutes_df = all_players_df.merge(
    total_minutes_df, on=MINUTES_KEYS, how='left', validate='many_to_one'
)

# Filter for players who played a minimum number of total minutes in a season
MIN_THRESHOLD = 800 # Roughly 10 minutes per game over a full season
qualified_players_df = players_with_minutes_df[
    players_with_minutes_df['MIN_TOTAL'] >= MIN_THRESHOLD
].copy()

print(f"Shape after filtering for MIN >= {MIN_THRESHOLD}: {qualified_players_df.shape}")

# An empty result means the filter (or the minutes join) is wrong; fail here
# rather than let later cells run on no data.
assert not qualified_players_df.empty, "minutes filter removed every player"

# Check for any remaining missing values
print("\nMissing values check:")
print(qualified_players_df.isnull().sum().sort_values(ascending=False).head())

Shape after filtering for MIN >= 800: (0, 68)

Missing values check:
PLAYER_ID            0
PLAYER_NAME          0
NICKNAME             0
TEAM_ID              0
TEAM_ABBREVIATION    0
dtype: int64
