# NBA Data

In [1]:
!pip install nba_api

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
!pip show nba_api

Name: nba_api
Version: 1.8.0
Summary: An API Client package to access the APIs for NBA.com
Home-page: 
Author: Swar Patel
Author-email: <swar.m.patel@gmail.com>
License: MIT
Location: /Users/lucas007/miniconda3/lib/python3.12/site-packages
Requires: numpy, requests
Required-by: 


### Import the packages

In [10]:
from nba_api.stats.endpoints import commonplayerinfo, playercareerstats, leaguegamefinder
from nba_api.stats.static import players
import pandas as pd
import time

In [11]:
# Pull the roster of currently active NBA players from nba_api's static data
nba_players = players.get_active_players()

# Tabulate the roster so it can be inspected and merged later
players_df = pd.DataFrame(data=nba_players)

# Show the first 20 rows as a sanity check
players_df.head(n=20)

Unnamed: 0,id,full_name,first_name,last_name,is_active
0,1630173,Precious Achiuwa,Precious,Achiuwa,True
1,203500,Steven Adams,Steven,Adams,True
2,1628389,Bam Adebayo,Bam,Adebayo,True
3,1630534,Ochai Agbaji,Ochai,Agbaji,True
4,1630583,Santi Aldama,Santi,Aldama,True
5,1641725,Trey Alexander,Trey,Alexander,True
6,1629638,Nickeil Alexander-Walker,Nickeil,Alexander-Walker,True
7,1628960,Grayson Allen,Grayson,Allen,True
8,1628386,Jarrett Allen,Jarrett,Allen,True
9,1630631,Jose Alvarado,Jose,Alvarado,True


In [12]:
# Summarize the roster frame: row count, columns, dtypes, and non-null counts
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          565 non-null    int64 
 1   full_name   565 non-null    object
 2   first_name  565 non-null    object
 3   last_name   565 non-null    object
 4   is_active   565 non-null    bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 18.3+ KB


In [8]:
def get_player_stats(player_id):
    """Fetch the career stats table for one player.

    Args:
        player_id: Player identifier passed to ``PlayerCareerStats``.

    Returns:
        The first data frame returned by the endpoint, with a ``PLAYER_ID``
        column set to ``player_id``; ``None`` if the request or the parsing
        raised an exception.
    """
    try:
        career = playercareerstats.PlayerCareerStats(player_id=player_id)
        df = career.get_data_frames()[0]
        df['PLAYER_ID'] = player_id
        return df
    except Exception:
        # Best-effort: report failure as None, but let KeyboardInterrupt and
        # SystemExit propagate so a long download can still be interrupted.
        return None

In [None]:
import pandas as pd
import time
import os
from tqdm import tqdm
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.static import players


# ---------------------------------------
# Load Player List
# ---------------------------------------
players_df = pd.DataFrame(players.get_active_players()).rename(columns={'id': 'PLAYER_ID'})

# ---------------------------------------
# Resume From Cached Results (if available)
# ---------------------------------------
# Start with nothing fetched; a non-empty cache file overrides these defaults.
all_stats = []
done_ids = set()
if os.path.exists("career_stats_cached.csv"):
    cached_df = pd.read_csv("career_stats_cached.csv")
    if not cached_df.empty:
        all_stats = [cached_df]
        done_ids = set(cached_df['PLAYER_ID'].unique())

# Collects player IDs for which no stats were obtained
failed_ids = []

# ---------------------------------------
# Function with Retry
# ---------------------------------------
def get_player_stats(player_id, retries=3, delay=5):
    """Fetch one player's career stats, retrying when the request fails.

    Args:
        player_id: Player identifier passed to ``PlayerCareerStats``.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The first data frame returned by the endpoint, with a ``PLAYER_ID``
        column set to ``player_id``; ``None`` if every attempt raised.
    """
    for attempt in range(retries):
        try:
            career = playercareerstats.PlayerCareerStats(player_id=player_id)
            df = career.get_data_frames()[0]
            df['PLAYER_ID'] = player_id
            return df
        except Exception as e:
            print(f"Attempt {attempt+1} failed for {player_id}: {e}")
            # Wait only when another attempt follows; sleeping after the last
            # failure would just stall the caller for no benefit.
            if attempt < retries - 1:
                time.sleep(delay)
    return None  # All attempts failed

# ---------------------------------------
# Loop Over All Players
# ---------------------------------------
for pid in tqdm(players_df['PLAYER_ID'], total=len(players_df)):
    # Skip players whose stats are already in the cache
    if pid in done_ids:
        continue

    stats = get_player_stats(pid)
    fetched = stats is not None and not stats.empty

    if not fetched:
        failed_ids.append(pid)
    else:
        all_stats.append(stats)

        # Checkpoint: rewrite the cache file with every non-empty frame so far
        to_save = [frame for frame in all_stats if frame is not None and not frame.empty]
        if to_save:
            pd.concat(to_save, ignore_index=True).to_csv("career_stats_cached.csv", index=False)

    # Pause between players to avoid API throttling
    time.sleep(1.5)

# ---------------------------------------
# Persist Failed IDs for a Later Retry
# ---------------------------------------
if failed_ids:
    with open("failed_ids.txt", "w") as f:
        f.writelines(f"{pid}\n" for pid in failed_ids)

  0%|                                                   | 0/565 [00:00<?, ?it/s]

In [15]:
# Load the career stats from the CSV cache on disk
career_stats_df = pd.read_csv("career_stats_cached.csv")
# Preview the first rows
career_stats_df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,1630173,2020-21,0,1610612748,MIA,21.0,61,4,737.0,124,...,0.509,73,135,208,29,20,28,43,91,304
1,1630173,2021-22,0,1610612761,TOR,22.0,73,28,1725.0,265,...,0.595,146,327,473,82,37,41,84,151,664
2,1630173,2022-23,0,1610612761,TOR,23.0,55,12,1141.0,196,...,0.702,100,228,328,50,31,30,59,102,508
3,1630173,2023-24,0,1610612761,TOR,24.0,25,0,437.0,78,...,0.571,50,86,136,44,16,12,29,40,193
4,1630173,2023-24,0,1610612752,NYK,24.0,49,18,1187.0,157,...,0.643,141,210,351,53,30,56,54,103,372


In [17]:
# Make sure the roster key is named 'PLAYER_ID' (no-op if already renamed)
players_df = players_df.rename(columns={'id': 'PLAYER_ID'})

# Attach roster columns to every season row. validate= makes pandas raise if a
# PLAYER_ID appears more than once in the roster, which would otherwise
# silently duplicate season rows.
merged_df = pd.merge(
    career_stats_df,
    players_df,
    on='PLAYER_ID',
    how='left',
    validate='many_to_one',
)

# Preview result
merged_df.head(20)

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,AST,STL,BLK,TOV,PF,PTS,full_name,first_name,last_name,is_active
0,1630173,2020-21,0,1610612748,MIA,21.0,61,4,737.0,124,...,29,20,28,43,91,304,Precious Achiuwa,Precious,Achiuwa,True
1,1630173,2021-22,0,1610612761,TOR,22.0,73,28,1725.0,265,...,82,37,41,84,151,664,Precious Achiuwa,Precious,Achiuwa,True
2,1630173,2022-23,0,1610612761,TOR,23.0,55,12,1141.0,196,...,50,31,30,59,102,508,Precious Achiuwa,Precious,Achiuwa,True
3,1630173,2023-24,0,1610612761,TOR,24.0,25,0,437.0,78,...,44,16,12,29,40,193,Precious Achiuwa,Precious,Achiuwa,True
4,1630173,2023-24,0,1610612752,NYK,24.0,49,18,1187.0,157,...,53,30,56,54,103,372,Precious Achiuwa,Precious,Achiuwa,True
5,1630173,2023-24,0,0,TOT,24.0,74,18,1624.0,235,...,97,46,68,83,143,565,Precious Achiuwa,Precious,Achiuwa,True
6,1630173,2024-25,0,1610612752,NYK,25.0,57,10,1170.0,164,...,55,47,42,45,81,379,Precious Achiuwa,Precious,Achiuwa,True
7,203500,2013-14,0,1610612760,OKC,20.0,81,20,1197.0,93,...,43,40,57,71,203,265,Steven Adams,Steven,Adams,True
8,203500,2014-15,0,1610612760,OKC,21.0,70,67,1771.0,217,...,66,38,86,99,222,537,Steven Adams,Steven,Adams,True
9,203500,2015-16,0,1610612760,OKC,22.0,80,80,2014.0,261,...,62,42,89,84,223,636,Steven Adams,Steven,Adams,True


In [19]:
# Inspect columns, dtypes, and non-null counts of the merged frame
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542 entries, 0 to 2541
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PLAYER_ID          2542 non-null   int64  
 1   SEASON_ID          2542 non-null   object 
 2   LEAGUE_ID          2542 non-null   int64  
 3   TEAM_ID            2542 non-null   int64  
 4   TEAM_ABBREVIATION  2542 non-null   object 
 5   PLAYER_AGE         2542 non-null   float64
 6   GP                 2542 non-null   int64  
 7   GS                 2542 non-null   int64  
 8   MIN                2542 non-null   float64
 9   FGM                2542 non-null   int64  
 10  FGA                2542 non-null   int64  
 11  FG_PCT             2542 non-null   float64
 12  FG3M               2542 non-null   int64  
 13  FG3A               2542 non-null   int64  
 14  FG3_PCT            2542 non-null   float64
 15  FTM                2542 non-null   int64  
 16  FTA                2542 

### Calculate per-game averages

In [20]:
# Columns holding season totals; each gets a per-game counterpart later on
total_stats = (
    ['MIN']
    + ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA']  # FG* / FT* columns
    + ['OREB', 'DREB', 'REB']                       # *REB columns
    + ['AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
)

In [21]:
# Add a '<stat>_PG' column for every total by dividing it by the GP column
gp_column = merged_df['GP']
for stat_name in total_stats:
    merged_df[f'{stat_name}_PG'] = merged_df[stat_name].div(gp_column)

In [23]:
# Spot-check a few per-game columns alongside player name, season, and GP
merged_df[['full_name', 'SEASON_ID', 'GP', 'PTS_PG', 'REB_PG', 'AST_PG']].head(10)

Unnamed: 0,full_name,SEASON_ID,GP,PTS_PG,REB_PG,AST_PG
0,Precious Achiuwa,2020-21,61,4.983607,3.409836,0.47541
1,Precious Achiuwa,2021-22,73,9.09589,6.479452,1.123288
2,Precious Achiuwa,2022-23,55,9.236364,5.963636,0.909091
3,Precious Achiuwa,2023-24,25,7.72,5.44,1.76
4,Precious Achiuwa,2023-24,49,7.591837,7.163265,1.081633
5,Precious Achiuwa,2023-24,74,7.635135,6.581081,1.310811
6,Precious Achiuwa,2024-25,57,6.649123,5.561404,0.964912
7,Steven Adams,2013-14,81,3.271605,4.098765,0.530864
8,Steven Adams,2014-15,70,7.671429,7.471429,0.942857
9,Steven Adams,2015-16,80,7.95,6.6625,0.775


In [None]:
# Remove the season-total columns listed in total_stats. Reassigning (instead
# of inplace=True) with errors='ignore' lets this cell be re-run without a
# KeyError once the columns are already gone.
merged_df = merged_df.drop(columns=total_stats, errors='ignore')

In [30]:
# Drop columns not needed in the exported table. Reassigning (instead of
# inplace=True) with errors='ignore' keeps the cell safe to re-run.
merged_df = merged_df.drop(
    columns=['first_name', 'last_name', 'is_active', 'LEAGUE_ID'],
    errors='ignore',
)

The computed per-game stats were checked against the official NBA stats and match.

In [31]:
# Confirm the remaining columns and dtypes after the drops
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542 entries, 0 to 2541
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PLAYER_ID          2542 non-null   int64  
 1   SEASON_ID          2542 non-null   object 
 2   TEAM_ID            2542 non-null   int64  
 3   TEAM_ABBREVIATION  2542 non-null   object 
 4   PLAYER_AGE         2542 non-null   float64
 5   GP                 2542 non-null   int64  
 6   GS                 2542 non-null   int64  
 7   FG_PCT             2542 non-null   float64
 8   FG3_PCT            2542 non-null   float64
 9   FT_PCT             2542 non-null   float64
 10  full_name          2542 non-null   object 
 11  MIN_PG             2542 non-null   float64
 12  FGM_PG             2542 non-null   float64
 13  FGA_PG             2542 non-null   float64
 14  FG3M_PG            2542 non-null   float64
 15  FG3A_PG            2542 non-null   float64
 16  FTM_PG             2542 

In [32]:
# Write the per-game table to CSV, omitting the DataFrame index
merged_df.to_csv("nba_career_stats_per_game.csv", index=False)