In [87]:
# Install necessary libraries (run once)
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a later re-run may pull different releases.
%pip install nba_api pandas scikit-learn numpy xgboost

Note: you may need to restart the kernel to use updated packages.


In [88]:
# Import libraries
import pandas as pd
from nba_api.stats.endpoints import playergamelog, commonplayerinfo, leaguegamefinder, leaguedashteamstats
from nba_api.stats.static import players, teams
import time 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

In [89]:
# Get all players
# `players.get_players()` comes from nba_api's static module; its result is wrapped
# in a DataFrame so later cells can filter on the `is_active` column.
nba_players = players.get_players()
players_df = pd.DataFrame(nba_players)
print(f"Total players found: {len(players_df)}")
# Last expression of the cell: rendered as a preview table.
players_df.head()

Total players found: 5024


Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False


In [90]:
# Get all teams
# `teams.get_teams()` comes from nba_api's static module; the resulting frame
# (`id`, `full_name`, `abbreviation`, ...) is the lookup table the defensive-stats
# cell uses to translate team identifiers into abbreviations.
nba_teams = teams.get_teams()
teams_df = pd.DataFrame(nba_teams)
print(f"Total teams found: {len(teams_df)}")
# Last expression of the cell: rendered as a preview table.
teams_df.head()

Total teams found: 30


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [91]:
# Fetch Team Defensive Stats for the season
#
# Result: `team_def_stats_df` with columns ['TEAM_ABBREVIATION', 'DEF_RATING'],
# or an empty DataFrame when the stats cannot be fetched or mapped.
print("Fetching team defensive stats...")
team_def_stats_df = pd.DataFrame()
try:
    team_stats = leaguedashteamstats.LeagueDashTeamStats(
        season='2023-24',
        measure_type_detailed_defense='Defense'
    )
    temp_df = team_stats.get_data_frames()[0]

    print("Available columns in team stats:")
    print(temp_df.columns)

    if 'TEAM_ABBREVIATION' in temp_df.columns:
        # Already keyed by abbreviation: no lookup against teams_df is needed.
        team_def_stats_df = temp_df[['TEAM_ABBREVIATION', 'DEF_RATING']].copy()
    else:
        # Translate the endpoint's team identifier into an abbreviation using
        # teams_df (built in the "Get all teams" cell; if that cell has not run,
        # the NameError is reported by the except block below).
        if 'TEAM_ID' in temp_df.columns:
            # Prefer the numeric id over the display name: names are not
            # guaranteed to be spelled identically in both sources
            # (e.g. "LA Clippers" vs "Los Angeles Clippers"), and a name
            # mismatch would silently leave that team without an abbreviation.
            stat_keys = temp_df['TEAM_ID'].astype(int)
            abbreviation_by_key = pd.Series(
                teams_df['abbreviation'].values,
                index=teams_df['id'].astype(int),
            )
        elif 'TEAM_NAME' in temp_df.columns:
            stat_keys = temp_df['TEAM_NAME']
            abbreviation_by_key = pd.Series(
                teams_df['abbreviation'].values,
                index=teams_df['full_name'],
            )
        else:
            raise KeyError(f"Could not find a suitable team identifier column. Available: {temp_df.columns}")

        team_def_stats_df = pd.DataFrame({
            'TEAM_ABBREVIATION': stat_keys.map(abbreviation_by_key),
            'DEF_RATING': temp_df['DEF_RATING'],
        })

        # Rows that could not be translated are useless as merge keys; drop them
        # loudly instead of carrying NaN abbreviations forward.
        unmapped = team_def_stats_df['TEAM_ABBREVIATION'].isna()
        if unmapped.any():
            print(f"Warning: {int(unmapped.sum())} team(s) could not be mapped to an abbreviation and were dropped.")
            team_def_stats_df = team_def_stats_df[~unmapped].reset_index(drop=True)

    if team_def_stats_df.empty:
        print("\nCould not process team defensive stats correctly.")
    else:
        print("\nTeam defensive stats processed.")
        print(team_def_stats_df.head())

except Exception as e:
    # Best effort: downstream cells check for an empty frame and fall back
    # to a constant defensive rating.
    print(f"\nError fetching or processing team defensive stats: {e}")
    team_def_stats_df = pd.DataFrame()

Fetching team defensive stats...
Available columns in team stats:
Index(['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'DEF_RATING',
       'DREB', 'DREB_PCT', 'STL', 'BLK', 'OPP_PTS_OFF_TOV',
       'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'DEF_RATING_RANK',
       'DREB_RANK', 'DREB_PCT_RANK', 'STL_RANK', 'BLK_RANK',
       'OPP_PTS_OFF_TOV_RANK', 'OPP_PTS_2ND_CHANCE_RANK', 'OPP_PTS_FB_RANK',
       'OPP_PTS_PAINT_RANK'],
      dtype='object')

Team defensive stats processed.
  TEAM_ABBREVIATION  DEF_RATING
0               ATL       118.4
1               BOS       110.6
2               BKN       115.4
3               CHA       119.2
4               CHI       115.7


In [92]:
# Function to get game logs for a player and season with delay
def get_player_log(player_id, season='2023-24', delay=0.6):
    """Fetch one player's game log for a season.

    Args:
        player_id: Player identifier passed to ``playergamelog.PlayerGameLog``.
        season: Season string passed to ``playergamelog.PlayerGameLog``.
        delay: Seconds to sleep after the request, whether it succeeded or
            failed, to space out consecutive calls.

    Returns:
        The first DataFrame returned by the endpoint, or an empty DataFrame
        if the request or parsing raised an exception (the error is printed,
        not re-raised, so one failing player does not stop a batch).
    """
    print(f"Fetching logs for player {player_id}...")
    try:
        log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        return log.get_data_frames()[0]
    except Exception as e:
        print(f"Error fetching logs for player {player_id}: {e}")
        return pd.DataFrame()
    finally:
        # Runs on both paths, replacing the duplicated sleep calls.
        time.sleep(delay)

# Get active players' IDs
active_players_df = players_df[players_df['is_active']]
active_player_ids = active_players_df['id'].tolist()

# --- Fetching game logs (can take time) ---
# Only the first 50 active players are fetched to keep the run short.
player_ids_to_fetch = active_player_ids[:50]

# Collect the per-player frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
fetched_logs = []
for p_id in player_ids_to_fetch:
    player_log_df = get_player_log(p_id, season='2023-24')
    if not player_log_df.empty:
        fetched_logs.append(player_log_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
all_gamelogs_df = pd.concat(fetched_logs, ignore_index=True) if fetched_logs else pd.DataFrame()

print(f"\nFetched logs for {len(player_ids_to_fetch)} players.")
print(f"Total game logs retrieved: {len(all_gamelogs_df)}")
all_gamelogs_df.head()

Fetching logs for player 1630173...
Fetching logs for player 203500...
Fetching logs for player 1628389...
Fetching logs for player 1630534...
Fetching logs for player 1630583...
Fetching logs for player 1641725...
Fetching logs for player 1629638...
Fetching logs for player 1628960...
Fetching logs for player 1628386...
Fetching logs for player 1630631...
Fetching logs for player 203937...
Fetching logs for player 203507...
Fetching logs for player 1630175...
Fetching logs for player 1628384...
Fetching logs for player 1642379...
Fetching logs for player 1630166...
Fetching logs for player 1629028...
Fetching logs for player 1630542...
Fetching logs for player 1628963...
Fetching logs for player 1631116...
Fetching logs for player 1630163...
Fetching logs for player 1628366...
Fetching logs for player 1628964...
Fetching logs for player 1631094...
Fetching logs for player 1630217...
Fetching logs for player 1630625...
Fetching logs for player 1631230...
Fetching logs for player 203084

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22023,1630173,22301190,"APR 14, 2024",NYK vs. CHI,W,19,2,3,0.667,...,4,5,2,0,1,2,3,4,-2,1
1,22023,1630173,22301175,"APR 12, 2024",NYK vs. BKN,W,8,2,2,1.0,...,3,3,0,0,0,0,0,5,3,1
2,22023,1630173,22301167,"APR 11, 2024",NYK @ BOS,W,16,1,6,0.167,...,3,5,0,0,1,1,0,2,-9,1
3,22023,1630173,22301139,"APR 07, 2024",NYK @ MIL,W,5,0,1,0.0,...,0,0,0,0,0,0,1,0,5,1
4,22023,1630173,22301119,"APR 05, 2024",NYK @ CHI,L,19,0,2,0.0,...,3,4,1,1,0,1,4,0,-2,1


In [93]:
# --- Data Preprocessing ---

# Select relevant columns (including FGA)
relevant_cols = ['Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
                 'MIN', 'PTS', 'REB', 'AST', 'FG3M', 'STL', 'BLK', 'TOV', 'FGA']

# Take the subset as an explicit copy so the column assignments below (and in
# later cells) never write through to all_gamelogs_df.
processed_df = all_gamelogs_df[relevant_cols].copy()

# GAME_DATE arrives as text such as "APR 14, 2024". Passing the format makes
# parsing explicit and consistent for every row instead of relying on
# per-element format guessing (the recorded run emitted a warning on this call).
processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'], format='%b %d, %Y')

def parse_matchup(matchup_str):
    """Split a matchup string into (opponent, home_away).

    "AAA @ BBB" yields ("BBB", "Away") and "AAA vs. BBB" yields
    ("BBB", "Home"). Whitespace around the separator is optional.

    Args:
        matchup_str: Matchup text such as "NYK @ BOS" or "NYK vs. CHI".

    Returns:
        A tuple ``(opponent, home_away)``. Both elements are "Unknown" when
        the value is not a string, has no recognised separator, or has
        nothing after the separator.
    """
    # Missing values (None / NaN) are not strings and cannot be searched.
    if not isinstance(matchup_str, str):
        return 'Unknown', 'Unknown'

    # '@' is checked first, matching the original precedence.
    for separator, home_away in (('@', 'Away'), ('vs.', 'Home')):
        _, found, opponent = matchup_str.partition(separator)
        opponent = opponent.strip()
        if found and opponent:
            return opponent, home_away

    return 'Unknown', 'Unknown'

# Expand MATCHUP into Opponent / Home_Away. Building one DataFrame from the
# parsed tuples avoids constructing a pd.Series per row inside apply(), and
# also works when there are no rows at all.
matchup_parts = pd.DataFrame(
    [parse_matchup(matchup) for matchup in processed_df['MATCHUP']],
    index=processed_df.index,
    columns=['Opponent', 'Home_Away'],
)
processed_df[['Opponent', 'Home_Away']] = matchup_parts

# Chronological order within each player is required by the shifted
# rolling/expanding features computed in the next cell.
processed_df = processed_df.sort_values(by=['Player_ID', 'GAME_DATE'])

processed_df.head()

  processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])


Unnamed: 0,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,REB,AST,FG3M,STL,BLK,TOV,FGA,Opponent,Home_Away
1656,201587,22300074,2023-10-25,LAC vs. POR,W,9,0,3,1,0,1,1,0,0,POR,Home
1655,201587,22300085,2023-10-27,LAC @ UTA,L,30,8,2,3,2,1,3,0,4,UTA,Away
1654,201587,22300099,2023-10-29,LAC vs. SAS,W,15,0,2,1,0,1,0,1,4,SAS,Home
1653,201587,22300148,2023-11-06,PHI vs. WAS,W,17,11,2,0,3,1,0,0,5,WAS,Home
1652,201587,22300159,2023-11-08,PHI vs. BOS,W,26,5,4,1,1,0,1,0,3,BOS,Home


In [94]:
# --- Feature Engineering ---
# Every feature for a game is shifted by one row within the player's history,
# so it is computed only from that player's EARLIER games.

# Rolling Averages (previous 3 and 5 games) for points, minutes and shot attempts.
# Creates PTS_Roll_3, PTS_Roll_5, MIN_Roll_3, MIN_Roll_5, FGA_Roll_3, FGA_Roll_5.
for stat in ('PTS', 'MIN', 'FGA'):
    for window in (3, 5):
        processed_df[f'{stat}_Roll_{window}'] = processed_df.groupby('Player_ID')[stat].transform(
            lambda x, w=window: x.rolling(window=w, min_periods=1).mean().shift(1)
        )

# Cumulative Season Averages (Shifted)
processed_df['Cum_PTS'] = processed_df.groupby('Player_ID')['PTS'].transform(lambda x: x.expanding().sum().shift(1))
processed_df['Cum_MIN'] = processed_df.groupby('Player_ID')['MIN'].transform(lambda x: x.expanding().sum().shift(1))
processed_df['Cum_Games'] = processed_df.groupby('Player_ID').cumcount() # Already shifted due to 0-based index

# Division by zero (no prior games / no prior minutes) yields NaN or inf;
# inf is mapped to 0 here and NaN is handled by the final fill below.
processed_df['Avg_PTS_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_Games']).replace([np.inf, -np.inf], 0)
processed_df['PTS_Per36_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_MIN'] * 36).replace([np.inf, -np.inf], 0)

# Other Features
processed_df['Is_Home'] = (processed_df['Home_Away'] == 'Home').astype(int)

# Merge Opponent Stats
if 'team_def_stats_df' in locals() and not team_def_stats_df.empty and 'TEAM_ABBREVIATION' in team_def_stats_df.columns:
    team_def_stats_to_merge = team_def_stats_df.rename(columns={
        'TEAM_ABBREVIATION': 'Opponent',
        'DEF_RATING': 'Opponent_DEF_RATING'
    })[['Opponent', 'Opponent_DEF_RATING']]
    # Drop the column left by a previous run of this cell; otherwise the merge
    # would produce Opponent_DEF_RATING_x / _y instead of a single column.
    processed_df = processed_df.drop(columns=['Opponent_DEF_RATING'], errors='ignore')
    processed_df = pd.merge(processed_df, team_def_stats_to_merge, on='Opponent', how='left')
    # Opponents with no rating get the league-average rating.
    avg_def_rating = team_def_stats_to_merge['Opponent_DEF_RATING'].mean()
    # Assign the result back: df[col].fillna(..., inplace=True) acts on a
    # temporary object and stops updating the frame in pandas 3.0.
    processed_df['Opponent_DEF_RATING'] = processed_df['Opponent_DEF_RATING'].fillna(avg_def_rating)
    print("Opponent defensive stats merged.")
else:
    print("Warning: Team defensive stats DataFrame not found or missing 'TEAM_ABBREVIATION'. Skipping merge.")
    processed_df['Opponent_DEF_RATING'] = 115.0

# Final Fill NA
# NOTE(review): this zero-fills each player's first game (no history yet), so
# a later dropna() on these feature columns will not remove those rows.
processed_df = processed_df.fillna(0)

processed_df.head(10)

Opponent defensive stats merged.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Opponent_DEF_RATING'].fillna(avg_def_rating, inplace=True)


Unnamed: 0,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,REB,AST,FG3M,...,MIN_Roll_5,FGA_Roll_3,FGA_Roll_5,Cum_PTS,Cum_MIN,Cum_Games,Avg_PTS_Season,PTS_Per36_Season,Is_Home,Opponent_DEF_RATING
0,201587,22300074,2023-10-25,LAC vs. POR,W,9,0,3,1,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1,116.6
1,201587,22300085,2023-10-27,LAC @ UTA,L,30,8,2,3,2,...,9.0,0.0,0.0,0.0,9.0,1,0.0,0.0,0,119.6
2,201587,22300099,2023-10-29,LAC vs. SAS,W,15,0,2,1,0,...,19.5,2.0,2.0,8.0,39.0,2,4.0,7.384615,1,115.6
3,201587,22300148,2023-11-06,PHI vs. WAS,W,17,11,2,0,3,...,18.0,2.666667,2.666667,8.0,54.0,3,2.666667,5.333333,1,118.9
4,201587,22300159,2023-11-08,PHI vs. BOS,W,26,5,4,1,1,...,17.75,4.333333,3.25,19.0,71.0,4,4.75,9.633803,1,110.6
5,201587,22300008,2023-11-10,PHI @ DET,W,26,3,3,3,1,...,19.4,4.0,3.2,24.0,97.0,5,4.8,8.907216,0,118.0
6,201587,22300180,2023-11-12,PHI vs. IND,W,33,9,7,0,3,...,22.8,3.333333,3.6,27.0,123.0,6,4.5,7.902439,1,117.6
7,201587,22300208,2023-11-19,PHI @ BKN,W,24,3,3,3,1,...,23.4,3.333333,3.8,36.0,156.0,7,5.142857,8.307692,0,115.4
8,201587,22300040,2023-11-21,PHI vs. CLE,L,27,5,4,2,1,...,25.2,2.666667,3.2,39.0,180.0,8,4.875,7.8,1,112.1
9,201587,22300232,2023-11-22,PHI @ MIN,L,24,6,3,0,2,...,27.2,3.666667,3.2,44.0,207.0,9,4.888889,7.652174,0,108.4


In [95]:
# --- Model Building (Ridge Regression - Points Prediction) ---

# Columns that must be non-null for a row to be usable for training.
required_cols = ['PTS_Roll_3', 'PTS_Roll_5', 'MIN_Roll_3', 'MIN_Roll_5', 'FGA_Roll_3', 'FGA_Roll_5', 'Avg_PTS_Season', 'PTS_Per36_Season']
if 'Opponent_DEF_RATING' in processed_df.columns:
    required_cols.append('Opponent_DEF_RATING')

model_df = processed_df.dropna(subset=required_cols).copy()

if 'Opponent_DEF_RATING' in model_df.columns:
    # Coerce to numeric; anything unparsable becomes NaN and is mean-filled.
    model_df['Opponent_DEF_RATING'] = pd.to_numeric(model_df['Opponent_DEF_RATING'], errors='coerce')
    if model_df['Opponent_DEF_RATING'].isnull().any():
        mean_def_rating = model_df['Opponent_DEF_RATING'].mean()
        print(f"Filling NaN Opponent_DEF_RATING with mean: {mean_def_rating}")
        # Assign the result back: df[col].fillna(..., inplace=True) acts on a
        # temporary object and stops updating the frame in pandas 3.0.
        model_df['Opponent_DEF_RATING'] = model_df['Opponent_DEF_RATING'].fillna(mean_def_rating)

if model_df.empty:
    print("Not enough data with calculated rolling features to build a model.")
else:
    features = ['PTS_Roll_3', 'PTS_Roll_5', 'MIN_Roll_3', 'MIN_Roll_5', 'Is_Home', 'FGA_Roll_3', 'FGA_Roll_5', 'Avg_PTS_Season', 'PTS_Per36_Season']
    if 'Opponent_DEF_RATING' in model_df.columns:
        features.append('Opponent_DEF_RATING')

    target = 'PTS'

    X = model_df[features]
    y = model_df[target]

    # NOTE(review): this is a shuffled random split, so test games may precede
    # training games in time; consider a chronological split if the goal is
    # forecasting future games.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    print(f"Training set size: {X_train.shape[0]}")
    print(f"Testing set size: {X_test.shape[0]}")

    # Initialize and Train the Ridge Regression Model
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)

    print("\nRidge Regression Model training complete.")

Training set size: 1881
Testing set size: 471

Ridge Regression Model training complete.


In [96]:
# --- Model Evaluation ---

# Nothing to score unless a model exists and there was data to train it on.
if 'model' not in locals() or model_df.empty:
    print("\nSkipping model evaluation as model was not trained or model_df was empty.")
else:
    y_pred = model.predict(X_test)

    # Error metrics on the held-out split; RMSE is derived from MSE.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"\nModel Evaluation Results:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

    # Test features side by side with the true and predicted targets.
    X_test_results = X_test.assign(Actual_PTS=y_test, Predicted_PTS=y_pred)
    print("\nSample Predictions vs Actual:")
    print(X_test_results.head())


Model Evaluation Results:
Mean Absolute Error (MAE): 4.78
Root Mean Squared Error (RMSE): 6.08

Sample Predictions vs Actual:
      PTS_Roll_3  PTS_Roll_5  MIN_Roll_3  MIN_Roll_5  Is_Home  FGA_Roll_3  \
1960    3.000000         3.4   12.000000        11.2        1    2.666667   
668    12.666667        12.0   25.333333        25.0        0    6.000000   
2082    5.666667         4.8   20.333333        18.0        1    4.000000   
808    16.000000        15.4   39.333333        36.0        1   12.666667   
1907    8.000000         8.0   20.666667        22.4        0    6.000000   

      FGA_Roll_5  Avg_PTS_Season  PTS_Per36_Season  Opponent_DEF_RATING  \
1960         2.8        3.200000         12.521739                112.4   
668          6.2       12.000000         17.280000                116.6   
2082         3.4        5.272727         11.347826                115.6   
808         12.4       19.455882         20.345152                114.9   
1907         7.0       12.277778   