In [15]:
# Install necessary libraries (run once)
%pip install nba_api pandas scikit-learn numpy xgboost

Note: you may need to restart the kernel to use updated packages.


In [16]:
# Import libraries
import pandas as pd
import os
from nba_api.stats.endpoints import playergamelog, commonplayerinfo, leaguegamefinder, leaguedashteamstats, leaguedashplayerstats
from nba_api.stats.static import players, teams
import time 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

In [17]:
# Get all players
nba_players = players.get_players()
players_df = pd.DataFrame(nba_players)
print(f"Total players found: {len(players_df)}")
players_df.head()

Total players found: 5024


Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False


In [18]:
# Get all teams
nba_teams = teams.get_teams()
teams_df = pd.DataFrame(nba_teams)
print(f"Total teams found: {len(teams_df)}")
teams_df.head()

Total teams found: 30


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [19]:
# Fetch Team Defensive Stats for the season
print("Fetching team defensive stats...")
team_def_stats_df = pd.DataFrame() 
try:
    team_stats = leaguedashteamstats.LeagueDashTeamStats(
        season='2023-24',
        measure_type_detailed_defense='Defense' 
    )
    temp_df = team_stats.get_data_frames()[0]
    
    print("Available columns in team stats:")
    print(temp_df.columns) 

    identifier_column = 'TEAM_NAME' 
    
    if identifier_column not in temp_df.columns:
         if 'TEAM_ABBREVIATION' in temp_df.columns:
              identifier_column = 'TEAM_ABBREVIATION'
              print(f"Using '{identifier_column}' as identifier.")
         elif 'TEAM_ID' in temp_df.columns:
              identifier_column = 'TEAM_ID'
              print(f"Using '{identifier_column}' as identifier.")
         else:
              raise KeyError(f"Could not find a suitable team identifier column. Available: {temp_df.columns}")

    team_def_stats_df = temp_df[[identifier_column, 'DEF_RATING']].copy()
    
    if identifier_column in ['TEAM_NAME', 'TEAM_ID'] and 'teams_df' in locals():
         merge_left_col = 'TEAM_NAME' if identifier_column == 'TEAM_NAME' else 'TEAM_ID'
         merge_right_col = 'full_name' if identifier_column == 'TEAM_NAME' else 'id'
         
         if identifier_column == 'TEAM_ID':
             team_def_stats_df[identifier_column] = team_def_stats_df[identifier_column].astype(int)
             teams_df['id'] = teams_df['id'].astype(int)

         team_def_stats_df = pd.merge(team_def_stats_df, teams_df[['id', 'full_name', 'abbreviation']], left_on=merge_left_col, right_on=merge_right_col, how='left')
         
         if 'abbreviation' in team_def_stats_df.columns:
             team_def_stats_df = team_def_stats_df[['abbreviation', 'DEF_RATING']].rename(columns={'abbreviation': 'TEAM_ABBREVIATION'})
         else:
              print("Warning: Could not find 'abbreviation' after merging with teams_df.")
              team_def_stats_df = pd.DataFrame() 
              
    elif identifier_column == 'TEAM_ABBREVIATION':
         pass 
    else:
         print(f"Warning: Identifier column '{identifier_column}' might require manual handling for merging later.")


    if not team_def_stats_df.empty:
        print("\nTeam defensive stats processed.")
        print(team_def_stats_df.head())
    elif 'TEAM_ABBREVIATION' in temp_df.columns: 
         print("\nProcessing stats using TEAM_ABBREVIATION directly.")
         team_def_stats_df = temp_df[['TEAM_ABBREVIATION', 'DEF_RATING']].copy()
         print(team_def_stats_df.head())
    else:
         print("\nCould not process team defensive stats correctly.")

except Exception as e:
    print(f"\nError fetching or processing team defensive stats: {e}")
    team_def_stats_df = pd.DataFrame() 

Fetching team defensive stats...
Available columns in team stats:
Index(['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'DEF_RATING',
       'DREB', 'DREB_PCT', 'STL', 'BLK', 'OPP_PTS_OFF_TOV',
       'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'DEF_RATING_RANK',
       'DREB_RANK', 'DREB_PCT_RANK', 'STL_RANK', 'BLK_RANK',
       'OPP_PTS_OFF_TOV_RANK', 'OPP_PTS_2ND_CHANCE_RANK', 'OPP_PTS_FB_RANK',
       'OPP_PTS_PAINT_RANK'],
      dtype='object')

Team defensive stats processed.
  TEAM_ABBREVIATION  DEF_RATING
0               ATL       118.4
1               BOS       110.6
2               BKN       115.4
3               CHA       119.2
4               CHI       115.7


In [20]:
# Function to get game logs for a player and season with delay
def get_player_log(player_id, season='2023-24'):
    print(f"Fetching logs for player {player_id}...")
    try:
        # Note: PlayerGameLog endpoint provides FGA, PTS, and FTA needed for TS%
        log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        df = log.get_data_frames()[0]
        time.sleep(0.6) # NBA API rate limit
        return df
    except Exception as e:
        print(f"Error fetching logs for player {player_id}: {e}")
        time.sleep(0.6)
        return pd.DataFrame()

# --- Define Season and Output File ---
SEASON = '2023-24'
RAW_GAMELOG_FILE = f'nba_gamelogs_raw_{SEASON}.csv'
MIN_MINUTES_THRESHOLD = 15 # Minimum average minutes per game to be included

# --- Check if Processed Data Exists --- 
if os.path.exists(RAW_GAMELOG_FILE):
    print(f"Loading existing raw game logs from {RAW_GAMELOG_FILE}...")
    all_gamelogs_df = pd.read_csv(RAW_GAMELOG_FILE)
    # Ensure Player_ID is integer if loaded from CSV
    if 'Player_ID' in all_gamelogs_df.columns:
        all_gamelogs_df['Player_ID'] = all_gamelogs_df['Player_ID'].astype(int)
else:
    print("Raw game log file not found. Fetching data...")
    all_gamelogs_df = pd.DataFrame()
    
    # --- Filter Players Based on Season Stats (e.g., Minutes Played) ---
    print(f"Fetching player stats for {SEASON} to filter...")
    try:
        player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season=SEASON)
        player_stats_df = player_stats.get_data_frames()[0]
        time.sleep(0.6)
        
        # Filter players playing significant minutes
        relevant_players_df = player_stats_df[player_stats_df['MIN'] >= MIN_MINUTES_THRESHOLD]
        player_ids_to_fetch = relevant_players_df['PLAYER_ID'].unique().tolist()
        print(f"Found {len(player_ids_to_fetch)} players averaging >= {MIN_MINUTES_THRESHOLD} MPG.")
        
    except Exception as e:
        print(f"Error fetching player stats for filtering: {e}. Falling back to all active players.")
        # Fallback: Get all active players if stats fetch fails
        active_players_df = players_df[players_df['is_active'] == True]
        player_ids_to_fetch = active_players_df['id'].tolist()
        print(f"Fetching for all {len(player_ids_to_fetch)} active players (fallback).")
        
    # --- Fetching game logs for filtered players ---
    print(f"Fetching game logs for {len(player_ids_to_fetch)} players...")
    fetched_logs = [] # Collect dataframes in a list first
    for i, p_id in enumerate(player_ids_to_fetch):
        print(f"Progress: {i+1}/{len(player_ids_to_fetch)}")
        player_log_df = get_player_log(p_id, season=SEASON)
        if not player_log_df.empty:
            # Add Player_ID if it's missing (sometimes happens)
            if 'Player_ID' not in player_log_df.columns:
                 player_log_df['Player_ID'] = p_id
            fetched_logs.append(player_log_df)
            
    # --- Concatenate and Save the fetched data ---
    if fetched_logs:
        all_gamelogs_df = pd.concat(fetched_logs, ignore_index=True)
        print(f"\nSaving {len(all_gamelogs_df)} game logs to {RAW_GAMELOG_FILE}...")
        all_gamelogs_df.to_csv(RAW_GAMELOG_FILE, index=False)
        print("Save complete.")
    else:
        print("\nNo game logs were fetched or concatenated.")
        all_gamelogs_df = pd.DataFrame() # Ensure it's an empty DF if nothing was fetched

# --- Display results ---
if not all_gamelogs_df.empty:
    print(f"\nTotal game logs available: {len(all_gamelogs_df)}")
    print(f"Unique players in logs: {all_gamelogs_df['Player_ID'].nunique()}")
    # Check if FTA column exists after loading/fetching
    if 'FTA' in all_gamelogs_df.columns:
        print("FTA column successfully included.")
    else:
        print("Warning: FTA column is missing from the loaded/fetched data!")
    print(all_gamelogs_df.head())
else:
    print("\nall_gamelogs_df is empty.")

Raw game log file not found. Fetching data...
Fetching player stats for 2023-24 to filter...
Found 552 players averaging >= 15 MPG.
Fetching game logs for 552 players...
Progress: 1/552
Fetching logs for player 1630639...
Progress: 2/552
Fetching logs for player 1631260...
Progress: 3/552
Fetching logs for player 1631100...
Progress: 4/552
Fetching logs for player 203932...
Progress: 5/552
Fetching logs for player 1628988...
Progress: 6/552
Fetching logs for player 1630174...
Progress: 7/552
Fetching logs for player 1630598...
Progress: 8/552
Fetching logs for player 1641766...
Progress: 9/552
Fetching logs for player 1629678...
Progress: 10/552
Fetching logs for player 201143...
Progress: 11/552
Fetching logs for player 202692...
Progress: 12/552
Fetching logs for player 1630197...
Progress: 13/552
Fetching logs for player 1627936...
Progress: 14/552
Fetching logs for player 1641788...
Progress: 15/552
Fetching logs for player 203458...
Progress: 16/552
Fetching logs for player 163121

In [21]:
# --- Data Preprocessing ---

# Check if all_gamelogs_df exists and is not empty before proceeding
if 'all_gamelogs_df' in locals() and not all_gamelogs_df.empty:
    processed_df = all_gamelogs_df.copy()
    processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])

    # Select relevant columns (including FGA and FTA)
    # Ensure all expected columns exist, handle missing ones if necessary
    expected_cols = ['Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 
                     'MIN', 'PTS', 'REB', 'AST', 'FG3M', 'STL', 'BLK', 'TOV', 'FGA', 'FTA'] # Added FTA
    available_cols = [col for col in expected_cols if col in processed_df.columns]
    missing_cols = [col for col in expected_cols if col not in processed_df.columns]
    if missing_cols:
        print(f"Warning: Missing expected columns: {missing_cols}. Proceeding with available columns.")
    processed_df = processed_df[available_cols]

    # Ensure necessary columns for TS% are numeric, coercing errors
    for col in ['PTS', 'FGA', 'FTA', 'MIN']:
        if col in processed_df.columns:
            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
        else:
             print(f"Warning: Column {col} needed for processing is missing.")
             
    # Drop rows where essential numeric columns became NaN after coercion
    processed_df.dropna(subset=['PTS', 'FGA', 'FTA', 'MIN'], inplace=True)

    def parse_matchup(matchup_str):
        if pd.isna(matchup_str):
             return 'Unknown', 'Unknown'
        if '@' in matchup_str:
            parts = matchup_str.split(' @ ')
            opponent = parts[1]
            home_away = 'Away'
        elif 'vs.' in matchup_str:
            parts = matchup_str.split(' vs. ')
            opponent = parts[1]
            home_away = 'Home'
        else: 
            opponent = 'Unknown'
            home_away = 'Unknown'
        return opponent, home_away

    if 'MATCHUP' in processed_df.columns:
        processed_df[['Opponent', 'Home_Away']] = processed_df['MATCHUP'].apply(
            lambda x: pd.Series(parse_matchup(x))
        )
    else:
        print("Warning: 'MATCHUP' column not found. Cannot determine Opponent or Home/Away.")
        processed_df['Opponent'] = 'Unknown'
        processed_df['Home_Away'] = 'Unknown'

    processed_df = processed_df.sort_values(by=['Player_ID', 'GAME_DATE'])

    print("Data preprocessing complete.")
    print(processed_df.head())
else:
    print("Skipping Data Preprocessing because 'all_gamelogs_df' is not available or empty.")
    processed_df = pd.DataFrame() # Ensure processed_df exists even if empty

  processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])


Data preprocessing complete.
       Player_ID     Game_ID  GAME_DATE      MATCHUP WL  MIN  PTS  REB  AST  \
11862     200782  0022300075 2023-10-26    PHI @ MIL  L   26    0    7    0   
11861     200782  0022300092 2023-10-28    PHI @ TOR  W   21    3    3    0   
11860     200782  0022300098 2023-10-29  PHI vs. POR  W   20    3    4    0   
11859     200782  0022300127 2023-11-01    LAC @ LAL  L   21    0    5    0   
11858     200782  0022300151 2023-11-06    LAC @ NYK  L   11    2    1    0   

       FG3M  STL  BLK  TOV  FGA  FTA Opponent Home_Away  
11862     0    2    0    0    2    0      MIL      Away  
11861     1    0    0    1    1    0      TOR      Away  
11860     1    1    2    1    2    0      POR      Home  
11859     0    0    0    0    1    0      LAL      Away  
11858     0    1    0    0    2    2      NYK      Away  


In [22]:
# --- Feature Engineering ---

# Check if processed_df exists and is not empty
if 'processed_df' in locals() and not processed_df.empty:
    print("Starting Feature Engineering...")
    
    # Calculate True Shooting Percentage (TS%)
    # TS% = PTS / (2 * (FGA + 0.44 * FTA))
    # Handle division by zero: if denominator is 0, TS% is 0
    if 'PTS' in processed_df.columns and 'FGA' in processed_df.columns and 'FTA' in processed_df.columns:
        denominator = 2 * (processed_df['FGA'] + 0.44 * processed_df['FTA'])
        processed_df['TS%'] = np.where(denominator == 0, 0, processed_df['PTS'] / denominator)
        # Handle potential NaN/inf arising from calculation (though np.where should prevent division by zero)
        processed_df['TS%'].fillna(0, inplace=True)
        processed_df['TS%'].replace([np.inf, -np.inf], 0, inplace=True)
        print("Calculated TS%.")
    else:
        print("Warning: Could not calculate TS% due to missing PTS, FGA, or FTA columns.")
        processed_df['TS%'] = 0 # Assign default value if calculation fails

    # Rolling Averages (Ensure columns exist)
    for col in ['PTS', 'MIN', 'FGA', 'FTA', 'TS%']: # Added FTA and TS%
        if col in processed_df.columns:
            # Ensure column is numeric before rolling calculation
            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
            processed_df[col].fillna(0, inplace=True) # Fill NaNs introduced by coercion
            
            processed_df[f'{col}_Roll_3'] = processed_df.groupby('Player_ID')[col].transform(
                lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
            )
            processed_df[f'{col}_Roll_5'] = processed_df.groupby('Player_ID')[col].transform(
                lambda x: x.rolling(window=5, min_periods=1).mean().shift(1)
            )
        else:
            print(f"Warning: Column '{col}' not found for rolling average calculation.")

    # Cumulative Season Averages (Shifted)
    if 'PTS' in processed_df.columns and 'MIN' in processed_df.columns and 'TS%' in processed_df.columns and 'FGA' in processed_df.columns and 'FTA' in processed_df.columns:
        processed_df['Cum_PTS'] = processed_df.groupby('Player_ID')['PTS'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_MIN'] = processed_df.groupby('Player_ID')['MIN'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_Games'] = processed_df.groupby('Player_ID').cumcount() 
        processed_df['Cum_FGA'] = processed_df.groupby('Player_ID')['FGA'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_FTA'] = processed_df.groupby('Player_ID')['FTA'].transform(lambda x: x.expanding().sum().shift(1))

        processed_df['Avg_PTS_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_Games']).replace([np.inf, -np.inf, np.nan], 0)
        processed_df['PTS_Per36_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_MIN'] * 36).replace([np.inf, -np.inf, np.nan], 0)
        
        # Cumulative TS% calculation
        cum_denominator = 2 * (processed_df['Cum_FGA'] + 0.44 * processed_df['Cum_FTA'])
        processed_df['Avg_TS%_Season'] = np.where(cum_denominator == 0, 0, processed_df['Cum_PTS'] / cum_denominator)
        processed_df['Avg_TS%_Season'].fillna(0, inplace=True)
        processed_df['Avg_TS%_Season'].replace([np.inf, -np.inf], 0, inplace=True)
        print("Calculated cumulative averages including Avg_TS%_Season.")
    else:
         print("Warning: One or more columns (PTS, MIN, TS%, FGA, FTA) missing for cumulative average calculation.")

    # Other Features
    if 'Home_Away' in processed_df.columns:
        processed_df['Is_Home'] = processed_df['Home_Away'].apply(lambda x: 1 if x == 'Home' else 0)
    else:
        processed_df['Is_Home'] = 0 # Default if Home_Away is missing

    # Merge Opponent Stats
    if 'team_def_stats_df' in locals() and not team_def_stats_df.empty and 'TEAM_ABBREVIATION' in team_def_stats_df.columns and 'Opponent' in processed_df.columns:
        team_def_stats_to_merge = team_def_stats_df.rename(columns={
            'TEAM_ABBREVIATION': 'Opponent',
            'DEF_RATING': 'Opponent_DEF_RATING'
        })
        # Ensure Opponent column types match if possible, handle errors
        try:
            processed_df['Opponent'] = processed_df['Opponent'].astype(team_def_stats_to_merge['Opponent'].dtype)
        except Exception as e:
            print(f"Warning: Could not align Opponent column types for merge: {e}")
            
        processed_df = pd.merge(processed_df, team_def_stats_to_merge[['Opponent', 'Opponent_DEF_RATING']], on='Opponent', how='left')
        
        if 'Opponent_DEF_RATING' in processed_df.columns:
             # Convert Opponent_DEF_RATING to numeric before filling NA
             processed_df['Opponent_DEF_RATING'] = pd.to_numeric(processed_df['Opponent_DEF_RATING'], errors='coerce')
             avg_def_rating = processed_df['Opponent_DEF_RATING'].mean() # Use mean from merged data
             processed_df['Opponent_DEF_RATING'].fillna(avg_def_rating, inplace=True)
             print("Opponent defensive stats merged.")
        else:
             print("Warning: 'Opponent_DEF_RATING' column not created after merge.")
             processed_df['Opponent_DEF_RATING'] = 115.0 # Fallback default
    else:
        print("Warning: Team defensive stats or Opponent column not available. Skipping merge.")
        processed_df['Opponent_DEF_RATING'] = 115.0 # Default value

    # Final Fill NA for engineered features
    # Identify all potential feature columns created
    feature_cols = [col for col in processed_df.columns if '_Roll_' in col or '_Season' in col or col == 'Is_Home' or col == 'Opponent_DEF_RATING']
    processed_df[feature_cols] = processed_df[feature_cols].fillna(0)
    print("Feature Engineering complete.")
    print(processed_df.head(10))
else:
    print("Skipping Feature Engineering because 'processed_df' is not available or empty.")

Starting Feature Engineering...
Calculated TS%.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['TS%'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['TS%'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

Calculated cumulative averages including Avg_TS%_Season.
Opponent defensive stats merged.
Feature Engineering complete.
   Player_ID     Game_ID  GAME_DATE      MATCHUP WL  MIN  PTS  REB  AST  FG3M  \
0     200782  0022300075 2023-10-26    PHI @ MIL  L   26    0    7    0     0   
1     200782  0022300092 2023-10-28    PHI @ TOR  W   21    3    3    0     1   
2     200782  0022300098 2023-10-29  PHI vs. POR  W   20    3    4    0     1   
3     200782  0022300127 2023-11-01    LAC @ LAL  L   21    0    5    0     0   
4     200782  0022300151 2023-11-06    LAC @ NYK  L   11    2    1    0     0   
5     200782  0022300160 2023-11-08    LAC @ BKN  L   21    3    6    3     0   
6     200782  0022300014 2023-11-10    LAC @ DAL  L    6    0    1    0     0   
7     200782  0022300179 2023-11-12  LAC vs. MEM  L   13    3    2    1     1   
8     200782  0022300024 2023-11-14    LAC @ DEN  L   21    6    5    0     2   
9     200782  0022300037 2023-11-17  LAC vs. HOU  W   15    0    0    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Avg_TS%_Season'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Avg_TS%_Season'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

In [23]:
# --- Prepare Data for Modeling ---

# Check if processed_df exists and is not empty
if 'processed_df' in locals() and not processed_df.empty:
    print("Preparing data for modeling...")
    # Define required columns based on features actually created
    required_cols = []
    # Add base features
    for col in ['PTS_Roll_3', 'PTS_Roll_5', 'MIN_Roll_3', 'MIN_Roll_5', 
                 'FGA_Roll_3', 'FGA_Roll_5', 'Avg_PTS_Season', 'PTS_Per36_Season',
                 'Opponent_DEF_RATING']:
        if col in processed_df.columns:
            required_cols.append(col)
        else:
            print(f"Note: Feature '{col}' not found in processed_df, excluding from model requirements.")
            
    # Add new TS% features
    for col in ['TS%_Roll_3', 'TS%_Roll_5', 'Avg_TS%_Season']:
         if col in processed_df.columns:
            required_cols.append(col)
         else:
            print(f"Note: TS% Feature '{col}' not found in processed_df, excluding from model requirements.")
            
    # Add target variable if it exists
    target = 'PTS'
    if target not in processed_df.columns:
        print(f"Error: Target variable '{target}' not found in processed_df. Cannot proceed with modeling.")
        model_df = pd.DataFrame()
        X_train, X_test, y_train, y_test = [None]*4
    else:
        # Drop rows where target or any required feature is missing BEFORE creating model_df
        model_df = processed_df.dropna(subset=[target] + required_cols).copy()

        # Convert Opponent_DEF_RATING to numeric if it exists and wasn't already
        if 'Opponent_DEF_RATING' in model_df.columns:
            model_df['Opponent_DEF_RATING'] = pd.to_numeric(model_df['Opponent_DEF_RATING'], errors='coerce')
            if model_df['Opponent_DEF_RATING'].isnull().any():
                mean_def_rating = model_df['Opponent_DEF_RATING'].mean()
                print(f"Filling NaN Opponent_DEF_RATING with mean: {mean_def_rating}")
                model_df['Opponent_DEF_RATING'].fillna(mean_def_rating, inplace=True)
        
        # Check if enough data remains
        if model_df.empty or len(model_df) < 10: # Arbitrary threshold for minimum data
            print("Not enough data with required features and target to build a model.")
            X_train, X_test, y_train, y_test = [None]*4 
        else:
            # Define features based on columns actually present in model_df
            features = required_cols + ['Is_Home'] # Add Is_Home if it exists
            features = [f for f in features if f in model_df.columns] # Ensure all features exist
            
            print(f"Using features: {features}")
            X = model_df[features]
            y = model_df[target]
            
            # Check for NaN/inf in features or target before split
            if X.isnull().values.any() or y.isnull().values.any() or np.isinf(X.values).any() or np.isinf(y.values).any():
                 print("Warning: NaN or Inf values detected in features or target before train/test split. Attempting to fill with 0.")
                 X = X.fillna(0)
                 y = y.fillna(0)
                 X = X.replace([np.inf, -np.inf], 0)
                 y = y.replace([np.inf, -np.inf], 0)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
            
            print(f"Data prepared for modeling. Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")
else:
    print("Skipping Data Preparation for Modeling because 'processed_df' is not available or empty.")
    X_train, X_test, y_train, y_test = [None]*4

Preparing data for modeling...
Using features: ['PTS_Roll_3', 'PTS_Roll_5', 'MIN_Roll_3', 'MIN_Roll_5', 'FGA_Roll_3', 'FGA_Roll_5', 'Avg_PTS_Season', 'PTS_Per36_Season', 'Opponent_DEF_RATING', 'TS%_Roll_3', 'TS%_Roll_5', 'Avg_TS%_Season', 'Is_Home']
Data prepared for modeling. Training set size: 13692, Testing set size: 3423


In [24]:
# --- Model Building & Tuning (Ridge Regression) ---
model_ridge = None
if X_train is not None and y_train is not None and not X_train.empty:
    print("Tuning Ridge Regression Model...")
    
    param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]}
    
    ridge_estimator = Ridge()
    grid_search = GridSearchCV(ridge_estimator, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    
    try:
        grid_search.fit(X_train, y_train)
        model_ridge = grid_search.best_estimator_
        print(f"Best alpha found: {grid_search.best_params_['alpha']}")
        print("Tuned Ridge Regression Model training complete.")
    except Exception as e:
        print(f"Error during Ridge GridSearchCV: {e}")
        model_ridge = None # Ensure model is None if fitting fails
else:
    print("Skipping Ridge tuning due to lack of training data.")

Tuning Ridge Regression Model...
Best alpha found: 20.0
Tuned Ridge Regression Model training complete.


In [25]:
# --- Model Evaluation (Tuned Ridge Regression) ---
print("\n--- Tuned Ridge Regression Evaluation ---")
if model_ridge is not None and X_test is not None and y_test is not None and not X_test.empty:
    try:
        y_pred_ridge = model_ridge.predict(X_test)

        mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
        mse_ridge = mean_squared_error(y_test, y_pred_ridge) 
        rmse_ridge = np.sqrt(mse_ridge) 

        print(f"Mean Absolute Error (MAE): {mae_ridge:.2f}")
        print(f"Root Mean Squared Error (RMSE): {rmse_ridge:.2f}") 

        X_test_results_ridge = X_test.copy()
        X_test_results_ridge['Actual_PTS'] = y_test
        X_test_results_ridge['Predicted_PTS'] = y_pred_ridge
        print("\nSample Predictions vs Actual:")
        print(X_test_results_ridge.head())
    except Exception as e:
        print(f"Error during Ridge evaluation: {e}")
else:
    print("Skipping Ridge evaluation as model was not trained or test data was missing.")


--- Tuned Ridge Regression Evaluation ---
Mean Absolute Error (MAE): 4.58
Root Mean Squared Error (RMSE): 6.04

Sample Predictions vs Actual:
       PTS_Roll_3  PTS_Roll_5  MIN_Roll_3  MIN_Roll_5  FGA_Roll_3  FGA_Roll_5  \
16242   14.333333        12.2   33.000000        29.8   12.000000        11.2   
13685    2.000000         1.2   18.333333        16.4    2.000000         1.4   
5963     5.000000         6.6   13.333333        15.4    3.666667         4.4   
2609    19.333333        16.8   34.000000        34.4   16.666667        13.8   
16571    9.000000         8.2   19.000000        20.6    7.000000         7.8   

       Avg_PTS_Season  PTS_Per36_Season  Opponent_DEF_RATING  TS%_Roll_3  \
16242        7.360000         13.714286                115.4    0.512435   
13685        1.900000          4.684932                115.4    0.446712   
5963         6.000000         14.644068                115.6    0.677101   
2609        21.000000         21.429921                119.2    0.

In [26]:
# --- Model Building (XGBoost) ---
model_xgb = None
if X_train is not None and y_train is not None and not X_train.empty:
    print("\nTraining XGBoost Model...")
    try:
        model_xgb = XGBRegressor(random_state=42) 
        model_xgb.fit(X_train, y_train)
        print("XGBoost Model training complete.")
    except Exception as e:
        print(f"Error during XGBoost training: {e}")
        model_xgb = None # Ensure model is None if training fails
else:
    print("Skipping XGBoost training due to lack of training data.")


Training XGBoost Model...
XGBoost Model training complete.


In [27]:
# --- Model Evaluation (XGBoost) ---
print("\n--- XGBoost Evaluation ---")
if model_xgb is not None and X_test is not None and y_test is not None and not X_test.empty:
    try:
        y_pred_xgb = model_xgb.predict(X_test)

        mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
        mse_xgb = mean_squared_error(y_test, y_pred_xgb) 
        rmse_xgb = np.sqrt(mse_xgb) 

        print(f"Mean Absolute Error (MAE): {mae_xgb:.2f}")
        print(f"Root Mean Squared Error (RMSE): {rmse_xgb:.2f}") 

        X_test_results_xgb = X_test.copy()
        X_test_results_xgb['Actual_PTS'] = y_test
        X_test_results_xgb['Predicted_PTS'] = y_pred_xgb
        print("\nSample Predictions vs Actual:")
        print(X_test_results_xgb.head())
    except Exception as e:
        print(f"Error during XGBoost evaluation: {e}")
else:
    print("Skipping XGBoost evaluation as model was not trained or test data was missing.")


--- XGBoost Evaluation ---
Mean Absolute Error (MAE): 4.75
Root Mean Squared Error (RMSE): 6.30

Sample Predictions vs Actual:
       PTS_Roll_3  PTS_Roll_5  MIN_Roll_3  MIN_Roll_5  FGA_Roll_3  FGA_Roll_5  \
16242   14.333333        12.2   33.000000        29.8   12.000000        11.2   
13685    2.000000         1.2   18.333333        16.4    2.000000         1.4   
5963     5.000000         6.6   13.333333        15.4    3.666667         4.4   
2609    19.333333        16.8   34.000000        34.4   16.666667        13.8   
16571    9.000000         8.2   19.000000        20.6    7.000000         7.8   

       Avg_PTS_Season  PTS_Per36_Season  Opponent_DEF_RATING  TS%_Roll_3  \
16242        7.360000         13.714286                115.4    0.512435   
13685        1.900000          4.684932                115.4    0.446712   
5963         6.000000         14.644068                115.6    0.677101   
2609        21.000000         21.429921                119.2    0.466598   
16571