In [68]:
# Import libraries
import pandas as pd
import os
from nba_api.stats.endpoints import playergamelog, commonplayerinfo, leaguegamefinder, leaguedashteamstats, leaguedashplayerstats
from nba_api.stats.static import players, teams
import time 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

In [69]:
# Get all players
nba_players = players.get_players()
players_df = pd.DataFrame(nba_players)
print(f"Total players found: {len(players_df)}")
players_df.head()

Total players found: 5024


Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False


In [70]:
# Get all teams
nba_teams = teams.get_teams()
teams_df = pd.DataFrame(nba_teams)
print(f"Total teams found: {len(teams_df)}")
teams_df.head()

Total teams found: 30


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [71]:
# Fetch Team Defensive Stats for the season
print("Fetching team defensive stats...")
team_def_stats_df = pd.DataFrame() 
try:
    team_stats = leaguedashteamstats.LeagueDashTeamStats(
        season='2023-24',
        measure_type_detailed_defense='Defense' 
    )
    temp_df = team_stats.get_data_frames()[0]
    
    print("Available columns in team stats:")
    print(temp_df.columns) 

    identifier_column = 'TEAM_NAME' 
    
    if identifier_column not in temp_df.columns:
         if 'TEAM_ABBREVIATION' in temp_df.columns:
              identifier_column = 'TEAM_ABBREVIATION'
              print(f"Using '{identifier_column}' as identifier.")
         elif 'TEAM_ID' in temp_df.columns:
              identifier_column = 'TEAM_ID'
              print(f"Using '{identifier_column}' as identifier.")
         else:
              raise KeyError(f"Could not find a suitable team identifier column. Available: {temp_df.columns}")

    team_def_stats_df = temp_df[[identifier_column, 'DEF_RATING']].copy()
    
    if identifier_column in ['TEAM_NAME', 'TEAM_ID'] and 'teams_df' in locals():
         merge_left_col = 'TEAM_NAME' if identifier_column == 'TEAM_NAME' else 'TEAM_ID'
         merge_right_col = 'full_name' if identifier_column == 'TEAM_NAME' else 'id'
         
         if identifier_column == 'TEAM_ID':
             team_def_stats_df[identifier_column] = team_def_stats_df[identifier_column].astype(int)
             teams_df['id'] = teams_df['id'].astype(int)

         team_def_stats_df = pd.merge(team_def_stats_df, teams_df[['id', 'full_name', 'abbreviation']], left_on=merge_left_col, right_on=merge_right_col, how='left')
         
         if 'abbreviation' in team_def_stats_df.columns:
             team_def_stats_df = team_def_stats_df[['abbreviation', 'DEF_RATING']].rename(columns={'abbreviation': 'TEAM_ABBREVIATION'})
         else:
              print("Warning: Could not find 'abbreviation' after merging with teams_df.")
              team_def_stats_df = pd.DataFrame() 
              
    elif identifier_column == 'TEAM_ABBREVIATION':
         pass 
    else:
         print(f"Warning: Identifier column '{identifier_column}' might require manual handling for merging later.")


    if not team_def_stats_df.empty:
        print("\nTeam defensive stats processed.")
        print(team_def_stats_df.head())
    elif 'TEAM_ABBREVIATION' in temp_df.columns: 
         print("\nProcessing stats using TEAM_ABBREVIATION directly.")
         team_def_stats_df = temp_df[['TEAM_ABBREVIATION', 'DEF_RATING']].copy()
         print(team_def_stats_df.head())
    else:
         print("\nCould not process team defensive stats correctly.")

except Exception as e:
    # Catch specific SSL error if possible, otherwise general exception
    if 'CERTIFICATE_VERIFY_FAILED' in str(e):
         print(f"\nSSL Certificate Error fetching team stats: {e}")
         print("This might be due to a corporate network/proxy. Using fallback DEF_RATING.")
    else:
         print(f"\nError fetching or processing team defensive stats: {e}")
    team_def_stats_df = pd.DataFrame() # Ensure it's empty on error

Fetching team defensive stats...

SSL Certificate Error fetching team stats: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=&Location=&MeasureType=Defense&Month=0&OpponentTeamID=0&Outcome=&PORound=&PaceAdjust=N&PerMode=Totals&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2023-24&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=&TwoWay=&VsConference=&VsDivision= (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
This might be due to a corporate network/proxy. Using fallback DEF_RATING.


In [72]:
# Function to get game logs for a player and season with delay
def get_player_log(player_id, season='2023-24'):
    print(f"Fetching logs for player {player_id}...")
    try:
        # Note: PlayerGameLog endpoint provides FGA, PTS, FTA, and TOV
        log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        df = log.get_data_frames()[0]
        time.sleep(0.6) # NBA API rate limit
        return df
    except Exception as e:
        print(f"Error fetching logs for player {player_id}: {e}")
        time.sleep(0.6)
        return pd.DataFrame()

# --- Define Season and Output File ---
SEASON = '2023-24'
RAW_GAMELOG_FILE = f'nba_gamelogs_raw_{SEASON}.csv'
MIN_MINUTES_THRESHOLD = 15 # Minimum average minutes per game to be included
MAX_PLAYERS_TO_FETCH = 100 # Limit players for faster fetching if needed

# --- Check if Processed Data Exists and Contains TOV ---
FETCH_REQUIRED = False
if os.path.exists(RAW_GAMELOG_FILE):
    print(f"Loading existing raw game logs from {RAW_GAMELOG_FILE}...")
    try:
        all_gamelogs_df = pd.read_csv(RAW_GAMELOG_FILE)
        # Ensure Player_ID is integer if loaded from CSV
        if 'Player_ID' in all_gamelogs_df.columns:
            all_gamelogs_df['Player_ID'] = all_gamelogs_df['Player_ID'].astype(int)
        # Check if TOV column is present
        if 'TOV' not in all_gamelogs_df.columns:
            print("Warning: 'TOV' column missing from existing CSV. Re-fetching required.")
            FETCH_REQUIRED = True
        else:
            print("'TOV' column found in existing CSV.")
    except Exception as e:
        print(f"Error loading or checking CSV file {RAW_GAMELOG_FILE}: {e}. Re-fetching required.")
        FETCH_REQUIRED = True
        all_gamelogs_df = pd.DataFrame() # Ensure it's empty if loading failed
else:
    print("Raw game log file not found. Fetching data...")
    FETCH_REQUIRED = True
    all_gamelogs_df = pd.DataFrame()

# --- Fetch Data if Required ---
if FETCH_REQUIRED:
    # Remove old file if it exists but is incomplete
    if os.path.exists(RAW_GAMELOG_FILE):
        print(f"Removing incomplete file: {RAW_GAMELOG_FILE}")
        try:
            os.remove(RAW_GAMELOG_FILE)
        except OSError as e:
            print(f"Error removing file: {e}")
            
    # --- Filter Players Based on Season Stats (e.g., Minutes Played) ---
    print(f"Fetching player stats for {SEASON} to filter...")
    player_ids_to_fetch = []
    try:
        player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season=SEASON)
        player_stats_df = player_stats.get_data_frames()[0]
        time.sleep(0.6)
        
        # Filter players playing significant minutes
        relevant_players_df = player_stats_df[player_stats_df['MIN'] >= MIN_MINUTES_THRESHOLD]
        player_ids_to_fetch = relevant_players_df['PLAYER_ID'].unique().tolist()
        print(f"Found {len(player_ids_to_fetch)} players averaging >= {MIN_MINUTES_THRESHOLD} MPG.")
        
    except Exception as e:
        print(f"Error fetching player stats for filtering: {e}. Falling back to all active players.")
        # Fallback: Get all active players if stats fetch fails
        active_players_df = players_df[players_df['is_active'] == True]
        player_ids_to_fetch = active_players_df['id'].tolist()
        print(f"Fetching for all {len(player_ids_to_fetch)} active players (fallback).")
        
    # --- Limit players if needed ---
    if len(player_ids_to_fetch) > MAX_PLAYERS_TO_FETCH:
        print(f"Limiting fetch to {MAX_PLAYERS_TO_FETCH} players for speed.")
        player_ids_to_fetch = player_ids_to_fetch[:MAX_PLAYERS_TO_FETCH]
        
    # --- Fetching game logs for filtered players ---
    if player_ids_to_fetch:
        print(f"Fetching game logs for {len(player_ids_to_fetch)} players...")
        fetched_logs = [] # Collect dataframes in a list first
        for i, p_id in enumerate(player_ids_to_fetch):
            print(f"Progress: {i+1}/{len(player_ids_to_fetch)}")
            player_log_df = get_player_log(p_id, season=SEASON)
            if not player_log_df.empty:
                # Add Player_ID if it's missing (sometimes happens)
                if 'Player_ID' not in player_log_df.columns:
                     player_log_df['Player_ID'] = p_id
                fetched_logs.append(player_log_df)
                
        # --- Concatenate and Save the fetched data ---
        if fetched_logs:
            all_gamelogs_df = pd.concat(fetched_logs, ignore_index=True)
            print(f"\nSaving {len(all_gamelogs_df)} game logs to {RAW_GAMELOG_FILE}...")
            all_gamelogs_df.to_csv(RAW_GAMELOG_FILE, index=False)
            print("Save complete.")
        else:
            print("\nNo game logs were fetched or concatenated.")
            all_gamelogs_df = pd.DataFrame() # Ensure it's an empty DF if nothing was fetched
    else:
        print("\nNo player IDs identified for fetching.")
        all_gamelogs_df = pd.DataFrame()

# --- Display results ---
if not all_gamelogs_df.empty:
    print(f"\nTotal game logs available: {len(all_gamelogs_df)}")
    print(f"Unique players in logs: {all_gamelogs_df['Player_ID'].nunique()}")
    # Check essential columns after loading/fetching
    for col in ['FTA', 'TOV']:
        if col in all_gamelogs_df.columns:
            print(f"'{col}' column successfully included.")
        else:
            print(f"Warning: '{col}' column is missing from the loaded/fetched data!")
    print(all_gamelogs_df.head())
else:
    print("\nall_gamelogs_df is empty. Cannot proceed.")

Loading existing raw game logs from nba_gamelogs_raw_2023-24.csv...
'TOV' column found in existing CSV.

Total game logs available: 5193
Unique players in logs: 100
'FTA' column successfully included.
'TOV' column successfully included.
   SEASON_ID  Player_ID   Game_ID     GAME_DATE      MATCHUP WL  MIN  FGM  \
0      22023    1630639  22301196  APR 14, 2024    DAL @ OKC  L   24    5   
1      22023    1630639  22301181  APR 12, 2024  DAL vs. DET  L   22    1   
2      22023    1630639  22301161  APR 10, 2024    DAL @ MIA  W    2    0   
3      22023    1630639  22301144  APR 09, 2024    DAL @ CHA  W    4    1   
4      22023    1630639  22301097  APR 05, 2024  DAL vs. GSW  W    5    0   

   FGA  FG_PCT  ...  DREB  REB  AST  STL  BLK  TOV  PF  PTS  PLUS_MINUS  \
0   13   0.385  ...     4    5    2    0    0    1   1   12         -18   
1    6   0.167  ...     5    6    2    0    0    2   0    2          -4   
2    0   0.000  ...     2    2    1    0    0    0   0    0           4   


In [73]:
# --- Data Preprocessing ---

# Check if all_gamelogs_df exists and is not empty before proceeding
if 'all_gamelogs_df' in locals() and not all_gamelogs_df.empty:
    processed_df = all_gamelogs_df.copy()
    processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])

    # Select relevant columns (including FGA, FTA, and TOV)
    # Ensure all expected columns exist, handle missing ones if necessary
    expected_cols = ['Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 
                     'MIN', 'PTS', 'REB', 'AST', 'FG3M', 'STL', 'BLK', 'TOV', 'FGA', 'FTA'] # Added TOV
    available_cols = [col for col in expected_cols if col in processed_df.columns]
    missing_cols = [col for col in expected_cols if col not in processed_df.columns]
    if missing_cols:
        print(f"Warning: Missing expected columns: {missing_cols}. Proceeding with available columns.")
    processed_df = processed_df[available_cols]

    # Ensure necessary columns for calculations are numeric, coercing errors
    for col in ['PTS', 'FGA', 'FTA', 'MIN', 'TOV']: # Added TOV
        if col in processed_df.columns:
            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
        else:
             print(f"Warning: Column {col} needed for processing is missing.")
             
    # Drop rows where essential numeric columns became NaN after coercion
    # Important: Check if TOV exists before adding to subset
    dropna_subset = ['PTS', 'FGA', 'FTA', 'MIN']
    if 'TOV' in processed_df.columns:
        dropna_subset.append('TOV')
    processed_df.dropna(subset=dropna_subset, inplace=True)

    def parse_matchup(matchup_str):
        if pd.isna(matchup_str):
             return 'Unknown', 'Unknown'
        if '@' in matchup_str:
            parts = matchup_str.split(' @ ')
            opponent = parts[1]
            home_away = 'Away'
        elif 'vs.' in matchup_str:
            parts = matchup_str.split(' vs. ')
            opponent = parts[1]
            home_away = 'Home'
        else: 
            opponent = 'Unknown'
            home_away = 'Unknown'
        return opponent, home_away

    if 'MATCHUP' in processed_df.columns:
        processed_df[['Opponent', 'Home_Away']] = processed_df['MATCHUP'].apply(
            lambda x: pd.Series(parse_matchup(x))
        )
    else:
        print("Warning: 'MATCHUP' column not found. Cannot determine Opponent or Home/Away.")
        processed_df['Opponent'] = 'Unknown'
        processed_df['Home_Away'] = 'Unknown'

    processed_df = processed_df.sort_values(by=['Player_ID', 'GAME_DATE'])

    print("Data preprocessing complete.")
    print(processed_df.head())
else:
    print("Skipping Data Preprocessing because 'all_gamelogs_df' is not available or empty.")
    processed_df = pd.DataFrame() # Ensure processed_df exists even if empty

Data preprocessing complete.
      Player_ID   Game_ID  GAME_DATE      MATCHUP WL  MIN  PTS  REB  AST  \
4303     101108  22300062 2023-10-24  GSW vs. PHX  L   34   14    6    9   
4302     101108  22300087 2023-10-27    GSW @ SAC  W   33   10    2   12   
4301     101108  22300096 2023-10-29    GSW @ HOU  W   27    8    5    7   
4300     101108  22300108 2023-10-30    GSW @ NOP  W   25   13    6    5   
4299     101108  22300126 2023-11-01  GSW vs. SAC  W   28    2    4    8   

      FG3M  STL  BLK  TOV  FGA  FTA Opponent Home_Away  
4303     0    2    0    1   15    7      PHX      Home  
4302     0    3    0    3   12    0      SAC      Away  
4301     0    1    0    1    8    2      HOU      Away  
4300     1    2    0    1   10    0      NOP      Away  
4299     0    0    0    0    5    0      SAC      Home  


  processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])


In [74]:
# --- Feature Engineering ---

# Check if processed_df exists and is not empty
if 'processed_df' in locals() and not processed_df.empty:
    print("Starting Feature Engineering...")
    
    # Calculate Rest Days
    processed_df['Rest_Days'] = processed_df.groupby('Player_ID')['GAME_DATE'].diff().dt.days
    # Fill first game NaN with a reasonable value (e.g., average rest or a specific indicator)
    processed_df['Rest_Days'].fillna(2, inplace=True) # Assuming 2 days rest for the first game as a default
    print("Calculated Rest Days.")

    # Calculate Player Usage Rate Proxy (Requires TOV)
    if 'FGA' in processed_df.columns and 'FTA' in processed_df.columns and 'TOV' in processed_df.columns and 'MIN' in processed_df.columns:
        # Ensure TOV is numeric (should be from preprocessing, but double-check)
        processed_df['TOV'] = pd.to_numeric(processed_df['TOV'], errors='coerce').fillna(0)
        
        usg_numerator = processed_df['FGA'] + 0.44 * processed_df['FTA'] + processed_df['TOV']
        usg_denominator = processed_df['MIN']
        processed_df['Player_USG_Proxy'] = np.where(usg_denominator == 0, 0, usg_numerator / usg_denominator)
        processed_df['Player_USG_Proxy'].fillna(0, inplace=True)
        processed_df['Player_USG_Proxy'].replace([np.inf, -np.inf], 0, inplace=True)
        print("Calculated Player Usage Rate Proxy.")
    else:
        print("Warning: Could not calculate Player Usage Rate Proxy due to missing FGA, FTA, TOV, or MIN columns.")
        processed_df['Player_USG_Proxy'] = 0 # Assign default value
        
    # Calculate True Shooting Percentage (TS%)
    if 'PTS' in processed_df.columns and 'FGA' in processed_df.columns and 'FTA' in processed_df.columns:
        denominator = 2 * (processed_df['FGA'] + 0.44 * processed_df['FTA'])
        processed_df['TS%'] = np.where(denominator == 0, 0, processed_df['PTS'] / denominator)
        processed_df['TS%'].fillna(0, inplace=True)
        processed_df['TS%'].replace([np.inf, -np.inf], 0, inplace=True)
        print("Calculated TS%.")
    else:
        print("Warning: Could not calculate TS% due to missing PTS, FGA, or FTA columns.")
        processed_df['TS%'] = 0 # Assign default value

    # Rolling Averages (Include new features)
    cols_for_rolling = ['PTS', 'MIN', 'FGA', 'FTA', 'TS%', 'Player_USG_Proxy'] # Added Player_USG_Proxy
    for col in cols_for_rolling:
        if col in processed_df.columns:
            # Ensure column is numeric before rolling calculation
            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
            processed_df[col].fillna(0, inplace=True) # Fill NaNs introduced by coercion
            
            processed_df[f'{col}_Roll_3'] = processed_df.groupby('Player_ID')[col].transform(
                lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
            )
            processed_df[f'{col}_Roll_5'] = processed_df.groupby('Player_ID')[col].transform(
                lambda x: x.rolling(window=5, min_periods=1).mean().shift(1)
            )
        else:
            print(f"Warning: Column '{col}' not found for rolling average calculation.")

    # Cumulative Season Averages (Shifted)
    # Check required columns exist (including TOV for USG% calculation)
    cum_avg_req_cols = ['PTS', 'MIN', 'TS%', 'FGA', 'FTA', 'TOV']
    if all(col in processed_df.columns for col in cum_avg_req_cols):
        processed_df['Cum_PTS'] = processed_df.groupby('Player_ID')['PTS'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_MIN'] = processed_df.groupby('Player_ID')['MIN'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_Games'] = processed_df.groupby('Player_ID').cumcount() 
        processed_df['Cum_FGA'] = processed_df.groupby('Player_ID')['FGA'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_FTA'] = processed_df.groupby('Player_ID')['FTA'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_TOV'] = processed_df.groupby('Player_ID')['TOV'].transform(lambda x: x.expanding().sum().shift(1))

        processed_df['Avg_PTS_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_Games']).replace([np.inf, -np.inf, np.nan], 0)
        processed_df['PTS_Per36_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_MIN'] * 36).replace([np.inf, -np.inf, np.nan], 0)
        
        # Cumulative TS% calculation
        cum_ts_denominator = 2 * (processed_df['Cum_FGA'] + 0.44 * processed_df['Cum_FTA'])
        processed_df['Avg_TS%_Season'] = np.where(cum_ts_denominator == 0, 0, processed_df['Cum_PTS'] / cum_ts_denominator)
        processed_df['Avg_TS%_Season'].fillna(0, inplace=True)
        processed_df['Avg_TS%_Season'].replace([np.inf, -np.inf], 0, inplace=True)
        
        # Cumulative USG% Proxy calculation
        cum_usg_numerator = processed_df['Cum_FGA'] + 0.44 * processed_df['Cum_FTA'] + processed_df['Cum_TOV']
        cum_usg_denominator = processed_df['Cum_MIN']
        processed_df['Avg_USG%_Proxy_Season'] = np.where(cum_usg_denominator == 0, 0, cum_usg_numerator / cum_usg_denominator)
        processed_df['Avg_USG%_Proxy_Season'].fillna(0, inplace=True)
        processed_df['Avg_USG%_Proxy_Season'].replace([np.inf, -np.inf], 0, inplace=True)
        
        print("Calculated cumulative averages including TS% and USG% Proxy.")
    else:
         missing_cum_cols = [col for col in cum_avg_req_cols if col not in processed_df.columns]
         print(f"Warning: One or more columns ({missing_cum_cols}) missing for cumulative average calculation.")

    # Other Features
    if 'Home_Away' in processed_df.columns:
        processed_df['Is_Home'] = processed_df['Home_Away'].apply(lambda x: 1 if x == 'Home' else 0)
    else:
        processed_df['Is_Home'] = 0 # Default if Home_Away is missing

    # Merge Opponent Stats
    # Use fallback DEF_RATING if team_def_stats_df is empty (due to fetch error)
    DEFAULT_DEF_RATING = 115.0 
    if 'team_def_stats_df' in locals() and not team_def_stats_df.empty and 'TEAM_ABBREVIATION' in team_def_stats_df.columns and 'Opponent' in processed_df.columns:
        print("Merging fetched team defensive stats...")
        team_def_stats_to_merge = team_def_stats_df.rename(columns={
            'TEAM_ABBREVIATION': 'Opponent',
            'DEF_RATING': 'Opponent_DEF_RATING'
        })
        try:
            processed_df['Opponent'] = processed_df['Opponent'].astype(team_def_stats_to_merge['Opponent'].dtype)
        except Exception as e:
            print(f"Warning: Could not align Opponent column types for merge: {e}")
            
        processed_df = pd.merge(processed_df, team_def_stats_to_merge[['Opponent', 'Opponent_DEF_RATING']], on='Opponent', how='left')
        
        if 'Opponent_DEF_RATING' in processed_df.columns:
             processed_df['Opponent_DEF_RATING'] = pd.to_numeric(processed_df['Opponent_DEF_RATING'], errors='coerce')
             # Fill NaNs with the mean *of the successfully merged ratings* if possible, otherwise use default
             avg_def_rating = processed_df['Opponent_DEF_RATING'].mean() 
             fill_value = avg_def_rating if not pd.isna(avg_def_rating) else DEFAULT_DEF_RATING
             processed_df['Opponent_DEF_RATING'].fillna(fill_value, inplace=True)
             print(f"Opponent defensive stats merged. Filled NaNs with {fill_value:.1f}.")
        else:
             print("Warning: 'Opponent_DEF_RATING' column not created after merge. Using default.")
             processed_df['Opponent_DEF_RATING'] = DEFAULT_DEF_RATING
    else:
        print(f"Warning: Team defensive stats not available or Opponent column missing. Using default DEF_RATING: {DEFAULT_DEF_RATING}")
        processed_df['Opponent_DEF_RATING'] = DEFAULT_DEF_RATING

    # Final Fill NA for engineered features
    # Identify all potential feature columns created
    feature_cols = [col for col in processed_df.columns if '_Roll_' in col or '_Season' in col or col == 'Is_Home' or col == 'Opponent_DEF_RATING' or col == 'Rest_Days']
    processed_df[feature_cols] = processed_df[feature_cols].fillna(0)
    print("Feature Engineering complete.")
    print(processed_df.head(10))
else:
    print("Skipping Feature Engineering because 'processed_df' is not available or empty.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Rest_Days'].fillna(2, inplace=True) # Assuming 2 days rest for the first game as a default
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Player_USG_Proxy'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never

Starting Feature Engineering...
Calculated Rest Days.
Calculated Player Usage Rate Proxy.
Calculated TS%.
Calculated cumulative averages including TS% and USG% Proxy.
Feature Engineering complete.
      Player_ID   Game_ID  GAME_DATE      MATCHUP WL  MIN  PTS  REB  AST  \
4303     101108  22300062 2023-10-24  GSW vs. PHX  L   34   14    6    9   
4302     101108  22300087 2023-10-27    GSW @ SAC  W   33   10    2   12   
4301     101108  22300096 2023-10-29    GSW @ HOU  W   27    8    5    7   
4300     101108  22300108 2023-10-30    GSW @ NOP  W   25   13    6    5   
4299     101108  22300126 2023-11-01  GSW vs. SAC  W   28    2    4    8   
4298     101108  22300005 2023-11-03    GSW @ OKC  W   28    1    2   13   
4297     101108  22300142 2023-11-05    GSW @ CLE  L   25    5    1    2   
4296     101108  22300145 2023-11-06    GSW @ DET  W   21   17    5    6   
4295     101108  22300169 2023-11-08    GSW @ DEN  L   27    9    5    4   
4294     101108  22300176 2023-11-11  GSW v

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['TS%'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['TS%'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

In [75]:
# --- Prepare Data for Modeling ---

# Check if processed_df exists and is not empty
if 'processed_df' in locals() and not processed_df.empty:
    print("Preparing data for modeling...")
    # Define required columns based on features actually created
    required_cols = []
    # Add base features
    base_features = ['PTS_Roll_3', 'PTS_Roll_5', 'MIN_Roll_3', 'MIN_Roll_5', 
                     'FGA_Roll_3', 'FGA_Roll_5', 'Avg_PTS_Season', 'PTS_Per36_Season',
                     'Opponent_DEF_RATING']
    for col in base_features:
        if col in processed_df.columns:
            required_cols.append(col)
        else:
            print(f"Note: Feature '{col}' not found in processed_df, excluding from model requirements.")
            
    # Add TS% features
    ts_features = ['TS%_Roll_3', 'TS%_Roll_5', 'Avg_TS%_Season']
    for col in ts_features:
         if col in processed_df.columns:
            required_cols.append(col)
         else:
            print(f"Note: TS% Feature '{col}' not found in processed_df, excluding from model requirements.")
            
    # Add new USG% and Rest features
    new_features = ['Player_USG_Proxy_Roll_3', 'Player_USG_Proxy_Roll_5', 'Avg_USG%_Proxy_Season', 'Rest_Days']
    for col in new_features:
        if col in processed_df.columns:
            required_cols.append(col)
        else:
            print(f"Note: New Feature '{col}' not found in processed_df, excluding from model requirements.")
            
    # Add target variable if it exists
    target = 'PTS'
    if target not in processed_df.columns:
        print(f"Error: Target variable '{target}' not found in processed_df. Cannot proceed with modeling.")
        model_df = pd.DataFrame()
        X_train, X_test, y_train, y_test = [None]*4
    else:
        # Drop rows where target or any required feature is missing BEFORE creating model_df
        # Also drop rows with insufficient history for rolling features (implicitly handled by fillna(0) then dropna)
        model_df = processed_df.dropna(subset=[target] + required_cols).copy()

        # Convert Opponent_DEF_RATING to numeric if it exists and wasn't already
        if 'Opponent_DEF_RATING' in model_df.columns:
            model_df['Opponent_DEF_RATING'] = pd.to_numeric(model_df['Opponent_DEF_RATING'], errors='coerce')
            if model_df['Opponent_DEF_RATING'].isnull().any():
                mean_def_rating = model_df['Opponent_DEF_RATING'].mean()
                print(f"Filling NaN Opponent_DEF_RATING with mean: {mean_def_rating}")
                model_df['Opponent_DEF_RATING'].fillna(mean_def_rating, inplace=True)
        
        # Check if enough data remains
        if model_df.empty or len(model_df) < 10: # Arbitrary threshold for minimum data
            print("Not enough data with required features and target to build a model.")
            X_train, X_test, y_train, y_test = [None]*4 
        else:
            # Define features based on columns actually present in model_df
            features = required_cols + ['Is_Home'] # Add Is_Home if it exists
            features = [f for f in features if f in model_df.columns] # Ensure all features exist
            
            print(f"Using features: {features}")
            X = model_df[features]
            y = model_df[target]
            
            # Check for NaN/inf in features or target before split
            if X.isnull().values.any() or y.isnull().values.any() or np.isinf(X.values).any() or np.isinf(y.values).any():
                 print("Warning: NaN or Inf values detected in features or target before train/test split. Attempting to fill with 0.")
                 X = X.fillna(0)
                 y = y.fillna(0)
                 X = X.replace([np.inf, -np.inf], 0)
                 y = y.replace([np.inf, -np.inf], 0)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
            
            print(f"Data prepared for modeling. Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")
else:
    print("Skipping Data Preparation for Modeling because 'processed_df' is not available or empty.")
    X_train, X_test, y_train, y_test = [None]*4

Preparing data for modeling...
Using features: ['PTS_Roll_3', 'PTS_Roll_5', 'MIN_Roll_3', 'MIN_Roll_5', 'FGA_Roll_3', 'FGA_Roll_5', 'Avg_PTS_Season', 'PTS_Per36_Season', 'Opponent_DEF_RATING', 'TS%_Roll_3', 'TS%_Roll_5', 'Avg_TS%_Season', 'Player_USG_Proxy_Roll_3', 'Player_USG_Proxy_Roll_5', 'Avg_USG%_Proxy_Season', 'Rest_Days', 'Is_Home']
Data prepared for modeling. Training set size: 4154, Testing set size: 1039


In [76]:
# --- Model Evaluation Function ---

def evaluate_model(model, X_test, y_test, model_name, X_train_cols=None):
    """Calculates and prints evaluation metrics for a given model."""
    print(f"\n--- {model_name} Evaluation ---")
    
    if model is None or X_test is None or y_test is None or X_test.empty:
        print(f"Skipping {model_name} evaluation as model was not trained or test data was missing.")
        return
        
    try:
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred) 
        rmse = np.sqrt(mse) 

        print(f"Mean Absolute Error (MAE): {mae:.2f}")
        print(f"Root Mean Squared Error (RMSE): {rmse:.2f}") 

        # --- Basic Error Analysis ---
        X_test_results = X_test.copy()
        X_test_results['Actual_PTS'] = y_test
        X_test_results['Predicted_PTS'] = y_pred
        X_test_results['Error'] = X_test_results['Actual_PTS'] - X_test_results['Predicted_PTS']
        X_test_results['Abs_Error'] = np.abs(X_test_results['Error'])
        
        print(f"\nSample Predictions vs Actual ({model_name}):")
        print(X_test_results.head())
        
        print("\nLargest Errors (Top 5):")
        print(X_test_results.sort_values(by='Abs_Error', ascending=False).head())
        
        # Optional: Feature Importances (Specific to tree-based models like XGBoost)
        if hasattr(model, 'feature_importances_') and X_train_cols is not None:
            print(f"\nFeature Importances ({model_name}):")
            importances = pd.DataFrame({
                'Feature': X_train_cols,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            print(importances)

    except Exception as e:
        print(f"Error during {model_name} evaluation: {e}")

In [77]:
# --- Model Building & Tuning (Ridge Regression) ---
model_ridge = None
if X_train is not None and y_train is not None and not X_train.empty:
    print("Tuning Ridge Regression Model...")
    
    param_grid_ridge = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]}
    
    ridge_estimator = Ridge()
    grid_search_ridge = GridSearchCV(ridge_estimator, param_grid_ridge, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    
    try:
        grid_search_ridge.fit(X_train, y_train)
        model_ridge = grid_search_ridge.best_estimator_
        print(f"Best alpha found for Ridge: {grid_search_ridge.best_params_['alpha']}")
        print("Tuned Ridge Regression Model training complete.")
    except Exception as e:
        print(f"Error during Ridge GridSearchCV: {e}")
        model_ridge = None # Ensure model is None if fitting fails
else:
    print("Skipping Ridge tuning due to lack of training data.")

Tuning Ridge Regression Model...
Best alpha found for Ridge: 10.0
Tuned Ridge Regression Model training complete.


In [78]:
# --- Model Evaluation (Tuned Ridge Regression) ---
evaluate_model(model_ridge, X_test, y_test, "Tuned Ridge")


--- Tuned Ridge Evaluation ---
Mean Absolute Error (MAE): 4.50
Root Mean Squared Error (RMSE): 5.91

Sample Predictions vs Actual (Tuned Ridge):
      PTS_Roll_3  PTS_Roll_5  MIN_Roll_3  MIN_Roll_5  FGA_Roll_3  FGA_Roll_5  \
3518   21.333333        19.8   28.666667        27.8   15.000000        15.0   
494     7.666667         9.0   24.333333        27.4    5.000000         5.4   
2834    3.333333         3.0    5.666667         4.8    3.333333         2.8   
2708   19.333333        15.6   34.333333        29.4   16.666667        14.4   
4861    2.666667         1.6    4.333333         3.6    1.333333         1.0   

      Avg_PTS_Season  PTS_Per36_Season  Opponent_DEF_RATING  TS%_Roll_3  ...  \
3518       20.593750         25.871320                115.0    0.613634  ...   
494         7.380952         10.127042                115.0    0.679239  ...   
2834        2.142857         19.285714                115.0    0.339793  ...   
2708       16.750000         18.843750               

In [79]:
# --- Model Building & Tuning (XGBoost) ---
model_xgb_tuned = None
if X_train is not None and y_train is not None and not X_train.empty:
    print("\n--- Tuning XGBoost Model ---")
    
    # Define an expanded parameter grid for more thorough tuning
    param_grid_xgb = {
        'n_estimators': [100, 200, 300],       # More trees
        'max_depth': [3, 5, 7],                # Deeper trees
        'learning_rate': [0.01, 0.05, 0.1],    # Smaller learning rates
        'subsample': [0.7, 0.9, 1.0],          # Vary sample fraction
        'colsample_bytree': [0.7, 0.9, 1.0],   # Vary feature fraction
        'reg_alpha': [0, 0.1, 0.5],            # L1 Regularization
        'reg_lambda': [0.5, 1.0, 1.5]          # L2 Regularization
    }
    
    xgb_estimator = XGBRegressor(random_state=42, objective='reg:squarederror') 
    
    # Use GridSearchCV with more folds
    grid_search_xgb = GridSearchCV(
        estimator=xgb_estimator,
        param_grid=param_grid_xgb,
        scoring='neg_mean_absolute_error', # Optimize for MAE
        cv=5, # Use 5-fold CV for more robust evaluation
        n_jobs=-1, # Use all available CPU cores
        verbose=1 # Print progress
    )
    
    try:
        print("Starting XGBoost GridSearchCV (this may take a while)...")
        grid_search_xgb.fit(X_train, y_train)
        model_xgb_tuned = grid_search_xgb.best_estimator_
        print(f"\nBest parameters found for XGBoost: {grid_search_xgb.best_params_}")
        print(f"Best MAE score during CV: {-grid_search_xgb.best_score_:.2f}")
        print("Tuned XGBoost Model training complete.")
    except Exception as e:
        print(f"Error during XGBoost GridSearchCV: {e}")
        model_xgb_tuned = None # Ensure model is None if fitting fails
else:
    print("Skipping XGBoost tuning due to lack of training data.")


--- Tuning XGBoost Model ---
Starting XGBoost GridSearchCV (this may take a while)...
Fitting 5 folds for each of 2187 candidates, totalling 10935 fits

Best parameters found for XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.9}
Best MAE score during CV: 4.66
Tuned XGBoost Model training complete.


In [80]:
# --- Model Evaluation (Tuned XGBoost) ---
# Pass X_train.columns for feature importance display
train_cols = X_train.columns if X_train is not None else None
evaluate_model(model_xgb_tuned, X_test, y_test, "Tuned XGBoost", X_train_cols=train_cols)


--- Tuned XGBoost Evaluation ---
Mean Absolute Error (MAE): 4.55
Root Mean Squared Error (RMSE): 5.97

Sample Predictions vs Actual (Tuned XGBoost):
      PTS_Roll_3  PTS_Roll_5  MIN_Roll_3  MIN_Roll_5  FGA_Roll_3  FGA_Roll_5  \
3518   21.333333        19.8   28.666667        27.8   15.000000        15.0   
494     7.666667         9.0   24.333333        27.4    5.000000         5.4   
2834    3.333333         3.0    5.666667         4.8    3.333333         2.8   
2708   19.333333        15.6   34.333333        29.4   16.666667        14.4   
4861    2.666667         1.6    4.333333         3.6    1.333333         1.0   

      Avg_PTS_Season  PTS_Per36_Season  Opponent_DEF_RATING  TS%_Roll_3  ...  \
3518       20.593750         25.871320                115.0    0.613634  ...   
494         7.380952         10.127042                115.0    0.679239  ...   
2834        2.142857         19.285714                115.0    0.339793  ...   
2708       16.750000         18.843750           