In [2]:
# Import libraries
import pandas as pd
import os
from nba_api.stats.endpoints import playergamelog, commonplayerinfo, leaguegamefinder, leaguedashteamstats, leaguedashplayerstats
from nba_api.stats.static import players, teams
import time 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit # Added TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error # Added MAPE
from xgboost import XGBRegressor
import lightgbm as lgb # Added LightGBM
import joblib # Added for saving models
import traceback # Added for detailed error printing

In [3]:
# Get all players
nba_players = players.get_players()
players_df = pd.DataFrame(nba_players)
print(f"Total players found: {len(players_df)}")
# players_df.head() # Optionally display head

Total players found: 5024


In [4]:
# Get all teams
nba_teams = teams.get_teams()
teams_df = pd.DataFrame(nba_teams)
print(f"Total teams found: {len(teams_df)}")
# teams_df.head() # Optionally display head

Total teams found: 30


In [5]:
# --- Define Season --- 
SEASON = '2024-25' # <<< CORRECTED SEASON HERE
print(f"Processing data for SEASON: {SEASON}")

# --- Create directories for outputs ---
MODEL_DIR = f'models/{SEASON}'
os.makedirs(MODEL_DIR, exist_ok=True)
print(f"Model output directory: {MODEL_DIR}")

# --- Fetch Team Defensive Stats and Pace for the season ---
print(f"Fetching team stats (Defense & Pace) for {SEASON}...")
team_stats_df = pd.DataFrame() 
try:
    # Fetch stats, including Pace
    team_stats = leaguedashteamstats.LeagueDashTeamStats(
        season=SEASON, # Uses the SEASON variable
        measure_type_detailed_defense='Defense' # This measure type includes Pace in the result
    )
    temp_df = team_stats.get_data_frames()[0]
    time.sleep(0.6) # Add delay after API call
    
    print("Available columns in team stats:")
    print(temp_df.columns) 

    # Identify the team column (prefer TEAM_ABBREVIATION for merging later)
    identifier_column = None
    if 'TEAM_ABBREVIATION' in temp_df.columns:
        identifier_column = 'TEAM_ABBREVIATION'
        print(f"Using '{identifier_column}' as identifier.")
    elif 'TEAM_NAME' in temp_df.columns:
        identifier_column = 'TEAM_NAME'
        print(f"Using '{identifier_column}' as identifier.")
    elif 'TEAM_ID' in temp_df.columns:
        identifier_column = 'TEAM_ID'
        print(f"Using '{identifier_column}' as identifier.")
    else:
        raise KeyError(f"Could not find a suitable team identifier column. Available: {temp_df.columns}")

    # Select relevant columns (identifier, DEF_RATING, PACE)
    required_team_cols = [identifier_column, 'DEF_RATING', 'PACE']
    available_team_cols = [col for col in required_team_cols if col in temp_df.columns]
    
    if len(available_team_cols) < 2: # Need at least identifier and one stat
         print(f"Warning: Not enough required team columns found. Found: {available_team_cols}")
         team_stats_df = pd.DataFrame() # Ensure it's empty
    else:
        team_stats_df = temp_df[available_team_cols].copy()
        
        # If identifier is not abbreviation, try to merge to get abbreviation
        if identifier_column != 'TEAM_ABBREVIATION' and 'teams_df' in locals() and not teams_df.empty:
             print(f"Merging team stats to get TEAM_ABBREVIATION using '{identifier_column}'...")
             merge_left_col = identifier_column
             merge_right_col = 'full_name' if identifier_column == 'TEAM_NAME' else 'id'
             
             if identifier_column == 'TEAM_ID':
                 team_stats_df[identifier_column] = team_stats_df[identifier_column].astype(int)
                 teams_df['id'] = teams_df['id'].astype(int)

             merged_temp_df = pd.merge(team_stats_df, teams_df[['id', 'full_name', 'abbreviation']], left_on=merge_left_col, right_on=merge_right_col, how='left')
             
             if 'abbreviation' in merged_temp_df.columns:
                 # Keep abbreviation and the stats, drop the original identifier
                 cols_to_keep = ['abbreviation'] + [col for col in available_team_cols if col != identifier_column]
                 team_stats_df = merged_temp_df[cols_to_keep].rename(columns={'abbreviation': 'TEAM_ABBREVIATION'})
                 print("Merged successfully, using TEAM_ABBREVIATION.")
             else:
                  print(f"Warning: Could not find 'abbreviation' after merging with teams_df based on '{identifier_column}'. Using original identifier column.")
                  team_stats_df = team_stats_df.rename(columns={identifier_column: 'TEAM_ABBREVIATION'}) # Rename for consistency if we couldn't merge
        else: # If identifier was already abbreviation or teams_df not available
             team_stats_df = team_stats_df.rename(columns={identifier_column: 'TEAM_ABBREVIATION'})

    if not team_stats_df.empty and 'TEAM_ABBREVIATION' in team_stats_df.columns:
        print("\nTeam stats (Defense & Pace) processed.")
        print(team_stats_df.head())
    else:
         print("\nCould not process team stats correctly after renaming/merging.")
         team_stats_df = pd.DataFrame() # Ensure empty on failure

except Exception as e:
    print(f"\nError fetching or processing team stats: {e}")
    traceback.print_exc()
    team_stats_df = pd.DataFrame() # Ensure it's empty on error

Processing data for SEASON: 2024-25
Model output directory: models/2024-25
Fetching team stats (Defense & Pace) for 2024-25...
Available columns in team stats:
Index(['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'DEF_RATING',
       'DREB', 'DREB_PCT', 'STL', 'BLK', 'OPP_PTS_OFF_TOV',
       'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'DEF_RATING_RANK',
       'DREB_RANK', 'DREB_PCT_RANK', 'STL_RANK', 'BLK_RANK',
       'OPP_PTS_OFF_TOV_RANK', 'OPP_PTS_2ND_CHANCE_RANK', 'OPP_PTS_FB_RANK',
       'OPP_PTS_PAINT_RANK'],
      dtype='object')
Using 'TEAM_NAME' as identifier.
Merging team stats to get TEAM_ABBREVIATION using 'TEAM_NAME'...
Merged successfully, using TEAM_ABBREVIATION.

Team stats (Defense & Pace) processed.
  TEAM_ABBREVIATION  DEF_RATING
0               ATL       114.8
1               BOS       110.1
2               BKN       115.4
3               CHA       115.7
4               CHI       

In [6]:
# Function to get game logs for a player and season with delay
def get_player_log(player_id, season='2024-25'): #<<< DEFAULT SEASON CORRECTED HERE
    print(f"Fetching logs for player {player_id} for season {season}...")
    try:
        # Note: PlayerGameLog endpoint provides FGA, PTS, FTA, and TOV
        log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        df = log.get_data_frames()[0]
        time.sleep(0.6) # NBA API rate limit
        return df
    except Exception as e:
        print(f"Error fetching logs for player {player_id} (season {season}): {e}")
        time.sleep(0.6)
        return pd.DataFrame()

# --- Define Season-Specific Output File ---
# SEASON variable is defined in the previous cell
RAW_GAMELOG_FILE = f'nba_gamelogs_raw_{SEASON}.csv' #<<< FILENAME NOW INCLUDES CORRECT SEASON
MIN_MINUTES_THRESHOLD = 15 # Minimum average minutes per game to be included
MAX_PLAYERS_TO_FETCH = 150 # Limit players for faster fetching if needed - Increased slightly

# --- Check if Processed Data Exists for THIS Season ---
FETCH_REQUIRED = False
if os.path.exists(RAW_GAMELOG_FILE):
    print(f"Loading existing raw game logs from {RAW_GAMELOG_FILE}...")
    try:
        all_gamelogs_df = pd.read_csv(RAW_GAMELOG_FILE)
        # Ensure Player_ID is integer if loaded from CSV
        if 'Player_ID' in all_gamelogs_df.columns:
            all_gamelogs_df['Player_ID'] = all_gamelogs_df['Player_ID'].astype(int)
        # Check if essential columns are present (including TOV, FGA, FTA)
        essential_cols_check = ['TOV', 'FGA', 'FTA', 'PTS', 'MIN']
        missing_essential = [col for col in essential_cols_check if col not in all_gamelogs_df.columns]
        if missing_essential:
            print(f"Warning: Missing essential columns ({missing_essential}) from existing CSV '{RAW_GAMELOG_FILE}'. Re-fetching required.")
            FETCH_REQUIRED = True
        else:
            print(f"Essential columns found in existing CSV '{RAW_GAMELOG_FILE}'.")
    except Exception as e:
        print(f"Error loading or checking CSV file {RAW_GAMELOG_FILE}: {e}. Re-fetching required.")
        FETCH_REQUIRED = True
        all_gamelogs_df = pd.DataFrame() # Ensure it's empty if loading failed
else:
    print(f"Raw game log file '{RAW_GAMELOG_FILE}' not found. Fetching data...")
    FETCH_REQUIRED = True
    all_gamelogs_df = pd.DataFrame()

# --- Fetch Data if Required ---
if FETCH_REQUIRED:
    # Remove old file if it exists but is incomplete
    if os.path.exists(RAW_GAMELOG_FILE):
        print(f"Removing potentially incomplete file: {RAW_GAMELOG_FILE}")
        try:
            os.remove(RAW_GAMELOG_FILE)
        except OSError as e:
            print(f"Error removing file: {e}")
            
    # --- Filter Players Based on Season Stats (e.g., Minutes Played) ---
    print(f"Fetching player stats for {SEASON} to filter...")
    player_ids_to_fetch = []
    try:
        player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season=SEASON) # Uses SEASON variable
        player_stats_df = player_stats.get_data_frames()[0]
        time.sleep(0.6)
        
        # Filter players playing significant minutes
        # Ensure MIN is numeric before filtering
        player_stats_df['MIN'] = pd.to_numeric(player_stats_df['MIN'], errors='coerce')
        relevant_players_df = player_stats_df[player_stats_df['MIN'] >= MIN_MINUTES_THRESHOLD].copy()
        player_ids_to_fetch = relevant_players_df['PLAYER_ID'].unique().tolist()
        print(f"Found {len(player_ids_to_fetch)} players averaging >= {MIN_MINUTES_THRESHOLD} MPG for {SEASON}.")
        
    except Exception as e:
        print(f"Error fetching player stats for {SEASON} filtering: {e}. Falling back to all active players.")
        traceback.print_exc()
        # Fallback: Get all active players if stats fetch fails
        active_players_df = players_df[players_df['is_active'] == True]
        player_ids_to_fetch = active_players_df['id'].tolist()
        print(f"Fetching for all {len(player_ids_to_fetch)} currently active players (fallback).")
        
    # --- Limit players if needed ---
    if len(player_ids_to_fetch) > MAX_PLAYERS_TO_FETCH:
        print(f"Limiting fetch to {MAX_PLAYERS_TO_FETCH} players for speed.")
        player_ids_to_fetch = player_ids_to_fetch[:MAX_PLAYERS_TO_FETCH]
        
    # --- Fetching game logs for filtered players ---
    if player_ids_to_fetch:
        print(f"Fetching game logs for {len(player_ids_to_fetch)} players for season {SEASON}...")
        fetched_logs = [] # Collect dataframes in a list first
        for i, p_id in enumerate(player_ids_to_fetch):
            print(f"Progress: {i+1}/{len(player_ids_to_fetch)}")
            player_log_df = get_player_log(p_id, season=SEASON) # <<< PASS SEASON HERE
            if not player_log_df.empty:
                # Add Player_ID if it's missing (sometimes happens)
                if 'Player_ID' not in player_log_df.columns:
                     player_log_df['Player_ID'] = p_id
                fetched_logs.append(player_log_df)
                
        # --- Concatenate and Save the fetched data ---
        if fetched_logs:
            all_gamelogs_df = pd.concat(fetched_logs, ignore_index=True)
            print(f"\nSaving {len(all_gamelogs_df)} game logs to {RAW_GAMELOG_FILE}...")
            all_gamelogs_df.to_csv(RAW_GAMELOG_FILE, index=False)
            print("Save complete.")
        else:
            print("\nNo game logs were fetched or concatenated.")
            all_gamelogs_df = pd.DataFrame() # Ensure it's an empty DF if nothing was fetched
    else:
        print("\nNo player IDs identified for fetching.")
        all_gamelogs_df = pd.DataFrame()

# --- Display results ---
if 'all_gamelogs_df' in locals() and not all_gamelogs_df.empty:
    print(f"\nTotal game logs available for {SEASON}: {len(all_gamelogs_df)}")
    print(f"Unique players in logs: {all_gamelogs_df['Player_ID'].nunique()}")
    # Check essential columns after loading/fetching
    essential_cols_check = ['FTA', 'TOV', 'FGA', 'PTS', 'MIN', 'GAME_DATE', 'MATCHUP']
    for col in essential_cols_check:
        if col in all_gamelogs_df.columns:
            print(f"'{col}' column successfully included.")
        else:
            print(f"Warning: '{col}' column is missing from the loaded/fetched data!")
    print(all_gamelogs_df.head())
else:
    print(f"\nall_gamelogs_df is empty for season {SEASON}. Cannot proceed with preprocessing/modeling.")

Raw game log file 'nba_gamelogs_raw_2024-25.csv' not found. Fetching data...
Fetching player stats for 2024-25 to filter...
Found 553 players averaging >= 15 MPG for 2024-25.
Limiting fetch to 150 players for speed.
Fetching game logs for 150 players for season 2024-25...
Progress: 1/150
Fetching logs for player 1630639 for season 2024-25...
Progress: 2/150
Fetching logs for player 1631260 for season 2024-25...
Progress: 3/150
Fetching logs for player 1642358 for season 2024-25...
Progress: 4/150
Fetching logs for player 203932 for season 2024-25...
Progress: 5/150
Fetching logs for player 1628988 for season 2024-25...
Progress: 6/150
Fetching logs for player 1630174 for season 2024-25...
Progress: 7/150
Fetching logs for player 1630598 for season 2024-25...
Progress: 8/150
Fetching logs for player 1641745 for season 2024-25...
Progress: 9/150
Fetching logs for player 1641766 for season 2024-25...
Progress: 10/150
Fetching logs for player 1641737 for season 2024-25...
Progress: 11/150


In [7]:
# --- Data Preprocessing ---

# Check if all_gamelogs_df exists and is not empty before proceeding
if 'all_gamelogs_df' in locals() and not all_gamelogs_df.empty:
    processed_df = all_gamelogs_df.copy()
    
    # Ensure GAME_DATE is datetime and sort for time-based features
    if 'GAME_DATE' in processed_df.columns:
        processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])
    else:
        print("Error: 'GAME_DATE' column missing! Cannot proceed with time-based processing.")
        processed_df = pd.DataFrame() # Empty the df to prevent further errors

    if not processed_df.empty:
        processed_df = processed_df.sort_values(by=['Player_ID', 'GAME_DATE'])

        # Select relevant columns (including FGA, FTA, and TOV)
        # Ensure all expected columns exist, handle missing ones if necessary
        expected_cols = ['Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 
                         'MIN', 'PTS', 'REB', 'AST', 'FG3M', 'STL', 'BLK', 'TOV', 'FGA', 'FTA'] # Added TOV
        available_cols = [col for col in expected_cols if col in processed_df.columns]
        missing_cols = [col for col in expected_cols if col not in processed_df.columns]
        if missing_cols:
            print(f"Warning: Missing expected columns for initial selection: {missing_cols}. Proceeding with available columns.")
        processed_df = processed_df[available_cols]

        # Ensure necessary columns for calculations are numeric, coercing errors
        numeric_cols_check = ['PTS', 'FGA', 'FTA', 'MIN', 'TOV', 'REB', 'AST', 'FG3M', 'STL', 'BLK'] # Expanded numeric checks
        for col in numeric_cols_check:
            if col in processed_df.columns:
                processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
            else:
                 print(f"Warning: Column {col} needed for processing is missing.")
                 
        # Drop rows where essential numeric columns became NaN after coercion
        # Important: Check if columns exist before adding to subset
        dropna_subset = ['PTS', 'FGA', 'FTA', 'MIN']
        if 'TOV' in processed_df.columns: dropna_subset.append('TOV')
        if 'FGA' in processed_df.columns: dropna_subset.append('FGA')
        if 'FTA' in processed_df.columns: dropna_subset.append('FTA')
        
        # Only drop if the columns actually exist in the DataFrame
        actual_dropna_subset = [col for col in dropna_subset if col in processed_df.columns]
        processed_df.dropna(subset=actual_dropna_subset, inplace=True)

        # Parse Matchup to get Opponent and Home/Away
        def parse_matchup(matchup_str):
            if pd.isna(matchup_str):
                 return 'Unknown', 'Unknown'
            if '@' in matchup_str:
                parts = matchup_str.split(' @ ')
                opponent = parts[1] if len(parts) > 1 else 'Unknown'
                home_away = 'Away'
            elif 'vs.' in matchup_str:
                parts = matchup_str.split(' vs. ')
                opponent = parts[1] if len(parts) > 1 else 'Unknown'
                home_away = 'Home'
            else: 
                opponent = 'Unknown'
                home_away = 'Unknown'
            return opponent, home_away

        if 'MATCHUP' in processed_df.columns:
            # Apply the function and assign results back as new columns
            parsed_matchup = processed_df['MATCHUP'].apply(lambda x: pd.Series(parse_matchup(x), index=['Opponent', 'Home_Away']))
            processed_df = pd.concat([processed_df, parsed_matchup], axis=1)
            # Handle potential empty results from apply if all MATCHUP were NaN
            if 'Opponent' not in processed_df.columns:
                 processed_df['Opponent'] = 'Unknown'
            if 'Home_Away' not in processed_df.columns:
                 processed_df['Home_Away'] = 'Unknown'
        else:
            print("Warning: 'MATCHUP' column not found. Cannot determine Opponent or Home/Away.")
            processed_df['Opponent'] = 'Unknown'
            processed_df['Home_Away'] = 'Unknown'

        print("Data preprocessing complete.")
        print(processed_df.head())
    
else:
    print("Skipping Data Preprocessing because 'all_gamelogs_df' is not available or empty.")
    processed_df = pd.DataFrame() # Ensure processed_df exists even if empty

  processed_df['GAME_DATE'] = pd.to_datetime(processed_df['GAME_DATE'])


Data preprocessing complete.
      Player_ID     Game_ID  GAME_DATE      MATCHUP WL  MIN  PTS  REB  AST  \
4083     101108  0022400074 2024-10-24    SAS @ DAL  L   29    3    7    8   
4082     101108  0022400094 2024-10-26  SAS vs. HOU  W   27    3    0    9   
4081     101108  0022400109 2024-10-28  SAS vs. HOU  L   29   16    1    3   
4080     101108  0022400125 2024-10-30    SAS @ OKC  L   26   14    5    9   
4079     101108  0022400130 2024-10-31    SAS @ UTA  W   32   19    7   10   

      FG3M  STL  BLK  TOV  FGA  FTA Opponent Home_Away  
4083     1    1    0    2    6    0      DAL      Away  
4082     1    1    0    4    3    0      HOU      Home  
4081     3    1    0    2   10    3      HOU      Home  
4080     3    1    1    3    9    1      OKC      Away  
4079     3    2    0    1   13    0      UTA      Away  


In [8]:
# --- Feature Engineering ---

# Check if processed_df exists and is not empty
if 'processed_df' in locals() and not processed_df.empty:
    print("Starting Feature Engineering...")
    
    # Calculate Rest Days
    processed_df['Rest_Days'] = processed_df.groupby('Player_ID')['GAME_DATE'].diff().dt.days
    # Fill first game NaN with a reasonable value (e.g., mode, average, or specific indicator)
    # Using mode is better than a fixed value like 2 if the typical rest isn't 2 days
    if not processed_df['Rest_Days'].mode().empty:
        processed_df['Rest_Days'].fillna(processed_df['Rest_Days'].mode()[0], inplace=True)
    else:
        processed_df['Rest_Days'].fillna(2, inplace=True) # Fallback if mode is empty
    print("Calculated Rest Days.")

    # Add Is_B2B_Second_Night feature
    # B2B is when Rest_Days is exactly 1
    processed_df['Is_B2B_Second_Night'] = processed_df['Rest_Days'].apply(lambda x: 1 if x == 1 else 0)
    print("Calculated Is_B2B_Second_Night.")
    
    # Calculate Player Usage Rate Proxy (Requires FGA, FTA, TOV, MIN)
    # Using the simplified player-level proxy, but will use EWMA on this.
    if all(col in processed_df.columns for col in ['FGA', 'FTA', 'TOV', 'MIN']):
        processed_df['TOV'] = pd.to_numeric(processed_df['TOV'], errors='coerce').fillna(0)
        processed_df['FGA'] = pd.to_numeric(processed_df['FGA'], errors='coerce').fillna(0)
        processed_df['FTA'] = pd.to_numeric(processed_df['FTA'], errors='coerce').fillna(0)
        processed_df['MIN'] = pd.to_numeric(processed_df['MIN'], errors='coerce').fillna(0)
        
        usg_numerator = processed_df['FGA'] + 0.44 * processed_df['FTA'] + processed_df['TOV']
        usg_denominator = processed_df['MIN'] # Using Player MIN as proxy denominator
        processed_df['Player_USG_Proxy'] = np.where(usg_denominator == 0, 0, usg_numerator / usg_denominator)
        processed_df['Player_USG_Proxy'].fillna(0, inplace=True)
        processed_df['Player_USG_Proxy'].replace([np.inf, -np.inf], 0, inplace=True)
        print("Calculated Player Usage Rate Proxy.")
    else:
        missing = [col for col in ['FGA', 'FTA', 'TOV', 'MIN'] if col not in processed_df.columns]
        print(f"Warning: Could not calculate Player Usage Rate Proxy due to missing columns: {missing}.")
        processed_df['Player_USG_Proxy'] = 0 # Assign default value
        
    # Calculate True Shooting Percentage (TS%)
    if all(col in processed_df.columns for col in ['PTS', 'FGA', 'FTA']):
        processed_df['PTS'] = pd.to_numeric(processed_df['PTS'], errors='coerce').fillna(0)
        processed_df['FGA'] = pd.to_numeric(processed_df['FGA'], errors='coerce').fillna(0)
        processed_df['FTA'] = pd.to_numeric(processed_df['FTA'], errors='coerce').fillna(0)
        
        denominator = 2 * (processed_df['FGA'] + 0.44 * processed_df['FTA'])
        processed_df['TS%'] = np.where(denominator == 0, 0, processed_df['PTS'] / denominator)
        processed_df['TS%'].fillna(0, inplace=True)
        processed_df['TS%'].replace([np.inf, -np.inf], 0, inplace=True)
        print("Calculated TS%.")
    else:
        missing = [col for col in ['PTS', 'FGA', 'FTA'] if col not in processed_df.columns]
        print(f"Warning: Could not calculate TS% due to missing columns: {missing}.")
        processed_df['TS%'] = 0 # Assign default value

    # Exponentially Weighted Moving Averages (Sharper Recency)
    cols_for_ewma = ['PTS', 'MIN', 'FGA', 'FTA', 'TS%', 'Player_USG_Proxy'] # Use the calculated features
    ewma_spans = [3, 5] # Spans for EWMA
    
    for col in cols_for_ewma:
        if col in processed_df.columns:
            # Ensure column is numeric before EWMA calculation
            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce').fillna(0)
            
            for span in ewma_spans:
                # Calculate EWMA and shift by 1 to get previous game's average
                # Using adjust=False provides a more stable weighting for time series
                processed_df[f'{col}_EWMA_{span}'] = processed_df.groupby('Player_ID')[col].transform(
                    lambda x: x.ewm(span=span, adjust=False).mean().shift(1)
                )
        else:
            print(f"Warning: Column '{col}' not found for EWMA calculation.")

    print("Calculated EWMA features.")

    # Cumulative Season Averages (Shifted) - Keep these for long-term context
    # Ensure required columns exist before cumulative calculations
    cum_avg_base_cols = ['PTS', 'MIN']
    cum_avg_usg_ts_cols = ['FGA', 'FTA', 'TOV'] # Needed for Avg_TS/USG_Season

    # Check if base columns exist
    if all(col in processed_df.columns for col in cum_avg_base_cols):
        processed_df['Cum_PTS'] = processed_df.groupby('Player_ID')['PTS'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_MIN'] = processed_df.groupby('Player_ID')['MIN'].transform(lambda x: x.expanding().sum().shift(1))
        processed_df['Cum_Games'] = processed_df.groupby('Player_ID').cumcount() 

        processed_df['Avg_PTS_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_Games']).replace([np.inf, -np.inf, np.nan], 0)
        processed_df['PTS_Per36_Season'] = (processed_df['Cum_PTS'] / processed_df['Cum_MIN'] * 36).replace([np.inf, -np.inf, np.nan], 0)
        print("Calculated base cumulative averages.")
    else:
         missing_base_cum = [col for col in cum_avg_base_cols if col not in processed_df.columns]
         print(f"Warning: One or more columns ({missing_base_cum}) missing for base cumulative average calculation.")

    # Check if USG/TS columns exist for their cumulative averages
    if all(col in processed_df.columns for col in cum_avg_usg_ts_cols):
         # Need Cum_PTS, Cum_FGA, Cum_FTA, Cum_TOV, Cum_MIN
        if 'Cum_PTS' not in processed_df.columns:
             processed_df['Cum_PTS'] = processed_df.groupby('Player_ID')['PTS'].transform(lambda x: x.expanding().sum().shift(1)).fillna(0)
        if 'Cum_MIN' not in processed_df.columns:
             processed_df['Cum_MIN'] = processed_df.groupby('Player_ID')['MIN'].transform(lambda x: x.expanding().sum().shift(1)).fillna(0)
             
        processed_df['Cum_FGA'] = processed_df.groupby('Player_ID')['FGA'].transform(lambda x: x.expanding().sum().shift(1)).fillna(0)
        processed_df['Cum_FTA'] = processed_df.groupby('Player_ID')['FTA'].transform(lambda x: x.expanding().sum().shift(1)).fillna(0)
        processed_df['Cum_TOV'] = processed_df.groupby('Player_ID')['TOV'].transform(lambda x: x.expanding().sum().shift(1)).fillna(0)
        
        # Cumulative TS% calculation
        cum_ts_denominator = 2 * (processed_df['Cum_FGA'] + 0.44 * processed_df['Cum_FTA'])
        processed_df['Avg_TS%_Season'] = np.where(cum_ts_denominator == 0, 0, processed_df['Cum_PTS'] / cum_ts_denominator)
        processed_df['Avg_TS%_Season'].fillna(0, inplace=True)
        processed_df['Avg_TS%_Season'].replace([np.inf, -np.inf], 0, inplace=True)
        
        # Cumulative USG% Proxy calculation
        # Sticking to simplified proxy with player MIN 
        cum_usg_numerator = processed_df['Cum_FGA'] + 0.44 * processed_df['Cum_FTA'] + processed_df['Cum_TOV']
        cum_usg_denominator = processed_df['Cum_MIN'] 
        processed_df['Avg_USG%_Proxy_Season'] = np.where(cum_usg_denominator == 0, 0, cum_usg_numerator / cum_usg_denominator)
        processed_df['Avg_USG%_Proxy_Season'].fillna(0, inplace=True)
        processed_df['Avg_USG%_Proxy_Season'].replace([np.inf, -np.inf], 0, inplace=True)
        
        print("Calculated cumulative averages for TS% and USG% Proxy.")
    else:
         missing_usg_ts_cum = [col for col in cum_avg_usg_ts_cols if col not in processed_df.columns]
         print(f"Warning: One or more columns ({missing_usg_ts_cum}) missing for TS/USG cumulative average calculation.")

    # Other Features
    if 'Home_Away' in processed_df.columns:
        processed_df['Is_Home'] = processed_df['Home_Away'].apply(lambda x: 1 if x == 'Home' else 0)
    else:
        processed_df['Is_Home'] = 0 # Default if Home_Away is missing

    # Merge Opponent Stats (DEF_RATING and PACE)
    # Use fallback values if team_stats_df is empty (due to fetch error)
    DEFAULT_DEF_RATING = 115.0 
    DEFAULT_PACE = 100.0

    if 'team_stats_df' in locals() and not team_stats_df.empty and 'TEAM_ABBREVIATION' in team_stats_df.columns and 'Opponent' in processed_df.columns:
        print("Merging fetched team defensive stats and pace...")
        team_stats_to_merge = team_stats_df.rename(columns={
            'TEAM_ABBREVIATION': 'Opponent',
            'DEF_RATING': 'Opponent_DEF_RATING',
            'PACE': 'Opponent_PACE'
        })
        
        # Ensure 'Opponent' column type alignment
        try:
            processed_df['Opponent'] = processed_df['Opponent'].astype(team_stats_to_merge['Opponent'].dtype)
        except Exception as e:
            print(f"Warning: Could not align Opponent column types for merge: {e}")
            
        # Merge only the required columns from team_stats_to_merge
        merge_cols = ['Opponent']
        if 'Opponent_DEF_RATING' in team_stats_to_merge.columns: merge_cols.append('Opponent_DEF_RATING')
        if 'Opponent_PACE' in team_stats_to_merge.columns: merge_cols.append('Opponent_PACE')

        if len(merge_cols) > 1:
            processed_df = pd.merge(processed_df, team_stats_to_merge[merge_cols], on='Opponent', how='left')
            
            # Handle NaNs after merge
            if 'Opponent_DEF_RATING' in processed_df.columns:
                 processed_df['Opponent_DEF_RATING'] = pd.to_numeric(processed_df['Opponent_DEF_RATING'], errors='coerce')
                 avg_def_rating = processed_df['Opponent_DEF_RATING'].mean()
                 fill_value_def = avg_def_rating if not pd.isna(avg_def_rating) else DEFAULT_DEF_RATING
                 processed_df['Opponent_DEF_RATING'].fillna(fill_value_def, inplace=True)
                 print(f"Opponent DEF_RATING merged. Filled NaNs with {fill_value_def:.1f}.")
            else:
                 print("Warning: 'Opponent_DEF_RATING' column not created after merge. Using default.")
                 processed_df['Opponent_DEF_RATING'] = DEFAULT_DEF_RATING

            if 'Opponent_PACE' in processed_df.columns:
                 processed_df['Opponent_PACE'] = pd.to_numeric(processed_df['Opponent_PACE'], errors='coerce')
                 avg_pace = processed_df['Opponent_PACE'].mean()
                 fill_value_pace = avg_pace if not pd.isna(avg_pace) else DEFAULT_PACE
                 processed_df['Opponent_PACE'].fillna(fill_value_pace, inplace=True)
                 print(f"Opponent PACE merged. Filled NaNs with {fill_value_pace:.1f}.")
            else:
                 print("Warning: 'Opponent_PACE' column not created after merge. Using default.")
                 processed_df['Opponent_PACE'] = DEFAULT_PACE

        else:
             print("Warning: No relevant team stats columns found for merging.")
             processed_df['Opponent_DEF_RATING'] = DEFAULT_DEF_RATING
             processed_df['Opponent_PACE'] = DEFAULT_PACE

    else:
        print(f"Warning: Team stats not available or Opponent column missing. Using default DEF_RATING ({DEFAULT_DEF_RATING}) and PACE ({DEFAULT_PACE}).")
        processed_df['Opponent_DEF_RATING'] = DEFAULT_DEF_RATING
        processed_df['Opponent_PACE'] = DEFAULT_PACE

    # Final Fill NA for all engineered features (EWMA, cumulative, boolean, opponent)
    # Identify all potential feature columns created dynamically
    engineered_cols = [col for col in processed_df.columns 
                       if '_EWMA_' in col 
                       or '_Roll_' in col # Keep roll for now if EWMA fails, or remove if EWMA is primary
                       or '_Season' in col 
                       or col in ['Is_Home', 'Opponent_DEF_RATING', 'Opponent_PACE', 'Rest_Days', 'Is_B2B_Second_Night']]
    
    # Filter to only columns that actually exist in the DataFrame
    existing_engineered_cols = [col for col in engineered_cols if col in processed_df.columns]
    
    # Fill NaNs created by shifting/EWMA at the beginning of a player's season
    if existing_engineered_cols:
        processed_df[existing_engineered_cols] = processed_df[existing_engineered_cols].fillna(0)
        print(f"Filled NaNs in engineered features: {existing_engineered_cols}")
    else:
        print("No engineered features found to fill NaNs.")

    print("Feature Engineering complete.")
    print(processed_df.head(10))
else:
    print("Skipping Feature Engineering because 'processed_df' is not available or empty.")

Starting Feature Engineering...
Calculated Rest Days.
Calculated Is_B2B_Second_Night.
Calculated Player Usage Rate Proxy.
Calculated TS%.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Rest_Days'].fillna(processed_df['Rest_Days'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Player_USG_Proxy'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

Calculated EWMA features.
Calculated base cumulative averages.
Calculated cumulative averages for TS% and USG% Proxy.
Merging fetched team defensive stats and pace...
Opponent DEF_RATING merged. Filled NaNs with 113.8.
Filled NaNs in engineered features: ['Rest_Days', 'Is_B2B_Second_Night', 'PTS_EWMA_3', 'PTS_EWMA_5', 'MIN_EWMA_3', 'MIN_EWMA_5', 'FGA_EWMA_3', 'FGA_EWMA_5', 'FTA_EWMA_3', 'FTA_EWMA_5', 'TS%_EWMA_3', 'TS%_EWMA_5', 'Player_USG_Proxy_EWMA_3', 'Player_USG_Proxy_EWMA_5', 'Avg_PTS_Season', 'PTS_Per36_Season', 'Avg_TS%_Season', 'Avg_USG%_Proxy_Season', 'Is_Home', 'Opponent_DEF_RATING', 'Opponent_PACE']
Feature Engineering complete.
   Player_ID     Game_ID  GAME_DATE      MATCHUP WL  MIN  PTS  REB  AST  FG3M  \
0     101108  0022400074 2024-10-24    SAS @ DAL  L   29    3    7    8     1   
1     101108  0022400094 2024-10-26  SAS vs. HOU  W   27    3    0    9     1   
2     101108  0022400109 2024-10-28  SAS vs. HOU  L   29   16    1    3     3   
3     101108  0022400125 202

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Avg_TS%_Season'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_df['Avg_TS%_Season'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

In [9]:
# --- Prepare Data for Modeling ---

# Check if processed_df exists and is not empty
if 'processed_df' in locals() and not processed_df.empty:
    print("Preparing data for modeling...")
    
    # Define required columns based on features engineered
    # Prefer EWMA over simple rolling if both exist
    feature_candidates = [
        'PTS_EWMA_3', 'PTS_EWMA_5',
        'MIN_EWMA_3', 'MIN_EWMA_5',
        'FGA_EWMA_3', 'FGA_EWMA_5',
        'FTA_EWMA_3', 'FTA_EWMA_5',
        'TS%_EWMA_3', 'TS%_EWMA_5',
        'Player_USG_Proxy_EWMA_3', 'Player_USG_Proxy_EWMA_5',
        'Avg_PTS_Season', 'PTS_Per36_Season', 'Avg_TS%_Season', 'Avg_USG%_Proxy_Season',
        'Opponent_DEF_RATING', 'Opponent_PACE',
        'Rest_Days', 'Is_B2B_Second_Night', 'Is_Home'
    ]
    
    # Filter to only columns that actually exist in processed_df
    features = [col for col in feature_candidates if col in processed_df.columns]
    
    # Add simple rolling average fallbacks if EWMA was not created
    fallback_features = [
        'PTS_Roll_3', 'PTS_Roll_5',
        'MIN_Roll_3', 'MIN_Roll_5',
        'FGA_Roll_3', 'FGA_Roll_5',
        'FTA_Roll_3', 'FTA_Roll_5',
        'TS%_Roll_3', 'TS%_Roll_5',
        'Player_USG_Proxy_Roll_3', 'Player_USG_Proxy_Roll_5'
    ]
    for fb_col in fallback_features:
        # Add the fallback if the EWMA equivalent is NOT in features AND the fallback exists
        ewma_equiv = fb_col.replace('_Roll_', '_EWMA_')
        if fb_col in processed_df.columns and ewma_equiv not in features and fb_col not in features:
             features.append(fb_col)
             
    print(f"Features selected for modeling: {features}")

    # Define target variable
    target = 'PTS'
    
    if target not in processed_df.columns:
        print(f"Error: Target variable '{target}' not found in processed_df. Cannot proceed with modeling.")
        model_df = pd.DataFrame()
        X_train, X_test, y_train, y_test, y_test_original = [None]*5
    else:
        # Drop rows where target or any selected feature is missing
        # NaNs in engineered features should be handled by fillna(0) in Feature Engineering 
        subset_for_dropna = [target] + features
        actual_subset_for_dropna = [col for col in subset_for_dropna if col in processed_df.columns]
        model_df = processed_df.dropna(subset=actual_subset_for_dropna).copy()

        # Check if enough data remains
        if model_df.empty or len(model_df) < 10:
            print("Not enough data with required features and target to build a model.")
            X_train, X_test, y_train, y_test, y_test_original = [None]*5 
        else:
            X = model_df[features]
            y = model_df[target]
            
            # Check for NaN/inf in features or target before split
            if X.isnull().values.any() or y.isnull().values.any() or np.isinf(X.values).any() or np.isinf(y.values).any():
                 print("Warning: NaN or Inf values detected before train/test split. Attempting to fill with 0.")
                 X = X.fillna(0)
                 y = y.fillna(0)
                 X = X.replace([np.inf, -np.inf], 0)
                 y = y.replace([np.inf, -np.inf], 0)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
            
            y_test_original = y_test.copy()

            # Apply Target Transformation (log1p)
            print(f"Applying log1p transformation to target variable '{target}'.")
            y_train = np.log1p(y_train)
            y_test = np.log1p(y_test)
            
            print(f"Data prepared for modeling (target transformed). Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")
            print(f"Original Min/Max PTS in Test Set: {y_test_original.min()}/{y_test_original.max()}")
            print(f"Transformed Min/Max PTS in Test Set: {y_test.min():.4f}/{y_test.max():.4f}")

else:
    print("Skipping Data Preparation for Modeling because 'processed_df' is not available or empty.")
    X_train, X_test, y_train, y_test, y_test_original = [None]*5

Preparing data for modeling...
Features selected for modeling: ['PTS_EWMA_3', 'PTS_EWMA_5', 'MIN_EWMA_3', 'MIN_EWMA_5', 'FGA_EWMA_3', 'FGA_EWMA_5', 'FTA_EWMA_3', 'FTA_EWMA_5', 'TS%_EWMA_3', 'TS%_EWMA_5', 'Player_USG_Proxy_EWMA_3', 'Player_USG_Proxy_EWMA_5', 'Avg_PTS_Season', 'PTS_Per36_Season', 'Avg_TS%_Season', 'Avg_USG%_Proxy_Season', 'Opponent_DEF_RATING', 'Opponent_PACE', 'Rest_Days', 'Is_B2B_Second_Night', 'Is_Home']
Applying log1p transformation to target variable 'PTS'.
Data prepared for modeling (target transformed). Training set size: 5832, Testing set size: 1458
Original Min/Max PTS in Test Set: 0/49
Transformed Min/Max PTS in Test Set: 0.0000/3.9120


In [10]:
# --- Analyze Distribution and Segment Data ---

# Check if X_train and y_test_original exist and are not None
if 'X_train' in locals() and X_train is not None and 'y_test_original' in locals() and y_test_original is not None:
    print("--- Avg_PTS_Season Distribution (Training Data) ---")
    # Use the original Avg_PTS_Season from X_train which was not log-transformed
    if 'Avg_PTS_Season' in X_train.columns:
        print(X_train['Avg_PTS_Season'].describe())
        
        print("\n--- Sample Counts for Potential Thresholds (Training Data) ---")
        for threshold in [12, 15, 18]:
            count = (X_train['Avg_PTS_Season'] >= threshold).sum()
            print(f"Players >= {threshold} PPG: {count} ({(count / len(X_train) * 100):.1f}%)")
            
        # Define the chosen threshold (using the same threshold as before)
        scorer_threshold = 15
        print(f"\nUsing threshold: Avg_PTS_Season >= {scorer_threshold}")
        
        # Create segmentation masks for TRAIN and TEST sets
        train_scorer_mask = X_train['Avg_PTS_Season'] >= scorer_threshold
        test_scorer_mask = X_test['Avg_PTS_Season'] >= scorer_threshold # Use X_test for the mask
        
        # Segment Training Data (X is features, y is TRANSFORMED target)
        X_train_scorers = X_train[train_scorer_mask]
        y_train_scorers = y_train[train_scorer_mask]
        X_train_role = X_train[~train_scorer_mask]
        y_train_role = y_train[~train_scorer_mask]
        
        # Segment Testing Data (X is features, y is TRANSFORMED target, y_original is ORIGINAL target)
        X_test_scorers = X_test[test_scorer_mask]
        y_test_scorers = y_test[test_scorer_mask]
        y_test_scorers_original = y_test_original[test_scorer_mask] # Segment original target too

        X_test_role = X_test[~test_scorer_mask]
        y_test_role = y_test[~test_scorer_mask]
        y_test_role_original = y_test_original[~test_scorer_mask] # Segment original target too
        
        print("\n--- Data Segmentation Complete ---")
        print(f"Scorers - Train: {len(X_train_scorers)}, Test: {len(X_test_scorers)}")
        print(f"Role Players - Train: {len(X_train_role)}, Test: {len(X_test_role)}")
        
        # Store segment masks for later combined evaluation
        test_scorer_mask_global = test_scorer_mask
        test_role_mask_global = ~test_scorer_mask

    else:
        print("Warning: 'Avg_PTS_Season' not found in X_train. Skipping segmentation.")
        X_train_scorers, y_train_scorers, X_test_scorers, y_test_scorers, y_test_scorers_original = [None]*5
        X_train_role, y_train_role, X_test_role, y_test_role, y_test_role_original = [None]*5
        test_scorer_mask_global = None
        test_role_mask_global = None

else:
    print("Skipping segmentation as training data or original test target is not available.")
    X_train_scorers, y_train_scorers, X_test_scorers, y_test_scorers, y_test_scorers_original = [None]*5
    X_train_role, y_train_role, X_test_role, y_test_role, y_test_role_original = [None]*5
    test_scorer_mask_global = None
    test_role_mask_global = None

--- Avg_PTS_Season Distribution (Training Data) ---
count    5832.000000
mean       10.465235
std         6.941945
min         0.000000
25%         4.877287
50%         9.318665
75%        15.112022
max        36.000000
Name: Avg_PTS_Season, dtype: float64

--- Sample Counts for Potential Thresholds (Training Data) ---
Players >= 12 PPG: 2030 (34.8%)
Players >= 15 PPG: 1479 (25.4%)
Players >= 18 PPG: 985 (16.9%)

Using threshold: Avg_PTS_Season >= 15

--- Data Segmentation Complete ---
Scorers - Train: 1479, Test: 393
Role Players - Train: 4353, Test: 1065


In [11]:
# --- Model Evaluation Function (Modified for Transformed Target) ---

def evaluate_model(model, X_test, y_test_transformed, y_test_original, model_name, X_train_cols=None):
    """Calculates and prints evaluation metrics for a given model, handling transformed targets."""
    print(f"\n--- {model_name} Evaluation ---")
    
    if model is None or X_test is None or y_test_original is None or X_test.empty:
        print(f"Skipping {model_name} evaluation as model was not trained or test data was missing.")
        return
        
    try:
        # Predict on the transformed scale
        y_pred_transformed = model.predict(X_test)

        # Transform predictions back to the original scale
        y_pred_original = np.expm1(y_pred_transformed)
        y_pred_original[y_pred_original < 0] = 0 

        # --- Basic Error Analysis (using original scale) ---
        if X_test.index.equals(y_test_original.index):
             X_test_results = X_test.copy()
             X_test_results['Actual_PTS'] = y_test_original
             X_test_results['Predicted_PTS'] = y_pred_original
             X_test_results['Error'] = X_test_results['Actual_PTS'] - X_test_results['Predicted_PTS']
             X_test_results['Abs_Error'] = np.abs(X_test_results['Error'])
        else:
             print("Warning: X_test index does not match y_test_original index. Skipping detailed error analysis table.")
             X_test_results = pd.DataFrame({'Actual_PTS': y_test_original, 'Predicted_PTS': y_pred_original})
             X_test_results['Error'] = X_test_results['Actual_PTS'] - X_test_results['Predicted_PTS']
             X_test_results['Abs_Error'] = np.abs(X_test_results['Error'])

        # --- Calculate Metrics (using original scale) ---
        mae = mean_absolute_error(y_test_original, y_pred_original)
        mse = mean_squared_error(y_test_original, y_pred_original) 
        rmse = np.sqrt(mse) 
        
        non_zero_mask = y_test_original != 0
        if np.any(non_zero_mask):
            mape = mean_absolute_percentage_error(y_test_original[non_zero_mask], y_pred_original[non_zero_mask])
            mape_note = f"(excluding { (~non_zero_mask).sum() } games with 0 actual points)"
        else:
            mape = np.nan
            mape_note = "(MAPE not calculable)"
        
        within_3_pts = X_test_results['Abs_Error'] <= 3
        within_3_pts_accuracy = within_3_pts.mean() * 100
        
        # --- Print Metrics ---
        print(f"Mean Absolute Error (MAE): {mae:.2f}")
        print(f"Root Mean Squared Error (RMSE): {rmse:.2f}") 
        print(f"Mean Absolute Percentage Error (MAPE): {mape:.2%} {mape_note}")
        print(f"Accuracy (within +/- 3 points): {within_3_pts_accuracy:.2f}%")
        
        # --- Print Sample Predictions and Errors ---
        print(f"\nSample Predictions vs Actual ({model_name}):")
        print(X_test_results.head())
        
        print("\nLargest Errors (Top 5):")
        print(X_test_results.sort_values(by='Abs_Error', ascending=False).head())
        
        # Optional: Feature Importances (Specific to tree-based models)
        if hasattr(model, 'feature_importances_') and X_train_cols is not None:
            print(f"\nFeature Importances ({model_name}):")
            importance_features = X_train_cols if isinstance(X_train_cols, pd.Index) else pd.Index(X_train_cols)
            if len(importance_features) == len(model.feature_importances_):
                importances = pd.DataFrame({
                    'Feature': importance_features,
                    'Importance': model.feature_importances_
                }).sort_values(by='Importance', ascending=False)
                print(importances)
            else:
                print("Warning: Feature count mismatch between X_train_cols and model importances. Cannot display importances.")

    except Exception as e:
        print(f"Error during {model_name} evaluation: {e}")
        traceback.print_exc()

In [12]:
# --- Function to Train, Evaluate, and Save Models for a Segment ---

def train_evaluate_segment(X_train_seg, y_train_seg_transformed, X_test_seg, y_test_seg_original, segment_label):
    """Tunes, trains, evaluates, and saves Ridge, XGBoost, and LightGBM models for a data segment."""
    global MODEL_DIR # Use the global model directory variable
    print(f"\n{'='*20} Processing Segment: {segment_label} {'='*20}")
    
    if X_train_seg is None or y_train_seg_transformed is None or X_train_seg.empty or len(X_train_seg) < 10:
        print(f"Skipping {segment_label} segment due to insufficient training data ({len(X_train_seg) if X_train_seg is not None else 0} samples).")
        return None, None, None
        
    model_ridge_seg = None
    model_xgb_seg = None
    model_lgbm_seg = None
    
    # --- Ridge --- 
    print(f"\n--- Tuning Ridge Regression Model ({segment_label}) ---")
    param_grid_ridge = {'alpha': [0.1, 1.0, 10.0, 100.0]}
    ridge_estimator = Ridge()
    tscv = TimeSeriesSplit(n_splits=5)
    grid_search_ridge = GridSearchCV(ridge_estimator, param_grid_ridge, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1)
    try:
        grid_search_ridge.fit(X_train_seg, y_train_seg_transformed)
        model_ridge_seg = grid_search_ridge.best_estimator_
        print(f"Best alpha found for Ridge ({segment_label}): {grid_search_ridge.best_params_['alpha']}")
        print(f"Tuned Ridge Regression Model training complete ({segment_label}).")
        save_path = os.path.join(MODEL_DIR, f'ridge_model_{segment_label.lower().replace(" ", "_")}.pkl')
        joblib.dump(model_ridge_seg, save_path)
        print(f"Ridge model saved for {segment_label} to {save_path}.")
    except Exception as e:
        print(f"Error during Ridge GridSearchCV ({segment_label}): {e}")
        traceback.print_exc()
        model_ridge_seg = None
    evaluate_model(model_ridge_seg, X_test_seg, y_test_seg_original, y_test_seg_original, f"Tuned Ridge ({segment_label})")
    
    # --- XGBoost ---
    print(f"\n--- Tuning XGBoost Model ({segment_label}) ---")
    param_grid_xgb = {
        'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0], 'colsample_bytree': [0.8, 1.0], 'reg_alpha': [0, 0.1], 'reg_lambda': [1.0]
    }
    if len(X_train_seg) > 500:
        param_grid_xgb['n_estimators'] = [100, 200, 300]
        param_grid_xgb['max_depth'] = [3, 5, 7]
        param_grid_xgb['learning_rate'] = [0.01, 0.05, 0.1]
        param_grid_xgb['subsample'] = [0.7, 0.8, 0.9, 1.0]
        param_grid_xgb['colsample_bytree'] = [0.7, 0.8, 0.9, 1.0]
        param_grid_xgb['reg_alpha'] = [0, 0.1, 0.5]
        param_grid_xgb['reg_lambda'] = [0.5, 1.0, 1.5]
    xgb_estimator = XGBRegressor(random_state=42, objective='reg:squarederror', tree_method='hist')
    grid_search_xgb = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid_xgb, scoring='neg_mean_absolute_error', cv=tscv, n_jobs=-1, verbose=0)
    try:
        print(f"Starting XGBoost GridSearchCV ({segment_label})...")
        grid_search_xgb.fit(X_train_seg, y_train_seg_transformed)
        model_xgb_seg = grid_search_xgb.best_estimator_
        print(f"Best parameters found for XGBoost ({segment_label}): {grid_search_xgb.best_params_}")
        print(f"Best MAE score during CV ({segment_label}): {-grid_search_xgb.best_score_:.2f}")
        print(f"Tuned XGBoost Model training complete ({segment_label}).")
        save_path = os.path.join(MODEL_DIR, f'xgb_model_{segment_label.lower().replace(" ", "_")}.pkl')
        joblib.dump(model_xgb_seg, save_path)
        print(f"XGBoost model saved for {segment_label} to {save_path}.")
    except Exception as e:
        print(f"Error during XGBoost GridSearchCV ({segment_label}): {e}")
        traceback.print_exc()
        model_xgb_seg = None
    evaluate_model(model_xgb_seg, X_test_seg, y_test_seg_original, y_test_seg_original, f"Tuned XGBoost ({segment_label})", X_train_cols=X_train_seg.columns)
    
    # --- LightGBM ---
    print(f"\n--- Tuning LightGBM Model ({segment_label}) ---")
    param_grid_lgbm = {
        'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0], 'colsample_bytree': [0.8, 1.0], 'reg_alpha': [0, 0.1], 'reg_lambda': [1.0]
    }
    if len(X_train_seg) > 500:
        param_grid_lgbm['n_estimators'] = [100, 200, 300]
        param_grid_lgbm['max_depth'] = [3, 5, 7]
        param_grid_lgbm['learning_rate'] = [0.01, 0.05, 0.1]
        param_grid_lgbm['subsample'] = [0.7, 0.8, 0.9, 1.0]
        param_grid_lgbm['colsample_bytree'] = [0.7, 0.8, 0.9, 1.0]
        param_grid_lgbm['reg_alpha'] = [0, 0.1, 0.5]
        param_grid_lgbm['reg_lambda'] = [0.5, 1.0, 1.5]
    lgbm_estimator = lgb.LGBMRegressor(random_state=42, objective='regression_l2')
    grid_search_lgbm = GridSearchCV(estimator=lgbm_estimator, param_grid=param_grid_lgbm, scoring='neg_mean_absolute_error', cv=tscv, n_jobs=-1, verbose=0)
    try:
        print(f"Starting LightGBM GridSearchCV ({segment_label})...")
        grid_search_lgbm.fit(X_train_seg, y_train_seg_transformed)
        model_lgbm_seg = grid_search_lgbm.best_estimator_
        print(f"Best parameters found for LightGBM ({segment_label}): {grid_search_lgbm.best_params_}")
        print(f"Best MAE score during CV ({segment_label}): {-grid_search_lgbm.best_score_:.2f}")
        print(f"Tuned LightGBM Model training complete ({segment_label}).")
        save_path = os.path.join(MODEL_DIR, f'lgbm_model_{segment_label.lower().replace(" ", "_")}.pkl')
        joblib.dump(model_lgbm_seg, save_path)
        print(f"LightGBM model saved for {segment_label} to {save_path}.")
    except Exception as e:
        print(f"Error during LightGBM GridSearchCV ({segment_label}): {e}")
        traceback.print_exc()
        model_lgbm_seg = None
    evaluate_model(model_lgbm_seg, X_test_seg, y_test_seg_original, y_test_seg_original, f"Tuned LightGBM ({segment_label})", X_train_cols=X_train_seg.columns)
    
    print(f"\n{'='*20} Finished Segment: {segment_label} {'='*20}")
    return model_ridge_seg, model_xgb_seg, model_lgbm_seg

# Ensure segmented variables are initialized
if 'X_train_scorers' not in locals():
     X_train_scorers, y_train_scorers, X_test_scorers, y_test_scorers, y_test_scorers_original = [None]*5
     X_train_role, y_train_role, X_test_role, y_test_role, y_test_role_original = [None]*5
     test_scorer_mask_global = None
     test_role_mask_global = None

In [13]:
# --- Model Building & Tuning (Ridge Regression - Baseline) ---
print("\n === BASELINE MODEL (ALL DATA) ===")
model_ridge_baseline = None
if 'X_train' in locals() and X_train is not None and 'y_train' in locals() and y_train is not None and not X_train.empty:
    print("Tuning Ridge Regression Model (Baseline)...")
    param_grid_ridge = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]}
    ridge_estimator = Ridge()
    tscv = TimeSeriesSplit(n_splits=5)
    grid_search_ridge = GridSearchCV(ridge_estimator, param_grid_ridge, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1)
    try:
        grid_search_ridge.fit(X_train, y_train)
        model_ridge_baseline = grid_search_ridge.best_estimator_
        print(f"Best alpha found for Ridge (Baseline): {grid_search_ridge.best_params_['alpha']}")
        print("Tuned Ridge Regression Model training complete (Baseline).")
        save_path = os.path.join(MODEL_DIR, 'ridge_model_baseline.pkl')
        joblib.dump(model_ridge_baseline, save_path)
        print(f"Baseline Ridge model saved to {save_path}.")
    except Exception as e:
        print(f"Error during Ridge GridSearchCV (Baseline): {e}")
        traceback.print_exc()
        model_ridge_baseline = None
else:
    print("Skipping Ridge tuning due to lack of training data.")


 === BASELINE MODEL (ALL DATA) ===
Tuning Ridge Regression Model (Baseline)...
Best alpha found for Ridge (Baseline): 0.01
Tuned Ridge Regression Model training complete (Baseline).
Baseline Ridge model saved to models/2024-25\ridge_model_baseline.pkl.


In [14]:
# --- Model Evaluation (Tuned Ridge Regression - Baseline) ---
if 'model_ridge_baseline' in locals() and model_ridge_baseline is not None:
    evaluate_model(model_ridge_baseline, X_test, y_test, y_test_original, "Tuned Ridge (Baseline)") 
else:
    print("Skipping Baseline Ridge evaluation as model was not trained.")


--- Tuned Ridge (Baseline) Evaluation ---
Mean Absolute Error (MAE): 5.01
Root Mean Squared Error (RMSE): 6.87
Mean Absolute Percentage Error (MAPE): 50.14% (excluding 145 games with 0 actual points)
Accuracy (within +/- 3 points): 43.42%

Sample Predictions vs Actual (Tuned Ridge (Baseline)):
      PTS_EWMA_3  PTS_EWMA_5  MIN_EWMA_3  MIN_EWMA_5  FGA_EWMA_3  FGA_EWMA_5  \
1056   20.392953   18.781173   33.005499   31.911444   14.186562   13.632315   
5907   21.141338   16.250644   27.370876   21.757188   12.919287   10.250668   
866    16.532715   17.091862   32.554688   33.324296   11.227539   12.216747   
3613   31.656860   27.842186   35.452789   34.666517   18.266296   16.386177   
3035    6.310892    6.759531   14.724441   15.936651    4.650916    5.245164   

      FTA_EWMA_3  FTA_EWMA_5  TS%_EWMA_3  TS%_EWMA_5  ...  \
1056    7.878767    6.073409    0.573870    0.568364  ...   
5907    1.067596    0.968590    0.616145    0.533709  ...   
866     3.194336    3.075880    0.665997

In [15]:
# --- Model Building & Tuning (XGBoost - Baseline) ---
model_xgb_baseline = None
if 'X_train' in locals() and X_train is not None and 'y_train' in locals() and y_train is not None and not X_train.empty:
    print("\n--- Tuning XGBoost Model (Baseline) ---")
    param_grid_xgb = {
        'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 0.8, 0.9, 1.0], 'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5], 'reg_lambda': [0.5, 1.0, 1.5]
    }
    xgb_estimator = XGBRegressor(random_state=42, objective='reg:squarederror', tree_method='hist')
    tscv = TimeSeriesSplit(n_splits=5)
    grid_search_xgb = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid_xgb, scoring='neg_mean_absolute_error', cv=tscv, n_jobs=-1, verbose=1)
    try:
        print("Starting XGBoost GridSearchCV (this may take a while)...")
        grid_search_xgb.fit(X_train, y_train)
        model_xgb_baseline = grid_search_xgb.best_estimator_
        print(f"\nBest parameters found for XGBoost (Baseline): {grid_search_xgb.best_params_}")
        print(f"Best MAE score during CV (Baseline): {-grid_search_xgb.best_score_:.2f}")
        print("Tuned XGBoost Model training complete (Baseline).")
        save_path = os.path.join(MODEL_DIR, 'xgb_model_baseline.pkl')
        joblib.dump(model_xgb_baseline, save_path)
        print(f"Baseline XGBoost model saved to {save_path}.")
    except Exception as e:
        print(f"Error during XGBoost GridSearchCV (Baseline): {e}")
        traceback.print_exc()
        model_xgb_baseline = None
else:
    print("Skipping XGBoost tuning due to lack of training data.")


--- Tuning XGBoost Model (Baseline) ---
Starting XGBoost GridSearchCV (this may take a while)...
Fitting 5 folds for each of 3888 candidates, totalling 19440 fits

Best parameters found for XGBoost (Baseline): {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 1.0, 'subsample': 0.7}
Best MAE score during CV (Baseline): 0.56
Tuned XGBoost Model training complete (Baseline).
Baseline XGBoost model saved to models/2024-25\xgb_model_baseline.pkl.


In [16]:
# --- Model Evaluation (Tuned XGBoost - Baseline) ---
train_cols = X_train.columns if X_train is not None else None
if 'model_xgb_baseline' in locals() and model_xgb_baseline is not None:
    evaluate_model(model_xgb_baseline, X_test, y_test, y_test_original, "Tuned XGBoost (Baseline)", X_train_cols=train_cols) 
else:
    print("Skipping Baseline XGBoost evaluation as model was not trained.")


--- Tuned XGBoost (Baseline) Evaluation ---
Mean Absolute Error (MAE): 4.82
Root Mean Squared Error (RMSE): 6.60
Mean Absolute Percentage Error (MAPE): 49.69% (excluding 145 games with 0 actual points)
Accuracy (within +/- 3 points): 45.68%

Sample Predictions vs Actual (Tuned XGBoost (Baseline)):
      PTS_EWMA_3  PTS_EWMA_5  MIN_EWMA_3  MIN_EWMA_5  FGA_EWMA_3  FGA_EWMA_5  \
1056   20.392953   18.781173   33.005499   31.911444   14.186562   13.632315   
5907   21.141338   16.250644   27.370876   21.757188   12.919287   10.250668   
866    16.532715   17.091862   32.554688   33.324296   11.227539   12.216747   
3613   31.656860   27.842186   35.452789   34.666517   18.266296   16.386177   
3035    6.310892    6.759531   14.724441   15.936651    4.650916    5.245164   

      FTA_EWMA_3  FTA_EWMA_5  TS%_EWMA_3  TS%_EWMA_5  ...  \
1056    7.878767    6.073409    0.573870    0.568364  ...   
5907    1.067596    0.968590    0.616145    0.533709  ...   
866     3.194336    3.075880    0.66

In [17]:
# --- Model Building & Tuning (LightGBM - Baseline) ---
model_lgbm_baseline = None
if 'X_train' in locals() and X_train is not None and 'y_train' in locals() and y_train is not None and not X_train.empty:
    print("\n--- Tuning LightGBM Model (Baseline) ---")
    param_grid_lgbm = {
        'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 0.8, 0.9, 1.0], 'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5], 'reg_lambda': [0.5, 1.0, 1.5]
    }
    lgbm_estimator = lgb.LGBMRegressor(random_state=42, objective='regression_l2')
    tscv = TimeSeriesSplit(n_splits=5)
    grid_search_lgbm = GridSearchCV(estimator=lgbm_estimator, param_grid=param_grid_lgbm, scoring='neg_mean_absolute_error', cv=tscv, n_jobs=-1, verbose=1)
    try:
        print("Starting LightGBM GridSearchCV (this may take a while)...")
        grid_search_lgbm.fit(X_train, y_train)
        model_lgbm_baseline = grid_search_lgbm.best_estimator_
        print(f"\nBest parameters found for LightGBM (Baseline): {grid_search_lgbm.best_params_}")
        print(f"Best MAE score during CV (Baseline): {-grid_search_lgbm.best_score_:.2f}")
        print("Tuned LightGBM Model training complete (Baseline).")
        save_path = os.path.join(MODEL_DIR, 'lgbm_model_baseline.pkl')
        joblib.dump(model_lgbm_baseline, save_path)
        print(f"Baseline LightGBM model saved to {save_path}.")
    except Exception as e:
        print(f"Error during LightGBM GridSearchCV (Baseline): {e}")
        traceback.print_exc()
        model_lgbm_baseline = None
else:
    print("Skipping LightGBM tuning due to lack of training data.")


--- Tuning LightGBM Model (Baseline) ---
Starting LightGBM GridSearchCV (this may take a while)...
Fitting 5 folds for each of 3888 candidates, totalling 19440 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4152
[LightGBM] [Info] Number of data points in the train set: 5832, number of used features: 20
[LightGBM] [Info] Start training from score 2.132275

Best parameters found for LightGBM (Baseline): {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.7}
Best MAE score during CV (Baseline): 0.55
Tuned LightGBM Model training complete (Baseline).
Baseline LightGBM model saved to models/2024-25\lgbm_model_baseline.pkl.


In [18]:
# --- Model Evaluation (Tuned LightGBM - Baseline) ---
train_cols = X_train.columns if X_train is not None else None
if 'model_lgbm_baseline' in locals() and model_lgbm_baseline is not None:
    evaluate_model(model_lgbm_baseline, X_test, y_test, y_test_original, "Tuned LightGBM (Baseline)", X_train_cols=train_cols) 
else:
    print("Skipping Baseline LightGBM evaluation as model was not trained.")


--- Tuned LightGBM (Baseline) Evaluation ---
Mean Absolute Error (MAE): 4.82
Root Mean Squared Error (RMSE): 6.61
Mean Absolute Percentage Error (MAPE): 49.63% (excluding 145 games with 0 actual points)
Accuracy (within +/- 3 points): 44.65%

Sample Predictions vs Actual (Tuned LightGBM (Baseline)):
      PTS_EWMA_3  PTS_EWMA_5  MIN_EWMA_3  MIN_EWMA_5  FGA_EWMA_3  FGA_EWMA_5  \
1056   20.392953   18.781173   33.005499   31.911444   14.186562   13.632315   
5907   21.141338   16.250644   27.370876   21.757188   12.919287   10.250668   
866    16.532715   17.091862   32.554688   33.324296   11.227539   12.216747   
3613   31.656860   27.842186   35.452789   34.666517   18.266296   16.386177   
3035    6.310892    6.759531   14.724441   15.936651    4.650916    5.245164   

      FTA_EWMA_3  FTA_EWMA_5  TS%_EWMA_3  TS%_EWMA_5  ...  \
1056    7.878767    6.073409    0.573870    0.568364  ...   
5907    1.067596    0.968590    0.616145    0.533709  ...   
866     3.194336    3.075880    0.

In [19]:
# --- Train and Evaluate Segmented Models ---

if 'X_train_scorers' in locals() and X_train_scorers is not None and not X_train_scorers.empty and \
   'X_train_role' in locals() and X_train_role is not None and not X_train_role.empty:
    
    # Train/Evaluate Scorers Segment
    model_ridge_scorers, model_xgb_scorers, model_lgbm_scorers = train_evaluate_segment(
        X_train_scorers, y_train_scorers, X_test_scorers, y_test_scorers_original, "Scorers"
    )
    
    # Train/Evaluate Role Players Segment
    model_ridge_role, model_xgb_role, model_lgbm_role = train_evaluate_segment(
        X_train_role, y_train_role, X_test_role, y_test_role_original, "Role Players"
    )

else:
    print("Skipping segmented model training as segmentation did not run successfully or segments are empty.")
    model_ridge_scorers, model_xgb_scorers, model_lgbm_scorers = [None]*3
    model_ridge_role, model_xgb_role, model_lgbm_role = [None]*3



--- Tuning Ridge Regression Model (Scorers) ---
Best alpha found for Ridge (Scorers): 0.1
Tuned Ridge Regression Model training complete (Scorers).
Ridge model saved for Scorers to models/2024-25\ridge_model_scorers.pkl.

--- Tuned Ridge (Scorers) Evaluation ---
Mean Absolute Error (MAE): 6.52
Root Mean Squared Error (RMSE): 8.29
Mean Absolute Percentage Error (MAPE): 42.43% (excluding 1 games with 0 actual points)
Accuracy (within +/- 3 points): 31.55%

Sample Predictions vs Actual (Tuned Ridge (Scorers)):
      PTS_EWMA_3  PTS_EWMA_5  MIN_EWMA_3  MIN_EWMA_5  FGA_EWMA_3  FGA_EWMA_5  \
1056   20.392953   18.781173   33.005499   31.911444   14.186562   13.632315   
866    16.532715   17.091862   32.554688   33.324296   11.227539   12.216747   
3613   31.656860   27.842186   35.452789   34.666517   18.266296   16.386177   
4727   21.397888   22.051085   34.510620   35.595225   19.166687   19.985165   
2168   26.069102   26.226446   32.327177   31.900167   18.334556   18.841405   

    

In [27]:
# --- Combined Segmented Model Evaluation (Hybrid: Role Player Ridge + Baseline XGBoost) ---

print(f"\n--- Evaluating Combined HYBRID Model ({SEASON}) ---")
print("--- (Role Player Ridge + Baseline XGBoost) ---")

if 'X_test' not in locals() or X_test is None or X_test.empty or \
   'y_test_original' not in locals() or y_test_original is None or y_test_original.empty or \
   'test_scorer_mask_global' not in locals() or test_scorer_mask_global is None:

    print("Error: Essential test data or segmentation masks not found. Please ensure previous cells have run successfully.")

else:
    try:
        # --- Load the required models ---
        print("Loading required models...")
        model_role_path = os.path.join(MODEL_DIR, 'ridge_model_role_players.pkl')
        model_scorer_path = os.path.join(MODEL_DIR, 'xgb_model_baseline.pkl') 
        
        model_role_loaded = joblib.load(model_role_path)
        model_scorer_loaded = joblib.load(model_scorer_path) # Load BASELINE XGB for scorers
        print("Required models loaded successfully.")

        # --- Generate Combined Predictions ---
        print("Generating combined HYBRID predictions...")
        scorer_mask = test_scorer_mask_global
        role_mask = test_role_mask_global
        n_scorers_test = scorer_mask.sum()
        n_role_test = role_mask.sum()
        print(f"Test Set Segmentation (using masks): Scorers = {n_scorers_test}, Role Players = {n_role_test}")

        if n_scorers_test == 0 and n_role_test == 0:
             print("Error: Both test segments are empty based on the mask. Cannot proceed.")
        else:
            y_pred_combined_transformed = pd.Series(np.zeros(len(y_test_original), dtype=float), index=y_test_original.index)
            if n_scorers_test > 0:
                 X_test_scorers_segment = X_test[scorer_mask]
                 y_pred_combined_transformed[scorer_mask] = model_scorer_loaded.predict(X_test_scorers_segment)
            if n_role_test > 0:
                 X_test_role_segment = X_test[role_mask]
                 y_pred_combined_transformed[role_mask] = model_role_loaded.predict(X_test_role_segment)
            print("Combined transformed HYBRID predictions generated.")

            # --- Transform Combined Predictions Back to Original Scale ---
            y_pred_combined_original = np.expm1(y_pred_combined_transformed)
            y_pred_combined_original[y_pred_combined_original < 0] = 0
            print("Combined HYBRID predictions transformed back to original scale.")

            # --- Calculate Combined Metrics Directly (using original scale) ---
            print("Calculating combined HYBRID metrics...")
            combined_mae = mean_absolute_error(y_test_original, y_pred_combined_original)
            combined_mse = mean_squared_error(y_test_original, y_pred_combined_original)
            combined_rmse = np.sqrt(combined_mse)
            non_zero_mask_combined = y_test_original != 0
            if np.any(non_zero_mask_combined):
                combined_mape = mean_absolute_percentage_error(y_test_original[non_zero_mask_combined], y_pred_combined_original[non_zero_mask_combined])
                combined_mape_note = f"(excluding {(~non_zero_mask_combined).sum()} games with 0 actual points)"
            else:
                combined_mape = np.nan 
                combined_mape_note = "(MAPE not calculable)"
            combined_within_3_pts_accuracy = (np.abs(y_test_original - y_pred_combined_original) <= 3).mean() * 100
            
            # --- Print Comparison ---
            print(f"\n--- METRICS COMPARISON ({SEASON}) ---")
            print("\nCombined HYBRID Model (Role Ridge + Baseline XGB):")
            print(f"  Mean Absolute Error (MAE): {combined_mae:.2f}")
            print(f"  Root Mean Squared Error (RMSE): {combined_rmse:.2f}")
            print(f"  Mean Absolute Percentage Error (MAPE): {combined_mape:.2%} {combined_mape_note}")
            print(f"  Accuracy (within +/- 3 points): {combined_within_3_pts_accuracy:.2f}%")

            # Re-evaluate Baseline XGBoost Model for direct comparison
            print("\nBaseline XGBoost Model (Re-evaluated on original scale):") 
            if 'model_xgb_baseline' in locals() and model_xgb_baseline is not None:
                 evaluate_model(model_xgb_baseline, X_test, y_test, y_test_original, "Baseline XGB (for comparison)")
            else:
                 print("  Baseline XGBoost model ('model_xgb_baseline') not found in memory. Cannot re-evaluate.")
                 # Add fallback load if needed:
                 # try:
                 #    model_xgb_baseline_loaded = joblib.load(os.path.join(MODEL_DIR, 'xgb_model_baseline.pkl')) 
                 #    evaluate_model(model_xgb_baseline_loaded, X_test, y_test, y_test_original, "Baseline XGB (Loaded for comparison)")
                 # except FileNotFoundError:
                 #    print(f"  Could not load '{os.path.join(MODEL_DIR, 'xgb_model_baseline.pkl')}' for comparison.")

    except FileNotFoundError as e:
        print(f"Error loading required model for hybrid evaluation: {e}.")
        traceback.print_exc()
    except Exception as e:
        print(f"An unexpected error occurred during combined hybrid evaluation: {e}")
        traceback.print_exc()



--- Evaluating Combined HYBRID Model (2024-25) ---
--- (Role Player Ridge + Baseline XGBoost) ---
Loading required models...
Required models loaded successfully.
Generating combined HYBRID predictions...
Test Set Segmentation (using masks): Scorers = 393, Role Players = 1065
Combined transformed HYBRID predictions generated.
Combined HYBRID predictions transformed back to original scale.
Calculating combined HYBRID metrics...

--- METRICS COMPARISON (2024-25) ---

Combined HYBRID Model (Role Ridge + Baseline XGB):
  Mean Absolute Error (MAE): 4.86
  Root Mean Squared Error (RMSE): 6.65
  Mean Absolute Percentage Error (MAPE): 49.31% (excluding 145 games with 0 actual points)
  Accuracy (within +/- 3 points): 45.34%

Baseline XGBoost Model (Re-evaluated on original scale):

--- Baseline XGB (for comparison) Evaluation ---
Mean Absolute Error (MAE): 4.82
Root Mean Squared Error (RMSE): 6.60
Mean Absolute Percentage Error (MAPE): 49.69% (excluding 145 games with 0 actual points)
Accuracy