In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from nba_api.stats.static import players
from nba_api.stats.static import teams
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import BoxScoreAdvancedV2, BoxScoreTraditionalV2, BoxScoreSummaryV2
from nba_api.stats.endpoints import LeagueGameLog, PlayByPlayV2, ScoreboardV2

In [16]:

with open('checkpoints/interim_results.pkl', 'rb') as f:
    data = pickle.load(f)

data = pd.DataFrame(data)

# print(data.columns)

playoff_dates = {'2024-25': '2025-04-19',
                 '2023-24': '2024-04-20',
                 '2022-23': '2023-04-15',
                 '2021-22': '2022-04-16',
                 '2020-21': '2021-05-22',
                 '2019-20': '2020-08-17'
               }


data['playoff'] = data.apply(
    lambda x: 1 if str(x['game_date']) >= playoff_dates[x['season']] else 0, 
    axis = 1
)

In [26]:

games_df = LeagueGameLog(season='2022-23').get_data_frames()[0]
games_df.columns

game_summary = BoxScoreSummaryV2(season='2022-23').get_data_frames()
line_game = game_summary[1]

# advanced_stats = BoxScoreAdvancedV2(game_id='0022201228').get_data_frames()[0]

TypeError: BoxScoreSummaryV2.__init__() got an unexpected keyword argument 'season'

In [11]:
from nba_api.stats.endpoints import LeagueGameLog, BoxScoreSummaryV2, BoxScoreTraditionalV2
import pandas as pd
import numpy as np
from datetime import datetime
import time

def get_simple_basketball_watch_index(season, num_games=None):
    """
    Create a basketball watch index using only the most reliable data sources.
    Uses LeagueGameLog as the primary data source which has the scores.
    
    Parameters:
    ----------
    season : str
        Season in format 'YYYY-YY' (e.g., '2022-23')
    num_games : int
        Number of games to analyze
        
    Returns:
    -------
    pd.DataFrame
        DataFrame with watch index and component metrics
    """
    print(f"Getting game data for {season} season...")
    # Get all games for the season
    games_df = LeagueGameLog(season=season).get_data_frames()[0]
    
    # Process the data to identify unique games
    games_df['GAME_DATE_DT'] = pd.to_datetime(games_df['GAME_DATE'])
    games_df['IS_HOME'] = games_df['MATCHUP'].str.contains('vs')
    
    # Create a unique game identifier
    games_df['UNIQUE_GAME'] = games_df['GAME_ID'] + "_" + games_df['TEAM_ID'].astype(str)
    
    # Get the most recent games first
    games_df = games_df.sort_values('GAME_DATE_DT', ascending=False)
    
    # Get unique game IDs (each game appears twice in the data, once for each team)
    unique_game_ids = games_df['GAME_ID'].unique()
    
    # Limit to the specified number of games
    if num_games and num_games < len(unique_game_ids):
        unique_game_ids = unique_game_ids[:num_games]
    
    results = []
    
    for i, game_id in enumerate(unique_game_ids):
        try:
            print(f"Processing game {i+1}/{len(unique_game_ids)}: {game_id}")
            
            # Get the rows for this game (home and away)
            game_rows = games_df[games_df['GAME_ID'] == game_id]
            
            if len(game_rows) != 2:
                print(f"Unexpected number of rows for game {game_id}: {len(game_rows)}")
                continue
            
            # Separate home and away teams
            home_row = game_rows[game_rows['IS_HOME']].iloc[0] if any(game_rows['IS_HOME']) else None
            away_row = game_rows[~game_rows['IS_HOME']].iloc[0] if any(~game_rows['IS_HOME']) else None
            
            # If we can't identify home/away, skip this game
            if home_row is None or away_row is None:
                print(f"Could not identify home/away teams for game {game_id}")
                continue
            
            # Extract basic game info
            home_team = home_row['TEAM_ABBREVIATION']
            away_team = away_row['TEAM_ABBREVIATION']
            home_score = int(home_row['PTS'])
            away_score = int(away_row['PTS'])
            game_date = home_row['GAME_DATE']
            
            # Try to get additional data from BoxScoreSummary
            line_score = None
            try:
                summary = BoxScoreSummaryV2(game_id=game_id).get_data_frames()
                if len(summary) >= 2:
                    line_score = summary[1]
            except Exception as e:
                print(f"Could not get box score summary: {e}")
            
            # Initialize metrics with default values
            lead_changes = 0
            times_tied = 0
            largest_lead = 0
            fast_break_pts = 0
            paint_pts = 0
            to_pts = 0
            
            # Extract metrics from line_score if available
            if line_score is not None:
                try:
                    lead_changes = line_score['LEAD_CHANGES'].iloc[0] if 'LEAD_CHANGES' in line_score.columns else 0
                    times_tied = line_score['TIMES_TIED'].iloc[0] if 'TIMES_TIED' in line_score.columns else 0
                    largest_lead = line_score['LARGEST_LEAD'].max() if 'LARGEST_LEAD' in line_score.columns else 0
                    fast_break_pts = line_score['PTS_FB'].sum() if 'PTS_FB' in line_score.columns else 0
                    paint_pts = line_score['PTS_PAINT'].sum() if 'PTS_PAINT' in line_score.columns else 0
                    to_pts = line_score['PTS_OFF_TO'].sum() if 'PTS_OFF_TO' in line_score.columns else 0
                except Exception as e:
                    print(f"Could not extract lead changes from line_score: {e}")


            # Avoid hitting API rate limits
            time.sleep(0.6)
            
            # Try to get advanced stats on the top player
            advanced_stats = None
            try:
                advanced_stats = BoxScoreAdvancedV2(game_id=game_id).get_data_frames()[0]
            except Exception as e:
                print(f"Could not get advanced stats: {e}")
            
            # Initialize additional metrics with default values
            star = 0
            
            # Extract metrics from advanced_stats if available
            if advanced_stats is not None:
                try:
                    advanced_stats['Star'] = advanced_stats['USG_PCT'] * advanced_stats['TS_PCT']
                    star = advanced_stats['Star'].sum() if 'Star' in advanced_stats.columns else 0
                except Exception as e:
                    print(f"Could not extract metrics from advanced_stats: {e}")

            
            # Avoid hitting API rate limits
            time.sleep(0.6)

            # Try to get traditional stats for 3-pointers and other stats
            traditional_stats = None
            try:
                traditional_stats = BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[0]
            except Exception as e:
                print(f"Could not get traditional stats: {e}")
            
            # Initialize additional metrics with default values
            threes_made = 0
            threes_attempted = 0
            steals = 0
            blocks = 0
            
            # Extract metrics from traditional_stats if available
            if traditional_stats is not None:
                try:
                    threes_made = traditional_stats['FG3M'].sum() if 'FG3M' in traditional_stats.columns else 0
                    threes_attempted = traditional_stats['FG3A'].sum() if 'FG3A' in traditional_stats.columns else 0
                    steals = traditional_stats['STL'].sum() if 'STL' in traditional_stats.columns else 0
                    blocks = traditional_stats['BLK'].sum() if 'BLK' in traditional_stats.columns else 0
                except Exception as e:
                    print(f"Could not extract metrics from traditional_stats: {e}")
            
            # Calculate derived metrics
            total_score = home_score + away_score
            score_diff = abs(home_score - away_score)
            closeness = 1 - (score_diff / total_score if total_score > 0 else 0)
            three_pt_pct = threes_made / threes_attempted if threes_attempted > 0 else 0
            
            # Determine if it was a close game at the end
            clutch_ending = 1 if score_diff <= 5 else 0
            
            # Determine if it went to overtime
            # Check both teams' matchup info for OT indicator
            overtime = 1 if "OT" in home_row['MATCHUP'] or "OT" in away_row['MATCHUP'] else 0
            
            # Create game data dictionary
            game_data = {
                'game_id': game_id,
                'game_date': game_date,
                'home_team': home_team,
                'away_team': away_team,
                'home_score': home_score,
                'away_score': away_score,
                'total_score': total_score,
                'score_diff': score_diff,
                'closeness': closeness,
                'lead_changes': lead_changes,
                'times_tied': times_tied,
                'largest_lead': largest_lead,
                'threes_made': threes_made,
                'three_pt_pct': three_pt_pct,
                'fast_break_pts': fast_break_pts,
                'paint_pts': paint_pts,
                'to_pts': to_pts,
                'star': star,
                'steals': steals,
                'blocks': blocks,
                'clutch_ending': clutch_ending,
                'overtime': overtime
            }
            
            results.append(game_data)
            
            # Avoid hitting API rate limits
            time.sleep(1)
            
        except Exception as e:
            print(f"Error processing game {game_id if 'game_id' in locals() else 'unknown'}: {e}")
    
    # Create DataFrame from results
    df = pd.DataFrame(results)
    
    if len(df) == 0:
        print("No valid games found.")
        return df
    
    # Calculate percentile ranks for key metrics
    rank_columns = [
        'total_score', 'closeness', 'lead_changes', 'times_tied',
        'threes_made', 'three_pt_pct', 'steals', 'blocks',
        'clutch_ending', 'overtime', 'star', 'fast_break_pts', 'paint_pts', 'to_pts'
    ]
    
    # Create percentile ranks
    for col in rank_columns:
        if col in df.columns and df[col].notna().any():
            df[f'PR_{col}'] = df[col].rank(pct=True)
        else:
            df[f'PR_{col}'] = 0.5
    
    # Calculate Watch Index components - simpler version
    df['Scoring'] = (df['PR_total_score'] + df['PR_threes_made'] + df['PR_three_pt_pct'] + df['PR_star'] * 3 + df['PR_paint_pts'] * 2) / 8
    
    df['Competitiveness'] = (
        df['PR_closeness'] * 10 +
        df['PR_lead_changes'] * 4 + 
        df['PR_times_tied'] + 
        df['PR_clutch_ending'] * 4 + 
        df['PR_overtime'] * 2
    ) / 21
    
    df['Highlights'] = (df['PR_steals'] + df['PR_blocks'] + df['PR_fast_break_pts'] * 2 + df['PR_to_pts']) / 5
    
    # Final Watch Index
    df['WatchIndex'] = (
        df['Scoring'] * 3 + 
        df['Competitiveness'] * 7 + 
        df['Highlights'] * 2
    ) / 12
    
    # Sort by Watch Index
    df = df.sort_values('WatchIndex', ascending=False).reset_index(drop=True)
    
    return df


# Example usage
if __name__ == "__main__":
    # Test with a small number of games from the 2022-23 season
    watch_index = get_simple_basketball_watch_index('2024-25', num_games=10)
    
    print("\nTop Games by Watch Index:")
    if len(watch_index) > 0:
        print(watch_index[['game_date', 'home_team', 'away_team', 'home_score', 'away_score', 
                          'lead_changes', 'closeness', 'WatchIndex']].head())
    else:
        print("No valid games found.")

Getting game data for 2024-25 season...
Processing game 1/10: 0022401193
Could not extract lead changes from line_score: single positional indexer is out-of-bounds
Processing game 2/10: 0022401197
Could not extract lead changes from line_score: single positional indexer is out-of-bounds
Processing game 3/10: 0022401190
Could not extract lead changes from line_score: single positional indexer is out-of-bounds
Processing game 4/10: 0022401194
Could not extract lead changes from line_score: single positional indexer is out-of-bounds
Processing game 5/10: 0022401200
Could not extract lead changes from line_score: single positional indexer is out-of-bounds
Processing game 6/10: 0022401199
Could not extract lead changes from line_score: single positional indexer is out-of-bounds


KeyboardInterrupt: 

In [28]:
from nba_api.stats.endpoints import LeagueGameLog, BoxScoreSummaryV2, BoxScoreTraditionalV2, BoxScoreAdvancedV2
import pandas as pd
import numpy as np
from datetime import datetime
import time
import os
import json
import pickle
import random
from pathlib import Path

def get_multiple_seasons_watch_index(seasons=None, num_games_per_season=None, checkpoint_dir="checkpoints"):
    """
    Create a basketball watch index for multiple seasons with checkpointing.
    
    Parameters:
    ----------
    seasons : list
        List of seasons in format ['YYYY-YY'] (e.g., ['2022-23', '2021-22'])
        If None, will use the last 5 seasons
    num_games_per_season : int or None
        Number of games to analyze per season. If None, all games.
    checkpoint_dir : str
        Directory to store checkpoint files
        
    Returns:
    -------
    pd.DataFrame
        DataFrame with watch index and component metrics for all seasons
    """
    # If no seasons provided, use the last 5 seasons
    if seasons is None:
        current_year = datetime.now().year
        month = datetime.now().month
        # If we're between January and September, the current season is previous year
        if 1 <= month <= 9:
            current_year -= 1
        
        seasons = []
        for i in range(5):
            season_start = current_year - i
            season_end = str(season_start + 1)[-2:]
            seasons.append(f"{season_start}-{season_end}")
    
    # Create checkpoint directory if it doesn't exist
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    
    # Master results dataframe
    all_results = pd.DataFrame()
    
    # Process each season
    for season in seasons:
        print(f"\n=== Processing season {season} ===")
        
        # Check if we already have results for this season
        season_file = os.path.join(checkpoint_dir, f"watch_index{season}.csv")
        if os.path.exists(season_file):
            print(f"Found existing data for {season}, loading...")
            season_df = pd.read_csv(season_file)
            all_results = pd.concat([all_results, season_df], ignore_index=True)
            print(f"Loaded {len(season_df)} games from {season}")
            continue
        
        season_df = process_single_season(season, num_games_per_season, checkpoint_dir)
        
        # Save the season results
        if len(season_df) > 0:
            season_df.to_csv(season_file, index=False)
            print(f"Saved {len(season_df)} games for {season} to {season_file}")
            all_results = pd.concat([all_results, season_df], ignore_index=True)
    
    # Sort the final dataframe by watch index
    if len(all_results) > 0:
        all_results = all_results.sort_values('WatchIndex', ascending=False).reset_index(drop=True)
    
    # Save the complete dataset
    all_results_file = os.path.join(checkpoint_dir, "watch_index_all_seasons.csv")
    all_results.to_csv(all_results_file, index=False)
    print(f"Saved complete dataset with {len(all_results)} games to {all_results_file}")
    
    return all_results

def process_single_season(season, num_games=None, checkpoint_dir="checkpoints"):
    """
    Process a single season with checkpointing capabilities.
    
    Parameters:
    ----------
    season : str
        Season in format 'YYYY-YY' (e.g., '2022-23')
    num_games : int or None
        Number of games to analyze. If None, all games.
    checkpoint_dir : str
        Directory to store checkpoint files
    
    Returns:
    -------
    pd.DataFrame
        DataFrame with watch index for the season
    """
    # Define checkpoint file paths
    games_list_file = os.path.join(checkpoint_dir, f"games_list{season}.pkl")
    processed_games_file = os.path.join(checkpoint_dir, f"processed_games{season}.json")
    interim_results_file = os.path.join(checkpoint_dir, f"interim_results{season}.pkl")
    
    # Check if we have a list of games already
    games_df = None
    unique_game_ids = []
    
    if os.path.exists(games_list_file):
        print(f"Loading existing games list for {season}...")
        with open(games_list_file, 'rb') as f:
            games_df = pickle.load(f)
        unique_game_ids = games_df['GAME_ID'].unique()
        print(f"Loaded {len(unique_game_ids)} unique games")
    else:
        try:
            print(f"Getting game data for {season} season...")
            # Get all games for the season
            games_df = LeagueGameLog(season=season).get_data_frames()[0]
            
            # Process the data to identify unique games
            games_df['GAME_DATE_DT'] = pd.to_datetime(games_df['GAME_DATE'])
            games_df['IS_HOME'] = games_df['MATCHUP'].str.contains('vs')
            
            # Create a unique game identifier
            games_df['UNIQUE_GAME'] = games_df['GAME_ID'] + "_" + games_df['TEAM_ID'].astype(str)
            
            # Get the most recent games first
            games_df = games_df.sort_values('GAME_DATE_DT', ascending=False)
            
            # Get unique game IDs (each game appears twice in the data, once for each team)
            unique_game_ids = games_df['GAME_ID'].unique()
            
            # Save games list to checkpoint
            with open(games_list_file, 'wb') as f:
                pickle.dump(games_df, f)
            print(f"Saved {len(unique_game_ids)} unique games list to checkpoint")
            
        except Exception as e:
            print(f"Error getting game data for {season}: {e}")
            return pd.DataFrame()
    
    # Limit to the specified number of games if needed
    if num_games and num_games < len(unique_game_ids):
        unique_game_ids = unique_game_ids[:num_games]
    
    # Load the set of already processed games
    processed_games = set()
    if os.path.exists(processed_games_file):
        with open(processed_games_file, 'r') as f:
            processed_games = set(json.load(f))
        print(f"Found {len(processed_games)} already processed games")
    
    # Load interim results if they exist
    results = []
    if os.path.exists(interim_results_file):
        with open(interim_results_file, 'rb') as f:
            results = pickle.load(f)
        print(f"Loaded {len(results)} interim results")
    
    # Identify games still to process
    games_to_process = [g for g in unique_game_ids if g not in processed_games]
    print(f"{len(games_to_process)} games left to process for {season}")
    
    # Process each game
    for i, game_id in enumerate(games_to_process):
        success = False
        retries = 0
        max_retries = 3
        
        while not success and retries < max_retries:
            try:
                print(f"Processing game {i+1}/{len(games_to_process)}: {game_id} (Attempt {retries+1})")
                
                # Get the rows for this game (home and away)
                game_rows = games_df[games_df['GAME_ID'] == game_id]
                
                if len(game_rows) != 2:
                    print(f"Unexpected number of rows for game {game_id}: {len(game_rows)}")
                    success = True  # Mark as success to move on, even though we're skipping
                    continue
                
                # Separate home and away teams
                home_row = game_rows[game_rows['IS_HOME']].iloc[0] if any(game_rows['IS_HOME']) else None
                away_row = game_rows[~game_rows['IS_HOME']].iloc[0] if any(~game_rows['IS_HOME']) else None
                
                # If we can't identify home/away, skip this game
                if home_row is None or away_row is None:
                    print(f"Could not identify home/away teams for game {game_id}")
                    success = True  # Mark as success to move on
                    continue
                
                # Extract basic game info
                home_team = home_row['TEAM_ABBREVIATION']
                away_team = away_row['TEAM_ABBREVIATION']
                home_score = int(home_row['PTS'])
                away_score = int(away_row['PTS'])
                game_date = home_row['GAME_DATE']
                
                # Initialize metrics with default values
                lead_changes = 0
                times_tied = 0
                largest_lead = 0
                fast_break_pts = 0
                paint_pts = 0
                to_pts = 0
                star = 0
                threes_made = 0
                threes_attempted = 0
                steals = 0
                blocks = 0
                
                # Try to get additional data from BoxScoreSummary with error handling
                try:
                    summary = BoxScoreSummaryV2(game_id=game_id).get_data_frames()
                    if len(summary) >= 2:
                        line_score = summary[1]
                        # Extract metrics from line_score
                        lead_changes = line_score['LEAD_CHANGES'].iloc[0] if 'LEAD_CHANGES' in line_score.columns else 0
                        times_tied = line_score['TIMES_TIED'].iloc[0] if 'TIMES_TIED' in line_score.columns else 0
                        largest_lead = line_score['LARGEST_LEAD'].max() if 'LARGEST_LEAD' in line_score.columns else 0
                        fast_break_pts = line_score['PTS_FB'].sum() if 'PTS_FB' in line_score.columns else 0
                        paint_pts = line_score['PTS_PAINT'].sum() if 'PTS_PAINT' in line_score.columns else 0
                        to_pts = line_score['PTS_OFF_TO'].sum() if 'PTS_OFF_TO' in line_score.columns else 0
                except Exception as e:
                    if str(e) == 'single positional indexer is out-of-bounds':
                        print(f"Error getting traditional stats: {e}")
                    
                    else:
                        raise ValueError(f"Error getting summary stats: {e}")
                
                # Add jitter to avoid consistent API rate limit hits
                time.sleep(random.uniform(0.8, 1.5))
                
                # Try to get advanced stats
                try:
                    advanced_stats = BoxScoreAdvancedV2(game_id=game_id).get_data_frames()[0]
                    if advanced_stats is not None and len(advanced_stats) > 0:
                        # Make sure required columns exist
                        if 'USG_PCT' in advanced_stats.columns and 'TS_PCT' in advanced_stats.columns:
                            advanced_stats['Star'] = advanced_stats['USG_PCT'] * advanced_stats['TS_PCT']
                            star = advanced_stats['Star'].sum()
                except Exception as e:
                    if str(e) == 'single positional indexer is out-of-bounds':
                        print(f"Error getting traditional stats: {e}")
                    
                    else:
                        raise ValueError(f"Error getting advanced stats: {e}")
                
                # Add jitter to avoid consistent API rate limit hits
                time.sleep(random.uniform(0.8, 1.5))
                
                # Try to get traditional stats
                try:
                    traditional_stats = BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[0]
                    if traditional_stats is not None and len(traditional_stats) > 0:
                        threes_made = traditional_stats['FG3M'].sum() if 'FG3M' in traditional_stats.columns else 0
                        threes_attempted = traditional_stats['FG3A'].sum() if 'FG3A' in traditional_stats.columns else 0
                        steals = traditional_stats['STL'].sum() if 'STL' in traditional_stats.columns else 0
                        blocks = traditional_stats['BLK'].sum() if 'BLK' in traditional_stats.columns else 0
                except Exception as e:
                    if str(e) == 'single positional indexer is out-of-bounds':
                        print(f"Error getting traditional stats: {e}")
                    
                    else:
                        raise ValueError(f"Error getting traditional stats: {e}")
                
                # Calculate derived metrics
                total_score = home_score + away_score
                score_diff = abs(home_score - away_score)
                closeness = 1 - (score_diff / total_score if total_score > 0 else 0)
                three_pt_pct = threes_made / threes_attempted if threes_attempted > 0 else 0
                
                # Determine if it was a close game at the end
                clutch_ending = 1 if score_diff <= 5 else 0
                
                # Determine if it went to overtime
                overtime = 1 if "OT" in home_row['MATCHUP'] or "OT" in away_row['MATCHUP'] else 0
                
                # Create game data dictionary
                game_data = {
                    'game_id': game_id,
                    'game_date': game_date,
                    'season': season,
                    'home_team': home_team,
                    'away_team': away_team,
                    'home_score': home_score,
                    'away_score': away_score,
                    'total_score': total_score,
                    'score_diff': score_diff,
                    'closeness': closeness,
                    'lead_changes': lead_changes,
                    'times_tied': times_tied,
                    'largest_lead': largest_lead,
                    'threes_made': threes_made,
                    'three_pt_pct': three_pt_pct,
                    'fast_break_pts': fast_break_pts,
                    'paint_pts': paint_pts,
                    'to_pts': to_pts,
                    'star': star,
                    'steals': steals,
                    'blocks': blocks,
                    'clutch_ending': clutch_ending,
                    'overtime': overtime
                }
                
                results.append(game_data)
                processed_games.add(game_id)
                
                # Update checkpoint files periodically (every 5 games)
                if (i + 1) % 5 == 0 or i == len(games_to_process) - 1:
                    # Save processed games list
                    with open(processed_games_file, 'w') as f:
                        json.dump(list(processed_games), f)
                    
                    # Save interim results
                    with open(interim_results_file, 'wb') as f:
                        pickle.dump(results, f)
                    
                    print(f"Checkpoint saved after {i+1} games")

                    # Add jitter to avoid consistent API rate limit hits
                    time.sleep(random.uniform(5, 10))
                    
                if (i + 1) % 25 == 0 or i == len(games_to_process) - 1:
                    # Add jitter to avoid consistent API rate limit hits
                    time.sleep(3)
                
                # Mark as successful
                success = True
                
                # Add jitter to avoid consistent API rate limit hits
                time.sleep(random.uniform(1.0, 2.0))
                
            except Exception as e:
                retries += 1
                print(f"Error processing game {game_id}: {e}")
                print(f"Retrying in {3**retries} seconds... (Attempt {retries+1}/{max_retries})")
                time.sleep(1**retries)  # Exponential backoff
        
        if not success:
            print(f"Failed to process game {game_id} after {max_retries} attempts")
    
    # Create DataFrame from results
    df = pd.DataFrame(results)
    
    if len(df) == 0:
        print("No valid games found.")
        return df
    
    print(f"Calculating watch index for {len(df)} games from {season}")
    
    # Calculate percentile ranks for key metrics
    rank_columns = [
        'total_score', 'closeness', 'lead_changes', 'times_tied',
        'threes_made', 'three_pt_pct', 'steals', 'blocks',
        'clutch_ending', 'overtime', 'star', 'fast_break_pts', 'paint_pts', 'to_pts'
    ]
    
    # Create percentile ranks
    for col in rank_columns:
        if col in df.columns and df[col].notna().any():
            df[f'PR_{col}'] = df[col].rank(pct=True)
        else:
            df[f'PR_{col}'] = 0.5
    
    # Calculate Watch Index components
    df['Scoring'] = (df['PR_total_score'] + df['PR_threes_made'] + df['PR_three_pt_pct'] + df['PR_star'] * 3 + df['PR_paint_pts'] * 2) / 8
    
    df['Competitiveness'] = (
        df['PR_closeness'] * 10 +
        df['PR_lead_changes'] * 4 + 
        df['PR_times_tied'] + 
        df['PR_clutch_ending'] * 4 + 
        df['PR_overtime'] * 2
    ) / 21
    
    df['Highlights'] = (df['PR_steals'] + df['PR_blocks'] + df['PR_fast_break_pts'] * 2 + df['PR_to_pts']) / 5
    
    # Final Watch Index
    df['WatchIndex'] = (
        df['Scoring'] * 3 + 
        df['Competitiveness'] * 7 + 
        df['Highlights'] * 2
    ) / 12
    
    # Sort by Watch Index
    df = df.sort_values('WatchIndex', ascending=False).reset_index(drop=True)
    
    return df

def get_top_games_by_criteria(df, criteria='WatchIndex', n=10):
    """
    Get top N games by specified criteria
    
    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame with watch index data
    criteria : str
        Column to sort by
    n : int
        Number of games to return
        
    Returns:
    -------
    pd.DataFrame
        Top N games by criteria
    """
    if criteria not in df.columns:
        print(f"Criteria {criteria} not found in data")
        return None
    
    return df.sort_values(criteria, ascending=False).head(n)

# Example usage
if __name__ == "__main__":
    # Get games from the past 5 seasons
    all_seasons_df = get_multiple_seasons_watch_index()
    
    if len(all_seasons_df) > 0:
        # Print top games overall
        print("\nTop 10 Games by Watch Index Across All Seasons:")
        top_games = all_seasons_df[['season', 'game_date', 'home_team', 'away_team', 
                                     'home_score', 'away_score', 'lead_changes', 
                                     'closeness', 'overtime', 'WatchIndex']].head(10)
        print(top_games)
        
        # Print top games by season
        for season in all_seasons_df['season'].unique():
            season_df = all_seasons_df[all_seasons_df['season'] == season]
            print(f"\nTop 5 Games from {season} Season:")
            print(season_df[['game_date', 'home_team', 'away_team', 'home_score', 
                             'away_score', 'lead_changes', 'overtime', 'WatchIndex']].head(5))
        
        # Print some specialty categories
        print("\nMost Competitive Games:")
        print(get_top_games_by_criteria(all_seasons_df, 'Competitiveness', 5)[
            ['season', 'game_date', 'home_team', 'away_team', 'score_diff', 
             'lead_changes', 'overtime', 'Competitiveness', 'WatchIndex']])
        
        print("\nGames with Best Highlights:")
        print(get_top_games_by_criteria(all_seasons_df, 'Highlights', 5)[
            ['season', 'game_date', 'home_team', 'away_team', 
             'steals', 'blocks', 'fast_break_pts', 'Highlights', 'WatchIndex']])
    else:
        print("No valid games found across all seasons.")


=== Processing season 2024-25 ===
Found existing data for 2024-25, loading...
Loaded 1191 games from 2024-25

=== Processing season 2023-24 ===
Found existing data for 2023-24, loading...
Loaded 1080 games from 2023-24

=== Processing season 2022-23 ===
Loading existing games list for 2022-23...
Loaded 1230 unique games
Found 799 already processed games
Loaded 799 interim results
431 games left to process for 2022-23
Processing game 1/431: 0022200437 (Attempt 1)
Error processing game 0022200437: Error getting summary stats: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Retrying in 3 seconds... (Attempt 2/3)
Processing game 1/431: 0022200437 (Attempt 2)
Error processing game 0022200437: Error getting summary stats: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Retrying in 9 seconds... (Attempt 3/3)
Processing game 1/431: 0022200437 (Attempt 3)
Error processing game 0022200437: Error getting summary stats:

KeyboardInterrupt: 

In [27]:
all_seasons_df = get_multiple_seasons_watch_index()
all_seasons_df = get_multiple_seasons_watch_index()
all_seasons_df = get_multiple_seasons_watch_index()


=== Processing season 2024-25 ===
Found existing data for 2024-25, loading...
Loaded 1191 games from 2024-25

=== Processing season 2023-24 ===
Found existing data for 2023-24, loading...
Loaded 1080 games from 2023-24

=== Processing season 2022-23 ===
Loading existing games list for 2022-23...
Loaded 1230 unique games
Found 669 already processed games
Loaded 669 interim results
561 games left to process for 2022-23
Processing game 1/561: 0022201152 (Attempt 1)
Processing game 2/561: 0022201158 (Attempt 1)
Processing game 3/561: 0022201150 (Attempt 1)
Processing game 4/561: 0022201147 (Attempt 1)
Processing game 5/561: 0022201146 (Attempt 1)
Checkpoint saved after 5 games
Processing game 6/561: 0022201144 (Attempt 1)
Processing game 7/561: 0022201141 (Attempt 1)
Processing game 8/561: 0022201142 (Attempt 1)
Processing game 9/561: 0022201137 (Attempt 1)
Processing game 10/561: 0022201143 (Attempt 1)
Checkpoint saved after 10 games
Processing game 11/561: 0022201140 (Attempt 1)
Process

KeyboardInterrupt: 