In [1]:
import pandas as pd
from datetime import date, timedelta
import nba_api.stats.endpoints as endpoints
from nba_api.live.nba.endpoints import boxscore

### Helper Functions
- Testing for these functions in data_extraction/nba-api-demo.ipynb

In [2]:
from nba_api.stats.endpoints import boxscoresummaryv2

def get_home_away_team(game_id):
    """
    Retrieves the home and away team IDs for a given NBA game.

    Parameters:
    game_id (str): The unique identifier for the game.

    Returns:
    tuple: A pair of integers representing the home team ID and away team ID.
    """

    boxscore = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id)
    game_data = boxscore.get_data_frames()[0]  # Game summary data

    home_team_id = game_data['HOME_TEAM_ID'].iloc[0]  # Extract home team ID
    away_team_id = game_data['VISITOR_TEAM_ID'].iloc[0]  # Extract away team ID

    return home_team_id, away_team_id

In [3]:
def get_season(game_id):
    """
    Determines the NBA season based on the given game_id.

    The game_id follows a pattern where:
    - A game_id starting with '00246' to '00299' corresponds to seasons from 1946-47 to 1999-00.
    - A game_id starting with '00200' to '00224' corresponds to seasons from 2000-01 to 2024-25.

    Logic:
    - Extract the 4th and 5th digits of game_id (game_year).
    - If game_year is between 46 and 99, it belongs to the 1900s (1946-47 to 1999-00).
    - Otherwise, it belongs to the 2000s (2000-01 onward).
    - The output is formatted as "YYYY-YY", where YY represents the last two digits of the next year.

    Parameters:
    game_id (str): The unique identifier for the game.

    Returns:
    string: The NBA season in the format 'YYYY-YY' (e.g., '1999-00').
    """

    game_year = int(game_id[3:5])  # Extracts the season identifier
    if 46 <= game_year <= 99:
        start_year = game_year + 1900
    else:
        start_year = game_year + 2000

    end_year_short = (start_year + 1) % 100  # Get last two digits of the next year
    return f"{start_year}-{end_year_short:02d}"

### Populating the Teams Table

In [4]:
def fill_teams_df(game_id, team_stats, teams_df):

    season_year = get_season(game_id)
    team_ids = team_stats['TEAM_ID'].unique()

    # Get home team data
    team_one_row = team_stats[team_stats['TEAM_ID'] == team_ids[0]].iloc[0]
    team_one_location = team_one_row['TEAM_CITY']
    team_one_name = team_one_row['TEAM_NAME']
    team_one_abbrev = team_one_row['TEAM_ABBREVIATION']

    # Get away team data
    team_two_row = team_stats[team_stats['TEAM_ID'] == team_ids[1]].iloc[0]
    team_two_location = team_two_row['TEAM_CITY']
    team_two_name = team_two_row['TEAM_NAME']
    team_two_abbrev = team_two_row['TEAM_ABBREVIATION']

    # Convert the data to a data frame and concatenate it with the existing teams_df
    new_rows = pd.DataFrame([
        {'team_id': team_ids[0], 'season_year': season_year,
        'team_location': team_one_location, 'team_name': team_one_name,
        'team_abbreviation': team_one_abbrev},

        {'team_id': team_ids[1], 'season_year': season_year,
        'team_location': team_two_location, 'team_name': team_two_name,
        'team_abbreviation': team_two_abbrev}
    ])

    # Ensure uniqueness before concatenation (set lookup is O(1) time complexity)
    existing_keys = set(zip(teams_df['team_id'], teams_df['season_year']))
    new_rows_filtered = new_rows[~new_rows.apply(lambda row: (row['team_id'], row['season_year']) in existing_keys, axis=1)]

    # Concatenate only if new unique rows exist
    if not new_rows_filtered.empty:
        teams_df = pd.concat([teams_df, new_rows_filtered], ignore_index=True)

    return teams_df

In [5]:
def fill_players_df(player_stats, players_df):

    player_ids = player_stats['PLAYER_ID'].unique()

    for player_id in player_ids:
        # Get the player data
        player_row = player_stats[player_stats['PLAYER_ID'] == player_id].iloc[0]
        full_name = player_row['PLAYER_NAME']
        name_parts = full_name.split(" ", 1)  # Split at the first space
        player_first_name = name_parts[0]  # First name (everything before the first space)
        player_last_name = name_parts[1] if len(name_parts) > 1 else ""  # Last name (everything after), or empty if no space

        # Convert the data to a data frame and concatenate it with the existing players_df
        new_row = pd.DataFrame([
            {'player_id': player_id,
            'player_first_name': player_first_name, 'player_last_name': player_last_name
            },
        ])

        # Ensure uniqueness before concatenation (set lookup is O(1) time complexity)
        existing_keys = set(zip(players_df['player_id']))
        new_rows_filtered = new_row[~new_row.apply(lambda row: (row['player_id']) in existing_keys, axis=1)]

        # Concatenate only if new unique rows exist
        if not new_rows_filtered.empty:
            players_df = pd.concat([players_df, new_rows_filtered], ignore_index=True)

    return players_df

In [6]:
def fill_games_df(game_id, game_date, games_df):
    season_year = get_season(game_id)
    home_team_id, away_team_id = get_home_away_team(game_id)

    # Convert the data to a data frame and concatenate it with the existing games_df
    new_row = pd.DataFrame([
        {'game_id': game_id, 'season_year': season_year, 'game_date': game_date,
        'home_team_id': home_team_id, 'away_team_id': away_team_id}
    ])

    # Ensure uniqueness before concatenation (set lookup is O(1) time complexity)
    existing_keys = set(zip(games_df['game_id'], games_df['season_year']))
    new_rows_filtered = new_row[~new_row.apply(lambda row: (row['game_id'], row['season_year']) in existing_keys, axis=1)]

    # Concatenate only if new unique rows exist
    if not new_rows_filtered.empty:
        games_df = pd.concat([games_df, new_rows_filtered], ignore_index=True)

    return games_df

In [7]:
def get_player_game_stats(game_id, player_stats, player_game_stats_df):
    player_ids = player_stats['PLAYER_ID'].unique()

    columns_to_keep = [
        "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", 
        "FTM", "FTA", "FT_PCT", "OREB", "DREB", "REB", "AST", "STL", 
        "BLK", "TO", "PF", "PTS", "PLUS_MINUS"
    ]

    for player_id in player_ids:
        player_row = player_stats[player_stats['PLAYER_ID'] == player_id].iloc[0]

        # Get the player stats in JSON format
        player_stats_json = player_row.to_dict()

        # Filter the data
        player_stats_json = [
            {key: player_stats_json[key] for key in columns_to_keep if key in player_stats_json} 
        ]

        # Convert the data to a data frame and concatenate it with the existing player_game_stats_df
        new_row = pd.DataFrame([
            {'game_id': game_id, 'player_id': player_id, 'team_id': player_row['TEAM_ID'], 
            'player_game_stats': player_stats_json}
        ])

        # Ensure uniqueness before concatenation (set lookup is O(1) time complexity)
        existing_keys = set(zip(player_game_stats_df['game_id'], player_game_stats_df['player_id']))
        new_rows_filtered = new_row[~new_row.apply(lambda row: (row['game_id'], row['player_id']) in existing_keys, axis=1)]

        # Concatenate only if new unique rows exist
        if not new_rows_filtered.empty:
            player_game_stats_df = pd.concat([player_game_stats_df, new_rows_filtered], ignore_index=True)

    return player_game_stats_df

In [None]:
import time
import random
from datetime import date, timedelta
import pandas as pd

# Initialize DataFrames
teams_df = pd.DataFrame(columns=['team_id', 'season_year', 'team_location', 'team_name', 'team_abbreviation'])
players_df = pd.DataFrame(columns=['player_id', 'player_first_name', 'player_last_name'])
games_df = pd.DataFrame(columns=['game_id', 'season_year', 'game_date', 'home_team_id', 'away_team_id'])
player_game_stats_df = pd.DataFrame(columns=['game_id', 'player_id', 'team_id', 'player_game_stats'])

start_date = date(11, 1, 1946)
end_date = date(10, 15, 2018)

current_date = start_date
current_highest_season = ""

# Function to simulate an API request with retries and exponential backoff
def fetch_with_retry(func, *args, **kwargs):
    retries = 20  # Number of retries
    delay = 1  # Initial delay time in seconds
    for attempt in range(retries):
        try:
            # Call the original function with passed arguments and keyword arguments
            return func(*args, **kwargs)
        except Exception as e:
            if attempt < retries - 1:
                print(f"Error: {e}. Retrying in {delay}s...")
                #print the current time
                print(time.strftime('%X %x %Z'))
                time.sleep(delay)
                delay *= 2  # Exponential backoff
            else:
                print(f"Failed after {retries} attempts: {e}")
                return None

# Request limit settings
delay_between_requests = 1  # Start with a 60-second delay (1 request per minute)
max_delay = 360  # Maximum delay of 6 minutes (to prevent overwhelming the server)

while current_date <= end_date:
    print(f"Processing games for {current_date}...")
    
    # Add a random sleep time to avoid hitting rate limits
    time.sleep(random.uniform(1.0, 2.0))
    
    # Fetch the games data with retry and exponential backoff
    games = fetch_with_retry(endpoints.scoreboardv2.ScoreboardV2, game_date=current_date)
    
    if games is None:
        # If the fetch failed after retries, skip this date
        start_date += timedelta(days=1)
        continue
    
    try:
        games_data = games.get_data_frames()[0]
        game_ids = games_data['GAME_ID']
        if len(game_ids) > 0:
            current_highest_season = get_season(game_ids[0])  # Assuming get_season is defined somewhere
        else:
            print(f"No games found for {current_date}")
    except Exception as e:
        print(f"Error processing games for {current_date}: {e}. Skipping")
        start_date += timedelta(days=1)
        continue
    
    # Loop through all the game IDs and fetch the box score data
    for game_id in game_ids:
        # Get all stats using BoxScoreTraditionalV2 with retry and exponential backoff
        boxscore = fetch_with_retry(endpoints.boxscoretraditionalv2.BoxScoreTraditionalV2, game_id=game_id)
        
        if boxscore is None:
            print(f"Error fetching stats for game {game_id}. Skipping.")
            start_date += timedelta(days=1)
            continue

        try:
            player_stats = boxscore.get_data_frames()[0]
            team_stats = boxscore.get_data_frames()[1]
            
            teams_df = fill_teams_df(game_id, team_stats, teams_df)
            players_df = fill_players_df(player_stats, players_df)
            games_df = fill_games_df(game_id, current_date, games_df)
            player_game_stats_df = get_player_game_stats(game_id, player_stats, player_game_stats_df)

            teams_df.to_csv(f'teams-old.csv', index=False)
            players_df.to_csv(f'players-old.csv', index=False)
            games_df.to_csv(f'games-old.csv', index=False)
            player_game_stats_df.to_csv(f'player_game_stats-old.csv', index=False)


        except Exception as e:
            print(f"Error processing stats for game {game_id}: {e}. Skipping.")
            start_date += timedelta(days=1)
            continue

    # After processing all games, move to the next 5 days

    # Implement request throttling based on rate limits
    time.sleep(delay_between_requests + random.uniform(0, 1))  # Adding a bit of randomness to avoid clustered requests

    # Apply exponential backoff: increase delay after each request to manage rate limits
    delay_between_requests = min(delay_between_requests, max_delay)

print(teams_df)
print('\n')
print(players_df)
print('\n')
print(games_df)
print('\n')
print(player_game_stats_df)


Processing games for 2018-10-16...
Current highest season: 2018-19
Processing games for 2018-10-17...
Current highest season: 2018-19
Processing games for 2018-11-21...
Current highest season: 2018-19
Processing games for 2018-11-21...
Current highest season: 2018-19
Error: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30). Retrying in 1s...
18:05:21 03/07/25 Eastern Standard Time
Processing games for 2018-11-21...
Current highest season: 2018-19
Error processing stats for game 0021800253: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30). Skipping.
Error: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30). Retrying in 1s...
18:06:27 03/07/25 Eastern Standard Time
Error: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30). Retrying in 2s...
18:06:59 03/07/25 Eastern Standard Time
Error: HTTPSConnectionPool(host='stats.nba.com', port=443): Read