In [1]:
import pandas as pd
from datetime import date, timedelta
import nba_api.stats.endpoints as endpoints
from nba_api.live.nba.endpoints import boxscore

### Helper Functions
- Testing for these functions in data_extraction/nba-api-demo.ipynb

In [2]:
from nba_api.stats.endpoints import boxscoresummaryv2

def get_home_away_team(game_id):
    """
    Retrieves the home and away team IDs for a given NBA game.

    Parameters:
    game_id (str): The unique identifier for the game.

    Returns:
    tuple: A pair of integers representing the home team ID and away team ID.
    """

    boxscore = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id)
    game_data = boxscore.get_data_frames()[0]  # Game summary data

    home_team_id = game_data['HOME_TEAM_ID'].iloc[0]  # Extract home team ID
    away_team_id = game_data['VISITOR_TEAM_ID'].iloc[0]  # Extract away team ID

    return home_team_id, away_team_id

In [3]:
def get_season(game_id):
    """
    Determines the NBA season based on the given game_id.

    The game_id follows a pattern where:
    - A game_id starting with '00246' to '00299' corresponds to seasons from 1946-47 to 1999-00.
    - A game_id starting with '00200' to '00224' corresponds to seasons from 2000-01 to 2024-25.

    Logic:
    - Extract the 4th and 5th digits of game_id (game_year).
    - If game_year is between 46 and 99, it belongs to the 1900s (1946-47 to 1999-00).
    - Otherwise, it belongs to the 2000s (2000-01 onward).
    - The output is formatted as "YYYY-YY", where YY represents the last two digits of the next year.

    Parameters:
    game_id (str): The unique identifier for the game.

    Returns:
    string: The NBA season in the format 'YYYY-YY' (e.g., '1999-00').
    """

    game_year = int(game_id[3:5])  # Extracts the season identifier
    if 46 <= game_year <= 99:
        start_year = game_year + 1900
    else:
        start_year = game_year + 2000

    end_year_short = (start_year + 1) % 100  # Get last two digits of the next year
    return f"{start_year}-{end_year_short:02d}"

### Populating the Teams Table

In [4]:
def fill_teams_df(game_id, team_stats, teams_df):

    season_year = get_season(game_id)
    team_ids = team_stats['TEAM_ID'].unique()

    # Get home team data
    team_one_row = team_stats[team_stats['TEAM_ID'] == team_ids[0]].iloc[0]
    team_one_location = team_one_row['TEAM_CITY']
    team_one_name = team_one_row['TEAM_NAME']
    team_one_abbrev = team_one_row['TEAM_ABBREVIATION']

    # Get away team data
    team_two_row = team_stats[team_stats['TEAM_ID'] == team_ids[1]].iloc[0]
    team_two_location = team_two_row['TEAM_CITY']
    team_two_name = team_two_row['TEAM_NAME']
    team_two_abbrev = team_two_row['TEAM_ABBREVIATION']

    # Convert the data to a data frame and concatenate it with the existing teams_df
    new_rows = pd.DataFrame([
        {'team_id': team_ids[0], 'season_year': season_year,
        'team_location': team_one_location, 'team_name': team_one_name,
        'team_abbreviation': team_one_abbrev},

        {'team_id': team_ids[1], 'season_year': season_year,
        'team_location': team_two_location, 'team_name': team_two_name,
        'team_abbreviation': team_two_abbrev}
    ])

    # Ensure uniqueness before concatenation (set lookup is O(1) time complexity)
    existing_keys = set(zip(teams_df['team_id'], teams_df['season_year']))
    new_rows_filtered = new_rows[~new_rows.apply(lambda row: (row['team_id'], row['season_year']) in existing_keys, axis=1)]

    # Concatenate only if new unique rows exist
    if not new_rows_filtered.empty:
        teams_df = pd.concat([teams_df, new_rows_filtered], ignore_index=True)

    return teams_df

In [25]:
# TODO: Iterate from the first NBA game to now
# TODO: Once the dataframe gets to 30 teams, stop the loop -- solved
    # TODO: This won't work for most years
    # TODO: Instead iterate through 10 days for each -- solved
start_date = date(1946, 11, 1)
today = date(2025, 3, 2)

import time
import random

teams_df = pd.DataFrame(columns=['team_id', 'season_year', 'team_location', 'team_name', 'team_abbreviation'])

current_date = start_date

current_highest_season = ""

while current_date <= today:
    # name teams_df_year, which is the teams_df for the highest year
    # teams_df_year = teams_df[teams_df["season_year"] == current_highest_season]

    # if len(teams_df_year["team_id"].unique()) >= 30:
    #     #Go to the next season -- could be year+1 if in the first half of the season or year if in the second half
    #     print(f"Finished processing games for season {current_highest_season}.")
    #     print(f"The teams for season {current_highest_season} are: {teams_df_year['team_name'].unique()}")
    #     if current_date.month < 7:
    #         current_date = date(current_date.year, 10, 22) 
    #     else:
    #         current_date = date(current_date.year + 1, 10, 22)
    #     continue

    print(f"Processing games for {current_date}...")
    time.sleep(random.uniform(0.5, 1.5) * 5)
    try:
        games = endpoints.scoreboardv2.ScoreboardV2(game_date=current_date)
        games_data = games.get_data_frames()[0]
        game_ids = games_data['GAME_ID']
        # Get stats for the first game in the list
        current_highest_season = get_season(game_ids[0])
        print(f"Current highest season: {current_highest_season}")
    except Exception as e:
        print(f"Error processing games for {current_date}: {e}. Skipping")
        current_date += timedelta(days=5)
        continue
    

    for game_id in game_ids:
        # Get all stats using BoxScoreTraditionalV2
        boxscore = endpoints.boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
        player_stats = boxscore.get_data_frames()[0]
        team_stats = boxscore.get_data_frames()[1]

        # Get general data
        game_id = player_stats['GAME_ID'][0]
        teams_df = fill_teams_df(game_id, team_stats, teams_df)
    
    current_date += timedelta(days=5)

print(teams_df)

Processing games for 1946-11-01...
Current highest season: 1946-47
Processing games for 1946-11-06...
Error processing games for 1946-11-06: 0. Skipping
Processing games for 1946-11-11...
Current highest season: 1946-47
Processing games for 1946-11-16...
Current highest season: 1946-47
Processing games for 1946-11-21...
Current highest season: 1946-47
Processing games for 1946-11-26...
Current highest season: 1946-47
Processing games for 1946-12-01...
Current highest season: 1946-47
Processing games for 1946-12-06...
Error processing games for 1946-12-06: 0. Skipping
Processing games for 1946-12-11...
Current highest season: 1946-47
Processing games for 1946-12-16...
Current highest season: 1946-47
Processing games for 1946-12-21...
Current highest season: 1946-47
Processing games for 1946-12-26...
Current highest season: 1946-47
Processing games for 1946-12-31...
Current highest season: 1946-47
Processing games for 1947-01-05...
Current highest season: 1946-47
Processing games for 194

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

In [26]:
teams_df.to_csv("nba_teams.csv", index=False)

In [None]:
import pandas as pd
from datetime import date, timedelta
import nba_api.stats.endpoints as endpoints
from nba_api.live.nba.endpoints import boxscore

start_date = date(1946, 11, 1)
today = date(1957, 11, 5)

teams_table = pd.DataFrame(columns=['team_id', 'season_year', 'team_location', 'team_name', 'team_abbreviation'])

current_date = start_date
while current_date <= today:
    print(f"Processing games for {current_date}...")
    games = endpoints.scoreboardv2.ScoreboardV2(game_date=current_date)
    games_data = games.get_data_frames()[0]

    if len(games_data) != 0:
        for game_id in games_data['GAME_ID']:
            # Get BoxScore
            box = endpoints.boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
            game_data = box.get_data_frames()[0]
            
            print(game_data)

            # Home team_id is the first team_id in the list
            # Season name is determined by game_id
                # 0024600001 - > 1946-47
                # 0021900001 -> 2019-20
                ## 00200 - 002225 -> 2000-01 to 2024-25
                ## 00246 - 00299 -> 1946-47 to 1999-00
                

            # # Extract team scores
            # home_team = game_data['homeTeam']['teamName']
            # away_team = game_data['awayTeam']['teamName']
            # home_score = game_data['homeTeam']['score']
            # away_score = game_data['awayTeam']['score']

            # Print the result
            # print(f"{away_team} ({away_score}) @ {home_team} ({home_score})")

    current_date += timedelta(days=1)
    print("\n")

Processing games for 1946-11-01...
       GAME_ID     TEAM_ID TEAM_ABBREVIATION TEAM_CITY  PLAYER_ID  \
0   0024600001  1610610035               HUS   Toronto      77035   
1   0024600001  1610610035               HUS   Toronto      76720   
2   0024600001  1610610035               HUS   Toronto      76719   
3   0024600001  1610610035               HUS   Toronto      78050   
4   0024600001  1610610035               HUS   Toronto      76762   
5   0024600001  1610610035               HUS   Toronto      77734   
6   0024600001  1610610035               HUS   Toronto      76161   
7   0024600001  1610610035               HUS   Toronto      77600   
8   0024600001  1610610035               HUS   Toronto      77503   
9   0024600001  1610610035               HUS   Toronto      78494   
10  0024600001  1610610035               HUS   Toronto      77086   
11  0024600001  1610612752               NYK  New York      77660   
12  0024600001  1610612752               NYK  New York      77672   

### Populating the Database with Player Game Stats

In [None]:
from datetime import date, timedelta
import nba_api.stats.endpoints as endpoints
from nba_api.live.nba.endpoints import boxscore

start_date = date(1946, 11, 1)
today = date(1946, 11, 5)

current_date = start_date
while current_date <= today:
    print(current_date)
    print("-------------")
    games = endpoints.scoreboardv2.ScoreboardV2(game_date=current_date)
    games_data = games.get_data_frames()[0]

    if len(games_data) != 0:
        for game_id in games_data['GAME_ID']:
            # Get BoxScore
            box = endpoints.boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
            game_data = box.get_data_frames()[0]
            
            print(game_data)

            # # Extract team scores
            # home_team = game_data['homeTeam']['teamName']
            # away_team = game_data['awayTeam']['teamName']
            # home_score = game_data['homeTeam']['score']
            # away_score = game_data['awayTeam']['score']

            # Print the result
            # print(f"{away_team} ({away_score}) @ {home_team} ({home_score})")

    current_date += timedelta(days=1)
    print("\n")