Quick Feature Engineering Note:

Typically when incorporating RAPM data, we want to include data for individuals, however this is not possible.

In [1]:
!pip install nba_api
from nba_api.stats.endpoints import PlayByPlayV2
from nba_api.stats.endpoints import LeagueGameFinder
from nba_api.stats.endpoints import BoxScoreTraditionalV3
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

Collecting nba_api
  Downloading nba_api-1.11.3-py3-none-any.whl.metadata (5.8 kB)
Downloading nba_api-1.11.3-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.0/319.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api
Successfully installed nba_api-1.11.3


In [3]:
# ...existing code...
# 1. Get all games from 2018-19 and filter for Overtime
def get_ot_games(season_str):
    print("Fetching 2018-19 Regular Season games...")
    game_finder = LeagueGameFinder(season_nullable=season_str, season_type_nullable='Regular Season')
    games = game_finder.get_data_frames()[0]

    # Helper to parse minutes
    def parse_minutes(min_val):
        if pd.isna(min_val): return 0
        min_str = str(min_val)
        if ':' in min_str:
            return int(min_str.split(':')[0])
        return int(float(min_str))

    games['MIN_CLEAN'] = games['MIN'].apply(parse_minutes)

    # Filter for games longer than 250 minutes (Standard is 240, OT is usually 265)
    ot_games_rows = games[games['MIN_CLEAN'] >= 250].copy()

    # Get unique game IDs
    unique_ot_ids = ot_games_rows['GAME_ID'].unique().tolist()

    print(f"Found {len(unique_ot_ids)} unique overtime games.")
    return unique_ot_ids

# ...existing code...
# ...existing code...
# 2. Get starters for OT
def get_ot_starting_lineup(game_id):
    try:
        # Get players in Period 5 (OT1)
        box = BoxScoreTraditionalV3(game_id=game_id, start_period=5, end_period=5, range_type=0, timeout=30)

        # MANUAL PARSING: Avoid get_data_frames()
        data = box.get_dict()

        if 'boxScoreTraditional' not in data:
            return pd.DataFrame()

        bs = data['boxScoreTraditional']
        players_list = []

        # V3 Structure: boxScoreTraditional -> homeTeam/awayTeam -> players
        if 'homeTeam' in bs and 'players' in bs['homeTeam']:
            for p in bs['homeTeam']['players']:
                # Flatten the nested structure
                flat_player = {
                    'personId': p.get('personId'),
                    'firstName': p.get('firstName'),
                    'familyName': p.get('familyName'),
                    'teamId': bs['homeTeamId'],
                    'teamTricode': bs['homeTeam'].get('teamTricode')
                }
                if 'statistics' in p:
                    flat_player.update(p['statistics'])
                players_list.append(flat_player)

        if 'awayTeam' in bs and 'players' in bs['awayTeam']:
            for p in bs['awayTeam']['players']:
                flat_player = {
                    'personId': p.get('personId'),
                    'firstName': p.get('firstName'),
                    'familyName': p.get('familyName'),
                    'teamId': bs['awayTeamId'],
                    'teamTricode': bs['awayTeam'].get('teamTricode')
                }
                if 'statistics' in p:
                    flat_player.update(p['statistics'])
                players_list.append(flat_player)

        if not players_list:
            return pd.DataFrame()

        players_df = pd.DataFrame(players_list)

        # 1. Normalize column names to uppercase
        players_df.columns = [c.upper() for c in players_df.columns]

        # 2. Map V3 columns to standard names
        rename_map = {
            'PERSONID': 'PLAYER_ID',
            'TEAMID': 'TEAM_ID',
            'TEAMTRICODE': 'TEAM_ABBREVIATION',
            'MINUTES': 'MIN'
        }
        players_df = players_df.rename(columns=rename_map)

        # 3. Handle Player Name
        if 'PLAYER_NAME' not in players_df.columns:
            if 'FIRSTNAME' in players_df.columns and 'FAMILYNAME' in players_df.columns:
                players_df['PLAYER_NAME'] = players_df['FIRSTNAME'] + ' ' + players_df['FAMILYNAME']

        # 4. Add GAME_ID
        players_df['GAME_ID'] = game_id

        # 5. Filter for active players (those with minutes > 0)
        if 'MIN' not in players_df.columns:
            return pd.DataFrame()

        active_ot_players = players_df[players_df['MIN'].notna()].copy()

        # Clean minutes - remove "0:00" entries
        if active_ot_players['MIN'].dtype == object:
             active_ot_players = active_ot_players[active_ot_players['MIN'].str.contains(r'[1-9]', regex=True)]

        if active_ot_players.empty:
            return pd.DataFrame()

        candidate_ids = set(active_ot_players['PLAYER_ID'].tolist())

        # 6. Try to check substitutions (may fail for old games)
        try:
            pbp = PlayByPlayV2(game_id=game_id, start_period=5, end_period=5, timeout=30)
            pbp_df = pbp.play_by_play.get_data_frame()

            subs = pbp_df[pbp_df['EVENTMSGTYPE'] == 8]
            non_starters = set()

            for _, row in subs.iterrows():
                if row['PCTIMESTRING'] == '05:00':
                    non_starters.add(row['PLAYER1_ID'])
                else:
                    non_starters.add(row['PLAYER2_ID'])

            starter_ids = candidate_ids - non_starters
            starters_df = active_ot_players[active_ot_players['PLAYER_ID'].isin(starter_ids)].copy()
        except:
            # Fallback: Just return top 5 players by minutes for each team
            starters_df = active_ot_players.sort_values('MIN', ascending=False).groupby('TEAM_ID').head(5)

        # Ensure we return standard columns
        cols_to_return = ['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'PLAYER_ID', 'PLAYER_NAME']
        available_cols = [c for c in cols_to_return if c in starters_df.columns]

        return starters_df[available_cols]

    except Exception as e:
        print(f"Error processing game {game_id}: {e}")
        return pd.DataFrame()

In [4]:
season_strs = ["2016-17"]

all_seasons_df = pd.DataFrame()

for season_str in season_strs:
    ot_game_ids = get_ot_games(season_str)
    all_starters = []

    print(f"Extracting lineups for {len(ot_game_ids)} games...")
    for i, game_id in enumerate(ot_game_ids):
        print(f"Processing {i+1}/{len(ot_game_ids)}: {game_id}")
        df = get_ot_starting_lineup(game_id)
        if not df.empty:
            all_starters.append(df)
        time.sleep(1.0)

    if all_starters:
        final_df = pd.concat(all_starters, ignore_index=True)
        final_df["season"] = season_str
        all_seasons_df = pd.concat([all_seasons_df, final_df], ignore_index=True)
        outname = f"ot_starting_lineups_{season_str.replace('-', '_')}.csv"
    else:
        print(f"No data found for season {season_str}.")


Fetching 2018-19 Regular Season games...
Found 70 unique overtime games.
Extracting lineups for 70 games...
Processing 1/70: 0021601205
Processing 2/70: 0021601197
Processing 3/70: 0021601161
Processing 4/70: 0021601150
Processing 5/70: 0021601130
Processing 6/70: 0021601112
Processing 7/70: 0021601080
Processing 8/70: 0021601051
Processing 9/70: 0021601045
Processing 10/70: 0021600976
Processing 11/70: 0021600983
Processing 12/70: 0021600972
Processing 13/70: 0021600963
Processing 14/70: 0021600931
Processing 15/70: 0021600925
Processing 16/70: 0021600917
Processing 17/70: 0021600890
Processing 18/70: 0021600881
Processing 19/70: 0021600859
Processing 20/70: 0021600851
Processing 21/70: 0021600830
Processing 22/70: 0021600797
Processing 23/70: 0021600784
Processing 24/70: 0021600769
Processing 25/70: 0021600764
Processing 26/70: 0021600749
Processing 27/70: 0021600725
Processing 28/70: 0021600720
Processing 29/70: 0021600711
Processing 30/70: 0021600706
Processing 31/70: 0021600696
Pr

In [5]:
all_seasons_df.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER_NAME,season
0,21601205,1610612748,MIA,203186,Willie Reed,2016-17
1,21601205,1610612739,CLE,1627770,Kay Felder,2016-17
2,21601205,1610612748,MIA,1626196,Josh Richardson,2016-17
3,21601205,1610612739,CLE,101114,Deron Williams,2016-17
4,21601205,1610612748,MIA,201609,Goran Dragic,2016-17


In [6]:
import unicodedata

# Function to remove accents from text
def remove_accents(text):
    if pd.isna(text):
        return text
    # Normalize to NFD (decomposed form) and filter out combining characters
    nfd = unicodedata.normalize('NFD', text)
    return ''.join(char for char in nfd if unicodedata.category(char) != 'Mn')

# Clean player names in final_df
all_seasons_df['PLAYER_NAME_CLEAN'] = all_seasons_df['PLAYER_NAME'].apply(remove_accents)

print("Cleaned player names:")
print(all_seasons_df[['PLAYER_NAME', 'PLAYER_NAME_CLEAN']].drop_duplicates().tail(20))

Cleaned player names:
             PLAYER_NAME    PLAYER_NAME_CLEAN
549       Jerryd Bayless       Jerryd Bayless
551   Glenn Robinson III   Glenn Robinson III
570  Juancho Hernangomez  Juancho Hernangomez
572     Danilo Gallinari     Danilo Gallinari
585     Sergio Rodriguez     Sergio Rodriguez
588     Gerald Henderson     Gerald Henderson
612           J.J. Barea           J.J. Barea
616         Andrew Bogut         Andrew Bogut
618  Dorian Finney-Smith  Dorian Finney-Smith
623        E'Twaun Moore        E'Twaun Moore
637       Brandon Knight       Brandon Knight
639       Meyers Leonard       Meyers Leonard
641     Lance Stephenson     Lance Stephenson
644      James Ennis III      James Ennis III
651         Dion Waiters         Dion Waiters
670          Noah Vonleh          Noah Vonleh
674          Will Barton          Will Barton
679       Kenneth Faried       Kenneth Faried
693            Joe Young            Joe Young
696        Dirk Nowitzki        Dirk Nowitzki


In [8]:
all_seasons_df.to_csv('2016-17.csv')

In [None]:
unique_game_ids = all_seasons_df["GAME_ID"].unique()

In [None]:
from nba_api.stats.endpoints import BoxScoreSummaryV2
import pandas as pd
import time

# ---------------------------------------------------------
# Function: Given a GAME_ID, return which team won
# ---------------------------------------------------------
def get_winner(game_id):
    # NBA API request
    summary = BoxScoreSummaryV2(game_id=game_id).get_data_frames()

    # Line score table: contains info for both teams (home & away)
    line_score = summary[5]

    # Extract rows
    home_row = line_score[line_score["TEAM_CITY_NAME"].notna()].iloc[0]
    away_row = line_score[line_score["TEAM_CITY_NAME"].notna()].iloc[1]

    home_team = home_row["TEAM_ABBREVIATION"]
    away_team = away_row["TEAM_ABBREVIATION"]

    home_score = home_row["PTS"]
    away_score = away_row["PTS"]

    # Determine winner
    if home_score > away_score:
        return {
            "GAME_ID": game_id,
            "WINNER_SIDE": "HOME",
            "WINNER_TEAM": home_team,
            "WINNER_TEAM_ID": home_row["TEAM_ID"]
        }
    else:
        return {
            "GAME_ID": game_id,
            "WINNER_SIDE": "AWAY",
            "WINNER_TEAM": away_team,
            "WINNER_TEAM_ID": away_row["TEAM_ID"]
        }