In [2]:
!pip install nba_api geopy

import pandas as pd
import numpy as np
from nba_api.stats.static import teams
from nba_api.stats.endpoints import TeamGameLog, TeamDetails
from geopy.distance import geodesic
import time


Collecting nba_api
  Downloading nba_api-1.11.3-py3-none-any.whl.metadata (5.8 kB)
Downloading nba_api-1.11.3-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.0/319.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api
Successfully installed nba_api-1.11.3


In [3]:
# Static NBA team metadata from nba_api (each entry is indexed by string keys)
nba_teams = teams.get_teams()

# Lookup tables derived from the static team list:
#   team_id_map  : full team name -> team ID
#   abbr_to_full : team abbreviation -> full team name
team_id_map = dict((team["full_name"], team["id"]) for team in nba_teams)
abbr_to_full = dict((team["abbreviation"], team["full_name"]) for team in nba_teams)

# Fix TEAM_NAME abbreviation → full team name
def fix_team_names(df):
    """Expand team abbreviations in ``TEAM_NAME`` to full team names.

    Values found in ``abbr_to_full`` are replaced by the corresponding full
    name. Values without a mapping (for example names that are already full
    team names) are kept unchanged rather than being turned into NaN, so the
    function can safely be applied more than once.

    Args:
        df: Frame with a ``TEAM_NAME`` column. Modified in place.

    Returns:
        The same frame, for chaining.
    """
    names = df["TEAM_NAME"]
    # .map() yields NaN for anything that is not a known abbreviation;
    # fall back to the original value in that case.
    df["TEAM_NAME"] = names.map(abbr_to_full).fillna(names)
    return df

# Hard-coded arena coordinates.
# Keys are full team names; values are (latitude, longitude) tuples that are
# unpacked as `lat, lon` and passed to geopy's geodesic() when computing
# travel distance between consecutive game locations.
arena_coords = {
    "Atlanta Hawks": (33.7573, -84.3963),
    "Boston Celtics": (42.3662, -71.0621),
    "Brooklyn Nets": (40.6826, -73.9754),
    "Charlotte Hornets": (35.2251, -80.8392),
    "Chicago Bulls": (41.8807, -87.6742),
    "Cleveland Cavaliers": (41.4965, -81.6882),
    "Dallas Mavericks": (32.7905, -96.8104),
    "Denver Nuggets": (39.7487, -105.0077),
    "Detroit Pistons": (42.3411, -83.0553),
    "Golden State Warriors": (37.7678, -122.3877),
    "Houston Rockets": (29.7508, -95.3621),
    "Indiana Pacers": (39.7639, -86.1555),
    # Clippers – include BOTH possible spellings of the team name so the
    # lookup succeeds whichever one appears in the data.
    # NOTE(review): both Clippers entries use exactly the same coordinates as
    # the Lakers entry. If the Clippers' home arena differs from the Lakers'
    # in any season covered by the data, this needs a season-specific value —
    # TODO confirm.
    "LA Clippers": (34.0430, -118.2673),
    "Los Angeles Clippers": (34.0430, -118.2673),
    "Los Angeles Lakers": (34.0430, -118.2673),
    "Memphis Grizzlies": (35.1382, -90.0506),
    "Miami Heat": (25.7814, -80.1870),
    "Milwaukee Bucks": (43.0451, -87.9172),
    "Minnesota Timberwolves": (44.9795, -93.2760),
    "New Orleans Pelicans": (29.9490, -90.0815),
    "New York Knicks": (40.7505, -73.9934),
    "Oklahoma City Thunder": (35.4634, -97.5151),
    "Orlando Magic": (28.5392, -81.3839),
    "Philadelphia 76ers": (39.9012, -75.1720),
    "Phoenix Suns": (33.4457, -112.0712),
    "Portland Trail Blazers": (45.5316, -122.6668),
    "Sacramento Kings": (38.5802, -121.4997),
    "San Antonio Spurs": (29.4269, -98.4375),
    "Toronto Raptors": (43.6435, -79.3791),
    "Utah Jazz": (40.7683, -111.9011),
    "Washington Wizards": (38.8981, -77.0209),
}



In [10]:
def get_season_game_data(season: str) -> pd.DataFrame:
    """Download every team's game log for one season and stack them.

    Each team in ``nba_teams`` is requested from ``TeamGameLog`` in turn. A
    team whose request fails is reported and skipped (best effort), so the
    result may cover fewer teams than requested.

    Args:
        season: Season label passed through to ``TeamGameLog``
            (e.g. ``"2023-24"``).

    Returns:
        One frame with all downloaded team logs, with ``TEAM_NAME`` (full
        name), ``TEAM_ID`` and ``SEASON`` columns added, ``GAME_DATE``
        converted to datetime, sorted by ``TEAM_ID`` then ``GAME_DATE``.

    Raises:
        ValueError: If no team's game log could be downloaded at all.
    """
    print(f"Downloading {season} season data...")
    all_rows = []
    failed_teams = []
    for t in nba_teams:
        team_name = t["full_name"]
        team_id = t["id"]
        try:
            logs = TeamGameLog(team_id=team_id, season=season).get_data_frames()[0]
            logs["TEAM_NAME"] = team_name          # full name
            logs["TEAM_ID"] = team_id
            logs["SEASON"] = season
            all_rows.append(logs)
        except Exception as e:
            failed_teams.append(team_name)
            print(f"Failed for {team_name}: {e}")
        finally:
            # Be nice to the API after *every* request, including failed
            # ones, so an error is not followed by an immediate new request.
            time.sleep(0.6)

    if not all_rows:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No game logs could be downloaded for season {season}")
    if failed_teams:
        print(
            f"Warning: {season} data is missing {len(failed_teams)} team(s): "
            + ", ".join(failed_teams)
        )

    df = pd.concat(all_rows, ignore_index=True)
    try:
        # Presumably GAME_DATE looks like "APR 14, 2024"; an explicit format
        # avoids pandas' per-element format-inference fallback and its warning.
        df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"], format="%b %d, %Y")
    except ValueError:
        # Dates are in some other layout: fall back to generic parsing.
        df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])
    df = df.sort_values(["TEAM_ID", "GAME_DATE"])
    return df


In [11]:
def add_opponent_info(df: pd.DataFrame) -> pd.DataFrame:
    """Derive opponent and venue columns from the ``MATCHUP`` string.

    Adds, in place, the columns ``OPP_ABBR``, ``OPPONENT``, ``OPPONENT_ID``
    and ``HOME_AWAY`` and returns the same frame.
    """
    def _venue(matchup):
        # An "@" in the matchup marks a road game; anything else is home.
        if "@" in matchup:
            return "Away"
        return "Home"

    # The abbreviation that follows "vs." or "@" is the opponent
    # ("ATL @ BOS" → "BOS").
    opponent_pattern = r"(?:vs\.|@) ([A-Z]{2,3})"
    df["OPP_ABBR"] = df["MATCHUP"].str.extract(opponent_pattern)

    # Abbreviation → full team name → team ID
    opponent_names = df["OPP_ABBR"].map(abbr_to_full)
    df["OPPONENT"] = opponent_names
    df["OPPONENT_ID"] = opponent_names.map(team_id_map)

    df["HOME_AWAY"] = df["MATCHUP"].map(_venue)
    return df


In [12]:
def add_rest_features(df: pd.DataFrame, first_game_rest: float = 5) -> pd.DataFrame:
    """Add days-of-rest and back-to-back indicators for each team game.

    The previous game is looked up per team and, when a ``SEASON`` column is
    present, per season. Without the season split, the first game of a later
    season would be compared against the last game of the previous season and
    report the entire offseason as "rest" instead of the first-game default.

    Rows must already be ordered chronologically within each team, because
    the previous game is found with a positional ``shift(1)``.

    Args:
        df: Frame with ``TEAM_ID`` and datetime ``GAME_DATE`` columns, and
            optionally ``SEASON``. Modified in place.
        first_game_rest: Value used for ``DAYS_REST`` when a team has no
            previous game in the group (defaults to 5, i.e. "long rest").

    Returns:
        The same frame with ``PREV_GAME_DATE``, ``DAYS_REST`` and
        ``BACK_TO_BACK`` (1 when ``DAYS_REST == 1``, else 0) added.
    """
    group_keys = ["TEAM_ID", "SEASON"] if "SEASON" in df.columns else ["TEAM_ID"]

    df["PREV_GAME_DATE"] = df.groupby(group_keys)["GAME_DATE"].shift(1)
    df["DAYS_REST"] = (df["GAME_DATE"] - df["PREV_GAME_DATE"]).dt.days
    df["DAYS_REST"] = df["DAYS_REST"].fillna(first_game_rest)   # first game = long rest
    df["BACK_TO_BACK"] = (df["DAYS_REST"] == 1).astype(int)
    return df


In [13]:
def add_travel_distance(df: pd.DataFrame) -> pd.DataFrame:
    """Add game-location coordinates and miles travelled since the last game.

    The game location is the team's own arena for home games and the
    opponent's arena for away games, looked up in ``arena_coords``; teams
    missing from that table get NaN coordinates. The previous location is
    taken per team and, when a ``SEASON`` column is present, per season, so a
    season opener is not measured against the previous season's final game.

    Rows must already be ordered chronologically within each team, because
    the previous game is found with a positional ``shift(1)``.

    Args:
        df: Frame with ``TEAM_ID``, ``TEAM_NAME``, ``OPPONENT`` and
            ``HOME_AWAY`` columns, and optionally ``SEASON``. Modified in
            place.

    Returns:
        The same frame with ``ARENA_LAT``, ``ARENA_LON``, ``PREV_LAT``,
        ``PREV_LON`` and ``TRAVEL_DISTANCE`` (geodesic miles; 0.0 when either
        the previous or the current location is unknown) added.
    """
    # Team whose arena hosts the game:
    #   Home  → team arena
    #   Away  → opponent arena
    host_team = df["TEAM_NAME"].where(df["HOME_AWAY"] == "Home", df["OPPONENT"])

    # Vectorized lookup; names absent from arena_coords map to NaN. This also
    # works on an empty frame, unlike a row-wise apply returning pd.Series.
    lat_by_team = {name: lat for name, (lat, _lon) in arena_coords.items()}
    lon_by_team = {name: lon for name, (_lat, lon) in arena_coords.items()}
    df["ARENA_LAT"] = host_team.map(lat_by_team).astype(float)
    df["ARENA_LON"] = host_team.map(lon_by_team).astype(float)

    # Previous game location for the same team (and season, if available)
    group_keys = ["TEAM_ID", "SEASON"] if "SEASON" in df.columns else ["TEAM_ID"]
    previous = df.groupby(group_keys)[["ARENA_LAT", "ARENA_LON"]].shift(1)
    df["PREV_LAT"] = previous["ARENA_LAT"]
    df["PREV_LON"] = previous["ARENA_LON"]

    # Distance between consecutive game locations; 0.0 when there is no
    # previous game or a location is unknown.
    distances = [
        0.0
        if (pd.isna(prev_lat) or pd.isna(lat))
        else geodesic((prev_lat, prev_lon), (lat, lon)).miles
        for prev_lat, prev_lon, lat, lon in zip(
            df["PREV_LAT"], df["PREV_LON"], df["ARENA_LAT"], df["ARENA_LON"]
        )
    ]
    # Positional assignment with an explicit float dtype (stable when empty)
    df["TRAVEL_DISTANCE"] = np.array(distances, dtype=float)
    return df


In [14]:
# Download both seasons (network-bound; throttled inside the helper)
df_2023, df_2024 = (
    get_season_game_data(season) for season in ("2023-24", "2024-25")
)

# Stack the seasons, restore per-team chronological order (the feature helpers
# rely on it), then run the feature-engineering steps in sequence.
df = (
    pd.concat([df_2023, df_2024], ignore_index=True)
    .sort_values(["TEAM_ID", "GAME_DATE"])
    .pipe(add_opponent_info)
    .pipe(add_rest_features)
    .pipe(add_travel_distance)
)

df.head()


Downloading 2023-24 season data...


  df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])


Downloading 2024-25 season data...


  df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,W_PCT,MIN,FGM,...,OPPONENT_ID,HOME_AWAY,PREV_GAME_DATE,DAYS_REST,BACK_TO_BACK,ARENA_LAT,ARENA_LON,PREV_LAT,PREV_LON,TRAVEL_DISTANCE
0,1610612737,22300063,2023-10-25,ATL @ CHA,L,0,1,0.0,240,39,...,1610612766,Away,NaT,5.0,0,35.2251,-80.8392,,,0.0
1,1610612737,22300079,2023-10-27,ATL vs. NYK,L,0,2,0.0,240,42,...,1610612752,Home,2023-10-25,2.0,0,33.7573,-84.3963,35.2251,-80.8392,226.806206
2,1610612737,22300097,2023-10-29,ATL @ MIL,W,1,2,0.333,240,47,...,1610612749,Away,2023-10-27,2.0,0,43.0451,-87.9172,33.7573,-84.3963,668.347914
3,1610612737,22300104,2023-10-30,ATL vs. MIN,W,2,2,0.5,240,48,...,1610612750,Home,2023-10-29,1.0,1,33.7573,-84.3963,43.0451,-87.9172,668.347914
4,1610612737,22300117,2023-11-01,ATL vs. WAS,W,3,2,0.6,240,46,...,1610612764,Home,2023-10-30,2.0,0,33.7573,-84.3963,33.7573,-84.3963,0.0


In [15]:
# Write the combined, feature-enriched game log to CSV (row index omitted),
# then preview the first rows of the frame that was saved.
df.to_csv("NBA_FULL_DATASET_2023_2025.csv", index=False)
df.head()

Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,W_PCT,MIN,FGM,...,OPPONENT_ID,HOME_AWAY,PREV_GAME_DATE,DAYS_REST,BACK_TO_BACK,ARENA_LAT,ARENA_LON,PREV_LAT,PREV_LON,TRAVEL_DISTANCE
0,1610612737,22300063,2023-10-25,ATL @ CHA,L,0,1,0.0,240,39,...,1610612766,Away,NaT,5.0,0,35.2251,-80.8392,,,0.0
1,1610612737,22300079,2023-10-27,ATL vs. NYK,L,0,2,0.0,240,42,...,1610612752,Home,2023-10-25,2.0,0,33.7573,-84.3963,35.2251,-80.8392,226.806206
2,1610612737,22300097,2023-10-29,ATL @ MIL,W,1,2,0.333,240,47,...,1610612749,Away,2023-10-27,2.0,0,43.0451,-87.9172,33.7573,-84.3963,668.347914
3,1610612737,22300104,2023-10-30,ATL vs. MIN,W,2,2,0.5,240,48,...,1610612750,Home,2023-10-29,1.0,1,33.7573,-84.3963,43.0451,-87.9172,668.347914
4,1610612737,22300117,2023-11-01,ATL vs. WAS,W,3,2,0.6,240,46,...,1610612764,Home,2023-10-30,2.0,0,33.7573,-84.3963,33.7573,-84.3963,0.0
