In [1]:
import pandas as pd
import numpy as np
import time
import warnings
import duckdb

# Notebook-wide configuration.
# Never truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Silences every warning for the rest of the kernel session (including deprecation
# warnings from pandas) -- NOTE(review): consider narrowing this filter.
warnings.filterwarnings("ignore")
# In-memory DuckDB connection; a later cell uses it to run SQL against the pandas frames.
con = duckdb.connect(database=":memory:")

In [2]:
def add_new_gms(df, season=None):
    """Carry the ``rtrvd`` flag over from the previously saved schedule file.

    Games are matched between ``df`` and ``../tables/{season}/nba_schedule.csv``
    on the key ``"<Date>_<AwayABV>_<HomeABV>"``. A game gets ``rtrvd = 1`` when
    the saved file has a row with the same key and ``rtrvd == 1``; every other
    game gets ``rtrvd = 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule containing ``Date``, ``AwayABV`` and ``HomeABV`` columns.
        The frame passed in is not modified.
    season : int, optional
        Season folder to read the saved schedule from. When omitted, the
        notebook-level ``year`` variable is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``rtrvd`` column is set as described above
        (any existing ``rtrvd`` values are replaced).
    """
    if season is None:
        season = year  # fall back to the notebook-level season variable

    # Match key for the incoming schedule; kept as a Series so no scratch
    # column is ever written onto the caller's frame.
    game_ids = df.Date.astype(str) + "_" + df.AwayABV + "_" + df.HomeABV

    try:
        df_schd = pd.read_csv(f"../tables/{season}/nba_schedule.csv")
    except FileNotFoundError:
        # No saved schedule yet (first run for this season): nothing has been retrieved.
        retrieved = []
    else:
        # Parse the saved dates so both sides render the key's date part the same way.
        df_schd['Date'] = pd.to_datetime(df_schd.Date)
        saved_ids = df_schd.Date.astype(str) + "_" + df_schd.AwayABV + "_" + df_schd.HomeABV
        retrieved = saved_ids[df_schd.rtrvd == 1].unique().tolist()

    out = df.copy()
    out['rtrvd'] = np.where(game_ids.isin(retrieved), 1, 0)
    return out

In [3]:
# Season settings. `year` is the season's starting year: the pages below are requested
# for `year + 1`, and `last_reg_gm[year]` is the cut-off date this cell uses further down
# to separate regular-season games from postseason games.
last_reg_gm = {2025: "2026-04-12", 2024: "2025-04-13", 2023: "2024-04-14", 2022: "2023-04-09"}
year = 2025

# Download one schedule table per month and stack them into a single frame.
monthly_tables = []
for month in ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']:
    time.sleep(6)  # pause before every request (presumably to stay under the site's rate limit)
    url = f"https://www.basketball-reference.com/leagues/NBA_{year + 1}_games-{month}.html"
    try:
        df_temp = pd.read_html(url)[0]
    except (ValueError, OSError) as err:
        # ValueError: the page has no table; OSError: HTTP/network failure
        # (e.g. the month's page does not exist because there are no playoff games yet).
        # Skip the month instead of falling through, so the previous month's table is
        # never appended a second time and a failing first month cannot raise NameError.
        print(f"Skipping {month}: {type(err).__name__}: {err}")
        continue
    monthly_tables.append(df_temp)

if not monthly_tables:
    raise RuntimeError(f"No schedule tables could be downloaded for the {year} season")

# Single concat instead of growing the frame inside the loop.
df = pd.concat(monthly_tables)
df = df[['Date', 'Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Arena', 'Notes']].rename(
    columns={"Visitor/Neutral": "AwayTeam", "Home/Neutral": "HomeTeam", "Start (ET)": "StartTime_ET"}
)
df['Date'] = pd.to_datetime(df.Date)

# Look up the abbreviation (ABV) of both teams from the team cross-reference file.
# The inner query matches the home team and exposes its ABV as HomeABV; the outer query
# matches the away team and exposes its ABV as AwayABV. Both are plain (inner) JOINs, so
# rows whose team name has no match in df_teams are dropped.
# The SQL refers to the pandas frames `df` and `df_teams` by their Python variable names.
df_teams = pd.read_csv("../src/team_info_xref.csv")
abv_join_sql = f"""SELECT Date, StartTime_ET, AwayTeam, ABV as AwayABV, HomeTeam, HomeABV, Arena, Notes FROM
                    (SELECT * EXCLUDE(ABV), ABV as HomeABV FROM df JOIN df_teams ON df.HomeTeam = df_teams.Team) t1
                    JOIN df_teams ON t1.AwayTeam = df_teams.Team
                    """
df = con.execute(abv_join_sql).fetchdf()
df = df.drop_duplicates()

# Back-to-back (B2B) flags: a team is on a back-to-back when its previous game,
# home or away, was exactly one calendar day earlier.

# One (Date, Team) row per team per game day, regardless of venue.
home = df[['Date', 'HomeABV']].rename(columns={'HomeABV': 'Team'})
away = df[['Date', 'AwayABV']].rename(columns={'AwayABV': 'Team'})
games = pd.concat([home, away], ignore_index=True)
games = games.drop_duplicates()
games['Date'] = pd.to_datetime(games.Date)
games = games.sort_values(['Team', 'Date'])

# Previous game date within each team; a team's first game has none and is flagged 0.
games['prev_date'] = games.groupby('Team')['Date'].shift(1)
days_since_prev = (games['Date'] - games['prev_date']).dt.days
games['is_b2b'] = days_since_prev.eq(1).astype(int)

# Attach the flag to the schedule twice: first for the away side, then for the home side.
for abv_col, b2b_col in (('AwayABV', 'AwayB2B'), ('HomeABV', 'HomeB2B')):
    side_flags = games.rename(columns={'Team': abv_col, 'is_b2b': b2b_col})
    side_flags = side_flags[['Date', abv_col, b2b_col]]
    df = df.merge(side_flags, on=['Date', abv_col], how='left')

# Flag columns derived from the Notes text and the season cut-off date.
cup_labels = ['NBA Cup', 'In-Season Tournament']
is_cup_game = df.Notes.isin(cup_labels)
on_or_before_cutoff = df.Date <= last_reg_gm[year]

df['cup_gm'] = np.where(is_cup_game, 1, 0)
# Anything dated after the season's last regular-season game counts as postseason.
df['pstszn_gm'] = np.where(on_or_before_cutoff, 0, 1)
df['rtrvd'] = 0

# Notes is only needed for the cup flag; drop it, then let add_new_gms set rtrvd
# from the previously saved schedule.
df = add_new_gms(df.drop(columns='Notes'))

# Overwrite the saved schedule (the same file add_new_gms reads) and show the result.
schedule_csv = f"../tables/{year}/nba_schedule.csv"
df.to_csv(schedule_csv, index=False)
display(df)

Unnamed: 0,Date,StartTime_ET,AwayTeam,AwayABV,HomeTeam,HomeABV,Arena,AwayB2B,HomeB2B,cup_gm,pstszn_gm,rtrvd
0,2025-10-21,7:30p,Houston Rockets,HOU,Oklahoma City Thunder,OKC,Paycom Center,0,0,0,0,1
1,2025-10-21,10:00p,Golden State Warriors,GSW,Los Angeles Lakers,LAL,Crypto.com Arena,0,0,0,0,1
2,2025-10-22,7:00p,Brooklyn Nets,BRK,Charlotte Hornets,CHO,Spectrum Center,0,0,0,0,1
3,2025-10-22,7:00p,Cleveland Cavaliers,CLE,New York Knicks,NYK,Madison Square Garden (IV),0,0,0,0,1
4,2025-10-22,7:00p,Miami Heat,MIA,Orlando Magic,ORL,Kia Center,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2026-04-12,8:30p,Utah Jazz,UTA,Los Angeles Lakers,LAL,Crypto.com Arena,0,0,0,0,0
1226,2026-04-12,8:30p,New Orleans Pelicans,NOP,Minnesota Timberwolves,MIN,Target Center,0,0,0,0,0
1227,2026-04-12,8:30p,Phoenix Suns,PHO,Oklahoma City Thunder,OKC,Paycom Center,0,0,0,0,0
1228,2026-04-12,8:30p,Sacramento Kings,SAC,Portland Trail Blazers,POR,Moda Center,0,0,0,0,0
