In [92]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
from datetime import datetime

In [93]:
# Folder holding the saved monthly schedule pages (.html files) that this notebook parses
STANDINGS_DIR = "data/standings"

In [94]:
def get_current_nba_season(now=None):
    """Return the NBA season label for a given moment, as a string.

    A season is labelled by the calendar year in which it ends, so the
    2024-25 season is "2025" (this is how the data files are named).

    Args:
        now: Optional datetime to evaluate. Defaults to the current local
            time, which keeps the original zero-argument behaviour.

    Returns:
        str: The year in which the season ends, e.g. "2025".
    """
    if now is None:
        now = datetime.now()
    # From October onward the new season has started, and it ends next year;
    # before October we are still in the season that ends this year.
    season_end_year = now.year + 1 if now.month >= 10 else now.year
    return str(season_end_year)

In [95]:
# Season label used below to pick out this season's schedule files
season_year = get_current_nba_season()
season_year

'2025'

In [96]:
# Full paths of this season's schedule pages. Sorted because os.listdir returns
# entries in arbitrary order, and a fixed order keeps re-runs reproducible.
standings = sorted(
    os.path.join(STANDINGS_DIR, f)
    for f in os.listdir(STANDINGS_DIR)
    if season_year in f and f.endswith(".html")
)
if len(standings) == 0:
    # season_year is the label the files were filtered by, so report that one
    print(f"You have no files for the {season_year} season, may have to scrape from get_data")
standings

['data/standings/NBA_2025_games-april.html',
 'data/standings/NBA_2025_games-december.html',
 'data/standings/NBA_2025_games-february.html',
 'data/standings/NBA_2025_games-january.html',
 'data/standings/NBA_2025_games-march.html',
 'data/standings/NBA_2025_games-november.html',
 'data/standings/NBA_2025_games-october.html']

In [97]:
def parse_month(standing, drop_past = True):
    """Parse one saved monthly schedule page into a DataFrame of games.

    Args:
        standing: Path to a schedule .html file.
        drop_past: If True (default), keep only rows whose box-score cell is
            empty, i.e. games that have not been played yet.

    Returns:
        pandas.DataFrame: The schedule table's columns up to and including
        "Box Score".

    Raises:
        ValueError: If the page contains no table with id "schedule".
    """
    with open(standing, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, so results can vary by machine.
    soup = BeautifulSoup(html, "html.parser")
    tbl = soup.find("table", {"id": "schedule"})
    if tbl is None:
        raise ValueError(f'No table with id "schedule" found in {standing}')
    # Rows with class "thead" are strictly repeated headers; remove them so
    # they do not show up as data rows.
    for header_row in tbl.select("tr.thead"):
        header_row.decompose()

    # read_html returns a list of DataFrames; the single table is the first one
    df = pd.read_html(StringIO(str(tbl)))[0]
    # Name the box-score column: only finished games have a value in it, which
    # is how future games are recognised.
    df = df.rename(columns={'Unnamed: 6': 'Box Score'})
    # Drop every column to the right of the box-score column
    col_index = df.columns.get_loc('Box Score')
    df = df.iloc[:, :col_index + 1]
    # Future games only: keep the rows whose box score is NaN
    if drop_past:
        df = df[df['Box Score'].isna()]
    return df

In [98]:
# Parse each monthly page of the current season, then stack them into one table
all_standings = [parse_month(standing) for standing in standings]
future_games = pd.concat(all_standings)
# Dates arrive in "%a, %b %d, %Y" form; store them as YYYY-MM-DD strings,
# which read cleanly and also sort chronologically
future_games['Date'] = pd.to_datetime(
    future_games['Date'], format='%a, %b %d, %Y'
).dt.strftime('%Y-%m-%d')
# Order by date and renumber the rows from zero
future_games = future_games.sort_values(by='Date').reset_index(drop=True)

In [99]:
# Columns that are null in every row (PTS, PTS.1, Box Score) hold nothing for unplayed games, so remove them
future_games = future_games.dropna(axis="columns", how="all")

In [100]:
# Preview the combined, date-sorted table of unplayed games
future_games

Unnamed: 0,Date,Start (ET),Visitor/Neutral,Home/Neutral
0,2025-01-29,8:00p,Dallas Mavericks,New Orleans Pelicans
1,2025-01-29,7:00p,Brooklyn Nets,Charlotte Hornets
2,2025-01-29,7:00p,Detroit Pistons,Indiana Pacers
3,2025-01-29,7:00p,Toronto Raptors,Washington Wizards
4,2025-01-29,7:30p,Chicago Bulls,Boston Celtics
...,...,...,...,...
537,2025-04-13,1:00p,New York Knicks,Brooklyn Nets
538,2025-04-13,1:00p,Charlotte Hornets,Boston Celtics
539,2025-04-13,1:00p,Orlando Magic,Atlanta Hawks
540,2025-04-13,3:30p,Los Angeles Clippers,Golden State Warriors


In [101]:
# When we use this data to fill in our nba_games dataframe, we will need each team's abbreviation.
# team_abbrev.abbrev_dict is indexed by full team name (presumably a project-local module; verify its location).
import team_abbrev 
team_abbrev.abbrev_dict["Atlanta Hawks"]

'ATL'

In [102]:
def add_abbrev(game, home):
    """Look up the abbreviation for one side of a game.

    Args:
        game: A schedule row holding team names under "Home/Neutral" and
            "Visitor/Neutral".
        home: Truthy to use the home team, falsy to use the visiting team.

    Returns:
        The entry of team_abbrev.abbrev_dict for the chosen team name.
    """
    team_column = "Home/Neutral" if home else "Visitor/Neutral"
    return team_abbrev.abbrev_dict[game[team_column]]

# axis=1 hands add_abbrev one row (one game) at a time
future_games["home_abbrev"] = future_games.apply(lambda game: add_abbrev(game, home=True), axis=1)
future_games["away_abbrev"] = future_games.apply(lambda game: add_abbrev(game, home=False), axis=1)

In [103]:
# Each game now has home_abbrev and away_abbrev columns next to the full team names
future_games

Unnamed: 0,Date,Start (ET),Visitor/Neutral,Home/Neutral,home_abbrev,away_abbrev
0,2025-01-29,8:00p,Dallas Mavericks,New Orleans Pelicans,NOP,DAL
1,2025-01-29,7:00p,Brooklyn Nets,Charlotte Hornets,CHO,BRK
2,2025-01-29,7:00p,Detroit Pistons,Indiana Pacers,IND,DET
3,2025-01-29,7:00p,Toronto Raptors,Washington Wizards,WAS,TOR
4,2025-01-29,7:30p,Chicago Bulls,Boston Celtics,BOS,CHI
...,...,...,...,...,...,...
537,2025-04-13,1:00p,New York Knicks,Brooklyn Nets,BRK,NYK
538,2025-04-13,1:00p,Charlotte Hornets,Boston Celtics,BOS,CHO
539,2025-04-13,1:00p,Orlando Magic,Atlanta Hawks,ATL,ORL
540,2025-04-13,3:30p,Los Angeles Clippers,Golden State Warriors,GSW,LAC


In [106]:
# If you want to save only the games on and after the current date, use this then export
# This way, you don't have to re-scrape standings files to check which games are new (just do once during season to get schedule)
def remove_old_games(df, curr_date):
    """Return only the games scheduled on or after a cutoff day.

    Args:
        df: DataFrame with a "Date" column of "YYYY-MM-DD" strings.
        curr_date: Cutoff as a datetime (its time of day is ignored) or a
            plain date.

    Returns:
        pandas.DataFrame: A new frame with the rows dated on or after the
        cutoff, keeping their original index labels. `df` is not modified.
    """
    # Compare whole days only; otherwise a game dated today (midnight) would
    # count as older than "now" and be removed.
    cutoff_day = curr_date.date() if isinstance(curr_date, datetime) else curr_date
    game_days = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    # Select by boolean mask rather than dropping index labels: dropping by
    # label would also remove current rows that share a label with an old one.
    return df[game_days >= pd.Timestamp(cutoff_day)].copy()

def parse_date(date_str):
    """Convert a "YYYY-MM-DD" string into a datetime at midnight of that day."""
    day_format = "%Y-%m-%d"
    parsed = datetime.strptime(date_str, day_format)
    return parsed

# Use today as the cutoff, keep the games from today onward, and show the result
curr_date = datetime.today()
future_games = remove_old_games(df=future_games, curr_date=curr_date)
future_games

Unnamed: 0,Date,Start (ET),Visitor/Neutral,Home/Neutral,home_abbrev,away_abbrev
23,2025-02-01,8:30p,Los Angeles Lakers,New York Knicks,NYK,LAL
24,2025-02-01,10:00p,Phoenix Suns,Portland Trail Blazers,POR,PHO
25,2025-02-01,8:30p,Miami Heat,San Antonio Spurs,SAS,MIA
26,2025-02-01,8:00p,Sacramento Kings,Oklahoma City Thunder,OKC,SAC
27,2025-02-01,5:00p,Atlanta Hawks,Indiana Pacers,IND,ATL
...,...,...,...,...,...,...
537,2025-04-13,1:00p,New York Knicks,Brooklyn Nets,BRK,NYK
538,2025-04-13,1:00p,Charlotte Hornets,Boston Celtics,BOS,CHO
539,2025-04-13,1:00p,Orlando Magic,Atlanta Hawks,ATL,ORL
540,2025-04-13,3:30p,Los Angeles Clippers,Golden State Warriors,GSW,LAC


In [107]:
# Write the remaining games to CSV; index=False leaves the row index out of the file
future_games.to_csv('future_test2.csv', index=False)
# Disabled alternative: a per-season file under data/future_games (note it has no index=False)
#future_games.to_csv(f'data/future_games/future_games_{season_year}.csv')