In [1]:
import pandas as pd
import time
import random
import requests
import uuid

# ---- Scrape configuration ----
start_year = 2023
end_year = 2024  # exclusive: range(start_year, end_year) stops at end_year - 1
blacklist = [(2020, 'april'), (2021, 'october'), (2021, 'november')]  # (year, month) pages to skip
months = ['january', 'february', 'march', 'april', 'may', 'june', 'october', 'november', 'december']

max_attempts = 5      # tries per page before giving up on that month
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the cell

# Empty seed frame; its columns come first in the combined result.
df = pd.DataFrame(columns=['Date', 'season_start_year', 'Visitor/Neutral', 'PTS', 'Home/Neutral', 'PTS.1', 'Unnamed: 5', 'Unnamed: 6', 'Attend.', 'Notes'])

# Collect one table per page and concatenate once at the end instead of
# re-copying the growing frame on every iteration.
monthly_tables = []

for year in range(start_year, end_year):
    for month in months:
        if (year, month) in blacklist:
            continue
        url = 'https://www.basketball-reference.com/leagues/NBA_{}_games-{}.html'.format(year, month)
        for attempt in range(max_attempts):
            try:
                response = requests.get(url, timeout=request_timeout)
            except requests.RequestException as e:
                # Timeouts / connection errors are treated like any other transient failure.
                print(f'Request failed for year, month: ({year}, {month}): {e}. Retrying in 5 seconds...')
                time.sleep(5)
                time.sleep(random.uniform(5, 15))
                continue

            if response.status_code == 429:
                # Retry-After may be missing or an HTTP date rather than a number of
                # seconds; fall back to 10 seconds in either case.
                try:
                    retry_after = int(response.headers.get('Retry-After', 10))
                except (TypeError, ValueError):
                    retry_after = 10
                print(f'Rate limited. Retrying in {retry_after} seconds...')
                time.sleep(retry_after)
            elif response.status_code == 200:
                try:
                    yearMonth_table = pd.read_html(response.content)[0]
                    # The URL year is the year the season ends in, so the season starts one year earlier.
                    yearMonth_table['season_start_year'] = year - 1
                    monthly_tables.append(yearMonth_table)
                    print(f'Processed games for year, month: ({year}, {month})')
                    break  # exit retry loop on success
                except Exception as e:
                    print(f'Failed to parse the page: {e}')
                    if str(e) == "No tables found":
                        # Bail early if there isn't a table at this endpoint
                        break
                    time.sleep(10)
            elif response.status_code == 404:
                print(f"404 encountered for year, month: ({year}, {month}). Skipping.")
                break
            else:
                print(f'Failed with status {response.status_code} for year, month: ({year}, {month}). Retrying in 5 seconds...')
                time.sleep(5)
            # Random delay before the next attempt to avoid being blocked
            time.sleep(random.uniform(5, 15))
        else:
            # The retry loop ran out of attempts without a break.
            print(f'Giving up on year, month: ({year}, {month}) after {max_attempts} attempts.')

        # Random delay between pages as well: the in-loop delay above is skipped
        # whenever the retry loop exits via break (including on success).
        time.sleep(random.uniform(5, 15))

df = pd.concat([df, *monthly_tables])


Processed games for year, month: (2023, january)
Processed games for year, month: (2023, february)
Processed games for year, month: (2023, march)
Processed games for year, month: (2023, april)
Processed games for year, month: (2023, may)
Processed games for year, month: (2023, june)
Processed games for year, month: (2023, october)
Processed games for year, month: (2023, november)
Processed games for year, month: (2023, december)


In [2]:
# Keep an independent snapshot of the scraped frame in case of emergencies,
# then display the non-null count of every column.
secret_df = df.copy(deep=True)
df.count()

Date               1320
seasonStartYear    1320
Visitor/Neutral    1320
PTS                1320
Home/Neutral       1320
PTS.1              1320
Unnamed: 5            0
Unnamed: 6         1320
Attend.            1318
Notes                 6
Start (ET)         1320
Unnamed: 7           84
Arena              1320
dtype: int64

In [3]:
"""
The covid season has two October pages (october-2019 and october-2020), which the
scraping loop above does not find. They are fetched here from explicit URLs and
appended with season_start_year set to 2019.
"""
url_lst = [
    'https://www.basketball-reference.com/leagues/NBA_2020_games-october-2019.html',
    'https://www.basketball-reference.com/leagues/NBA_2020_games-october-2020.html',
]
for page_url in url_lst:
    # First table on the page, tagged with the season it belongs to.
    october_games = pd.read_html(page_url)[0].assign(season_start_year=2019)
    df = pd.concat([df, october_games])

In [4]:
# Fix date column
# Snapshot the frame before it is filtered so the unfiltered rows stay available.
df_backup = df.copy()
# Drop rows whose Date cell is the literal string 'Playoffs' (not a parseable date),
# working on a copy so df_backup is left untouched.
df = df_backup[df_backup['Date'] != 'Playoffs'].copy()
# infer_datetime_format is deprecated (it triggers a warning on current pandas),
# so rely on to_datetime's default parsing instead.
df['datetime'] = pd.to_datetime(df['Date'])

  df['datetime'] = pd.to_datetime(df['Date'], infer_datetime_format=True)


In [5]:
"""
Creates the is_regular flag: 1 for games dated before the season's playoff start
date, 0 otherwise.
"""
# Playoff start date for each season, keyed by the year the season started in.
playOffStart = {
    1996: '1997-04-24', 1997: '1998-04-23', 1998: '1999-05-08',
    1999: '2000-04-22', 2000: '2001-04-21', 2001: '2002-04-20',
    2002: '2003-04-19', 2003: '2004-04-17', 2004: '2005-04-23',
    2005: '2006-04-22', 2006: '2007-04-21', 2007: '2008-04-19',
    2008: '2009-04-18', 2009: '2010-04-17', 2010: '2011-04-16',
    2011: '2012-04-28', 2012: '2013-04-20', 2013: '2014-04-19',
    2014: '2015-04-18', 2015: '2016-04-16', 2016: '2017-04-15',
    2017: '2018-04-14', 2018: '2019-04-13', 2019: '2020-08-15',
    2020: '2021-05-18', 2021: '2022-04-16', 2022: '2023-04-15',
}

# Look up each row's playoff start (a season missing from the table raises KeyError)
# and parse it so the comparison below is datetime against datetime.
playoff_start = pd.to_datetime(df['season_start_year'].apply(playOffStart.__getitem__))
df['is_regular'] = (df['datetime'] < playoff_start).astype('int64')

In [6]:
# Remove exhibition games in 2018-19
# A row is kept when it is dated after 2018-10-01 or belongs to a season other
# than 2018; i.e. only 2018-season rows on or before that date are dropped.
after_cutoff = df['datetime'] > '2018-10-01'
other_season = df['season_start_year'] != 2018
df = df[after_cutoff | other_season].copy()

In [7]:
"""The 2012 season has one fewer game due to the bombing at the Boston Marathon.
Otherwise all the seasons have the correct number of games."""

# Number of regular-season rows per season.
regular_season_years = df.loc[df['is_regular'] == 1, 'season_start_year']
regular_season_years.value_counts()

seasonStartYear
2022    1236
2019      68
Name: count, dtype: int64

In [8]:
# Drop unnecessary columns: the three unnamed columns and the raw Date strings.
unneeded_columns = ['Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Date']
df = df.drop(columns=unneeded_columns)

In [9]:
# Create game_id: order the games chronologically, then give every row a fresh
# random UUID4 string (the ids differ on every run).
df = df.sort_values('datetime')
df['game_id'] = [str(uuid.uuid4()) for _ in df.index]

In [10]:
# Set column names
# Rename by source name rather than by position, so a change in column order
# cannot silently attach the wrong label. errors='raise' makes a missing source
# column fail loudly instead of being skipped.
column_names = {
    'Visitor/Neutral': 'away_team',
    'PTS': 'points_away',
    'Home/Neutral': 'home_team',
    'PTS.1': 'points_home',
    'Attend.': 'attendance',
    'Notes': 'notes',
    'Start (ET)': 'start_et',
    'Arena': 'arena',
}
df = df.rename(columns=column_names, errors='raise')

# Final schema in output order; 'arena' is intentionally left out.
output_columns = ['season_start_year', 'away_team', 'points_away', 'home_team', 'points_home',
                  'attendance', 'notes', 'start_et', 'datetime', 'is_regular', 'game_id']
df = df[output_columns]

# Keep only seasons that started after 2018.
df = df[df['season_start_year'] > 2018]

In [11]:
# Write the cleaned games table to a CSV file under the new name, without the index column.
output_path = 'games-2023.csv'
df.to_csv(output_path, index=False)