In [196]:
import asyncio
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

# Season identifiers substituted into the NBA_{season}.html URLs below
# (2013 is the page titled "2012-13 NBA Season Summary").
SEASONS = [*range(2013, 2024)]


In [197]:
async def get_html(url, selector, sleep=5, retries=3):
    """Return the inner HTML of the element matching `selector` on `url`.

    The page is loaded in a freshly launched Chromium instance. Before every
    attempt the coroutine waits `sleep * attempt` seconds, so the delay grows
    linearly with each retry.

    Args:
        url: Address of the page to open.
        selector: Selector passed to `page.inner_html`.
        sleep: Base delay in seconds; attempt `k` waits `sleep * k`.
        retries: Maximum number of attempts.

    Returns:
        The inner HTML string, or None if every attempt raised a Playwright
        timeout.
    """
    html = None
    for attempt in range(1, retries + 1):
        # asyncio.sleep yields to the event loop; time.sleep would block it
        # for the whole delay.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Close the browser on success and on failure alike so a
                    # timed-out attempt does not leave it open.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

## Download all TOTALS_STATS

In [200]:
TYPE = "TOTALS_STATS"

TOTALS_STATS_COLUMNS_PRE = [
    'rank', 'team', 'games', 'minutes_played', 'field_goals', 'field_goals_attempted', 'field_goal_pct',
    'three_point_field_goals', 'three_point_attempts', 'three_point_pct', 'two_point_field_goals',
    'two_point_attempts', 'two_point_pct', 'free_throws', 'free_throw_attempts', 'free_throw_pct',
    'offensive_rebounds', 'defensive_rebounds', 'total_rebounds', 'assists', 'steals', 'blocks',
    'turnovers', 'personal_fouls', 'points'
]

TOTALS_STATS_COLUMNS = [f"totals_{field}" for field in TOTALS_STATS_COLUMNS_PRE]

final_df = pd.DataFrame(columns=TOTALS_STATS_COLUMNS)

### Logic loop
for season in SEASONS:
    time.sleep(10)
    
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"
    html = await get_html(url, "#totals-team")
    soup = BeautifulSoup(html)
    all_rows = soup.find_all('tr')
    df = pd.DataFrame(columns=TOTALS_STATS_COLUMNS)
    
    for i in range(1, len(all_rows)):
        raw_row_data = all_rows[i]
        row_data = [cell.get_text(strip=True) for cell in raw_row_data.find_all(['th', 'td'])]
        if not (('Rk' in row_data) and ('Team' in row_data)):
            df.loc[len(df)] = row_data
    df.to_csv(f"data/{TYPE}/totals_stats_{season}.csv")
    
    final_df = pd.concat([final_df, df], ignore_index=True)

final_df.to_csv(f"data/{TYPE}/total_stats_final.csv")
final_df

2012-13 NBA Season Summary | Basketball-Reference.com
2013-14 NBA Season Summary | Basketball-Reference.com
2014-15 NBA Season Summary | Basketball-Reference.com
2015-16 NBA Season Summary | Basketball-Reference.com
2016-17 NBA Season Summary | Basketball-Reference.com
2017-18 NBA Season Summary | Basketball-Reference.com
2018-19 NBA Season Summary | Basketball-Reference.com
2019-20 NBA Season Summary | Basketball-Reference.com
2020-21 NBA Season Summary | Basketball-Reference.com
2021-22 NBA Season Summary | Basketball-Reference.com
2022-23 NBA Season Summary | Basketball-Reference.com


Unnamed: 0,totals_rank,totals_team,totals_games,totals_minutes_played,totals_field_goals,totals_field_goals_attempted,totals_field_goal_pct,totals_three_point_field_goals,totals_three_point_attempts,totals_three_point_pct,...,totals_free_throw_pct,totals_offensive_rebounds,totals_defensive_rebounds,totals_total_rebounds,totals_assists,totals_steals,totals_blocks,totals_turnovers,totals_personal_fouls,totals_points
0,1,Denver Nuggets*,82,19905,3339,6983,.478,521,1518,.343,...,.701,1092,2601,3693,2002,762,533,1253,1682,8704
1,2,Houston Rockets*,82,19780,3124,6782,.461,867,2369,.366,...,.754,909,2652,3561,1902,679,359,1348,1662,8688
2,3,Oklahoma City Thunder*,82,19830,3126,6504,.481,598,1588,.377,...,.828,854,2725,3579,1753,679,624,1253,1654,8669
3,4,San Antonio Spurs*,82,19880,3210,6675,.481,663,1764,.376,...,.791,666,2721,3387,2058,695,446,1206,1427,8448
4,5,Miami Heat*,82,19880,3148,6348,.496,717,1809,.396,...,.754,676,2490,3166,1890,710,441,1143,1533,8436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,27,Charlotte Hornets,82,19830,3385,7413,.457,881,2669,.330,...,.749,901,2751,3652,2062,634,425,1164,1661,9098
337,28,Houston Rockets,82,19755,3329,7287,.457,856,2619,.327,...,.754,1100,2695,3795,1835,600,374,1332,1679,9081
338,29,Detroit Pistons,82,19805,3244,7140,.454,934,2659,.351,...,.771,916,2564,3480,1884,574,308,1237,1813,9045
339,30,Miami Heat*,82,19805,3215,6991,.460,980,2852,.344,...,.831,796,2533,3329,1955,655,243,1106,1516,8977


## Download all ADVANCED_STATS

In [201]:
TYPE = "ADVANCED_STATS"

ADVANCED_STATS_COLUMNS_PRE = [
    'rank', 'team', 'age', 'wins', 'losses', 'pythagorean_wins', 'pythagorean_losses', 'margin_of_victory',
    'strength_of_schedule', 'simple_rating_system', 'offensive_rating', 'defensive_rating', 'net_rating',
    'pace', 'free_throw_rate', 'three_point_attempt_rate', 'true_shooting_pct', '',

    # Offensive stats
    'offensive_effective_fg_pct', 'offensive_turnover_pct', 'offensive_offensive_rebound_pct', 'offensive_ft_per_fga', '',

    # Defensive stats
    'defensive_effective_fg_pct', 'defensive_turnover_pct', 'defensive_defensive_rebound_pct', 'defensive_ft_per_fga', '',

    # Arena and attendance stats
    'arena', 'attendance', 'attendance_per_game'
]

ADVANCED_STATS_COLUMNS = [f"advanced_{field}" for field in ADVANCED_STATS_COLUMNS_PRE]

final_df = pd.DataFrame(columns=ADVANCED_STATS_COLUMNS)

### Logic loop
for season in SEASONS:
    time.sleep(10)
    
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"
    html = await get_html(url, "#advanced-team")
    soup = BeautifulSoup(html)
    all_rows = soup.find_all('tr')
    df = pd.DataFrame(columns=ADVANCED_STATS_COLUMNS)
    
    for i in range(2, len(all_rows)):
        raw_row_data = all_rows[i]
        row_data = [cell.get_text(strip=True) for cell in raw_row_data.find_all(['th', 'td'])]
        if (('Offense Four Factors' not in row_data) and ('Defense Four Factors') not in row_data):
            df.loc[len(df)] = row_data
    df.to_csv(f"data/{TYPE}/advanced_stats_{season}.csv")
    final_df = pd.concat([final_df, df], ignore_index=True)
    

final_df.to_csv(f"data/{TYPE}/advanced_stats_final.csv")
final_df

2012-13 NBA Season Summary | Basketball-Reference.com
2013-14 NBA Season Summary | Basketball-Reference.com
2014-15 NBA Season Summary | Basketball-Reference.com
2015-16 NBA Season Summary | Basketball-Reference.com
2016-17 NBA Season Summary | Basketball-Reference.com
2017-18 NBA Season Summary | Basketball-Reference.com
2018-19 NBA Season Summary | Basketball-Reference.com
2019-20 NBA Season Summary | Basketball-Reference.com
2020-21 NBA Season Summary | Basketball-Reference.com
2021-22 NBA Season Summary | Basketball-Reference.com
2022-23 NBA Season Summary | Basketball-Reference.com


Unnamed: 0,advanced_rank,advanced_team,advanced_age,advanced_wins,advanced_losses,advanced_pythagorean_wins,advanced_pythagorean_losses,advanced_margin_of_victory,advanced_strength_of_schedule,advanced_simple_rating_system,...,advanced_offensive_ft_per_fga,advanced_,advanced_defensive_effective_fg_pct,advanced_defensive_turnover_pct,advanced_defensive_defensive_rebound_pct,advanced_defensive_ft_per_fga,advanced_.1,advanced_arena,advanced_attendance,advanced_attendance_per_game
0,1,Oklahoma City Thunder*,26.0,60,22,64,18,9.21,-0.06,9.15,...,.280,,.469,13.5,73.4,.197,,Chesapeake Energy Arena,746323,18203
1,2,Miami Heat*,30.3,66,16,62,20,7.87,-0.84,7.03,...,.224,,.487,14.8,73.0,.200,,AmericanAirlines Arena,819290,19983
2,3,Los Angeles Clippers*,28.8,56,26,59,23,6.45,-0.02,6.43,...,.203,,.492,15.4,73.5,.229,,STAPLES Center,788293,19227
3,4,San Antonio Spurs*,28.6,58,24,58,24,6.40,0.27,6.67,...,.204,,.480,13.7,74.9,.179,,AT&T Center,755700,18432
4,5,Denver Nuggets*,26.1,57,25,55,27,5.09,0.28,5.37,...,.216,,.493,14.3,71.8,.193,,Pepsi Center,730616,17820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,27,Charlotte Hornets,25.3,27,55,26,56,-6.24,0.35,-5.89,...,.195,,.544,12.5,75.5,.211,,Spectrum Center,702052,17123
347,28,Houston Rockets,22.1,22,60,23,59,-7.85,0.24,-7.62,...,.215,,.564,11.8,75.8,.218,,Toyota Center,668865,16314
348,29,Detroit Pistons,24.1,17,65,22,60,-8.22,0.49,-7.73,...,.227,,.557,11.9,74.0,.231,,Little Caesars Arena,759715,18596
349,30,San Antonio Spurs,23.9,22,60,19,63,-10.04,0.22,-9.82,...,.170,,.576,12.0,74.9,.201,,AT&T Center,694434,15508


## Download all SHOOTING_STATS

In [202]:
TYPE = "SHOOTING_STATS"
SHOOTING_STATS_COLUMNS_PRE = [
    'rank', 'team', 'games', 'minutes_played', 'field_goal_pct', 'average_shot_distance', '',

    # % of FGA by Distance
    'two_point_attempt_pct', 'fga_0_3_pct', 'fga_3_10_pct', 'fga_10_16_pct', 'fga_16_to_3pt_pct', 'three_point_attempt_pct', '',

    # FG% by Distance
    'two_point_fg_pct', 'fg_0_3_pct', 'fg_3_10_pct', 'fg_10_16_pct', 'fg_16_to_3pt_pct', 'three_point_fg_pct', '',

    # % of FG Ast'd
    'two_point_fg_assisted_pct', 'three_point_fg_assisted_pct', '',

    # Dunks stats
    'dunks_fga_pct', 'dunks_made', '',

    # Layups stats
    'layups_fga_pct', 'layups_made', '',

    # Corner 3-point attempts and accuracy
    'corner_three_pct_fga', 'corner_three_fg_pct', '',

    # Heaves stats
    'heaves_attempted', 'heaves_made'
]

SHOOTING_STATS_COLUMNS = [f"shooting_{field}" for field in SHOOTING_STATS_COLUMNS_PRE]

final_df = pd.DataFrame(columns=SHOOTING_STATS_COLUMNS)

### Logic loop
for season in SEASONS:
    time.sleep(10)
    
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"
    html = await get_html(url, "#shooting-team")
    soup = BeautifulSoup(html)
    all_rows = soup.find_all('tr')
    df = pd.DataFrame(columns=SHOOTING_STATS_COLUMNS)
    
    for i in range(1, len(all_rows)):
        raw_row_data = all_rows[i]
        row_data = [cell.get_text(strip=True) for cell in raw_row_data.find_all(['th', 'td'])]
        if not ('% of FGA by Distance' in row_data):
            df.loc[len(df)] = row_data
    df.to_csv(f"data/{TYPE}/shooting_stats_{season}.csv")
    
    final_df = pd.concat([final_df, df], ignore_index=True)

final_df.to_csv(f"data/{TYPE}/shooting_stats_final.csv")
final_df

2012-13 NBA Season Summary | Basketball-Reference.com
2013-14 NBA Season Summary | Basketball-Reference.com
2014-15 NBA Season Summary | Basketball-Reference.com
2015-16 NBA Season Summary | Basketball-Reference.com
2016-17 NBA Season Summary | Basketball-Reference.com
2017-18 NBA Season Summary | Basketball-Reference.com
2018-19 NBA Season Summary | Basketball-Reference.com
2019-20 NBA Season Summary | Basketball-Reference.com
2020-21 NBA Season Summary | Basketball-Reference.com
2021-22 NBA Season Summary | Basketball-Reference.com
2022-23 NBA Season Summary | Basketball-Reference.com


Unnamed: 0,shooting_rank,shooting_team,shooting_games,shooting_minutes_played,shooting_field_goal_pct,shooting_average_shot_distance,shooting_,shooting_two_point_attempt_pct,shooting_fga_0_3_pct,shooting_fga_3_10_pct,...,shooting_dunks_made,shooting_.1,shooting_layups_fga_pct,shooting_layups_made,shooting_.2,shooting_corner_three_pct_fga,shooting_corner_three_fg_pct,shooting_.3,shooting_heaves_attempted,shooting_heaves_made
0,Rk,Team,G,MP,FG%,Dist.,,2P,0-3,3-10,...,Md.,,%FGA,Md.,,%3PA,3P%,,Att.,Md.
1,1,Atlanta Hawks*,82,19855,.464,13.4,,.714,.281,.152,...,323,,.222,924,,.264,.426,,12,0
2,2,Boston Celtics*,81,19840,.465,12.9,,.785,.287,.130,...,258,,.237,880,,.339,.399,,22,0
3,3,Brooklyn Nets*,82,19855,.450,12.6,,.731,.272,.203,...,249,,.231,873,,.349,.395,,12,0
4,4,Charlotte Bobcats,82,19805,.425,12.2,,.790,.319,.147,...,298,,.260,904,,.208,.402,,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,27,San Antonio Spurs,82,19855,.465,13.1,,.652,.258,.230,...,265,,.303,1339,,.216,.367,,18,0
359,28,Toronto Raptors,82,19805,.459,13.4,,.649,.243,.220,...,432,,.257,1082,,.264,.363,,22,0
360,29,Utah Jazz,82,19805,.473,14.2,,.579,.204,.266,...,460,,.274,1144,,.220,.413,,16,0
361,30,Washington Wizards,82,19755,.485,14.0,,.635,.211,.244,...,377,,.272,1118,,.215,.399,,16,2


## Download all PLAYER_STATS [NOT USED]

In [123]:
TYPE = "PLAYER_STATS"
PLAYER_STATS_COLUMNS = ['rank', 'player', 'age', 'team', 'position', 'games', 'games_started', 'minutes_played', 'field_goals',
 'field_goals_attempted', 'field_goal_percentage', '3_point_field_goals', '3_point_attempts',
 '3_point_percentage', '2_point_field_goals', '2_point_attempts', '2_point_percentage', 'effective_fg_pct',
 'free_throws', 'free_throw_attempts', 'free_throw_percentage', 'offensive_rebounds', 'defensive_rebounds',
 'total_rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points', 'awards']

final_df = pd.DataFrame(columns=PLAYER_STATS_COLUMNS)

### Logic loop
for season in SEASONS:
    time.sleep(10)
    
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_totals.html"
    html = await get_html(url, "#totals_stats")
    soup = BeautifulSoup(html)
    all_rows = soup.find_all('tr')
    df = pd.DataFrame(columns=COLUMNS)
    
    for i in range(1, len(all_rows)):
        raw_row_data = all_rows[i]
        row_data = [cell.get_text(strip=True) for cell in raw_row_data.find_all(['th', 'td'])]
        if not (('Rk' in row_data) and ('Player' in row_data)):
            df.loc[len(df)] = row_data
    df.to_csv(f"data/{TYPE}/player_stats_{season}.csv")
    
    final_df = pd.concat([final_df, df], ignore_index=True)

final_df.to_csv(f"data/{TYPE}/player_stats_final.csv")

2013-14 NBA Player Stats: Totals | Basketball-Reference.com
2014-15 NBA Player Stats: Totals | Basketball-Reference.com
2015-16 NBA Player Stats: Totals | Basketball-Reference.com
2016-17 NBA Player Stats: Totals | Basketball-Reference.com
2017-18 NBA Player Stats: Totals | Basketball-Reference.com
2018-19 NBA Player Stats: Totals | Basketball-Reference.com
2019-20 NBA Player Stats: Totals | Basketball-Reference.com
2020-21 NBA Player Stats: Totals | Basketball-Reference.com
2021-22 NBA Player Stats: Totals | Basketball-Reference.com


In [122]:
# Display whatever per-season frame is currently bound to `df`.
# NOTE(review): this cell's execution count (122) is lower than the scraping
# cell's (123), so the output shown may come from an earlier run — confirm
# after Restart & Run All.
df

Unnamed: 0,Rank,Player,Age,Team,Position,Games,Games Started,Minutes Played,Field Goals,Field Goals Attempted,...,Offensive Rebounds,Defensive Rebounds,Total Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Awards
0,1,Kevin Durant,25,OKC,SF,81,81,3122,849,1688,...,58,540,598,445,103,59,285,174,2593,"MVP-1,AS,NBA1"
1,2,Carmelo Anthony,29,NYK,PF,77,77,2982,743,1643,...,145,477,622,242,95,51,198,224,2112,"MVP-15,AS"
2,3,LeBron James,29,MIA,PF,77,77,2902,767,1353,...,81,452,533,488,121,26,270,126,2089,"MVP-2,DPOY-6,AS,NBA1"
3,4,Kevin Love,25,MIN,PF,77,77,2797,650,1421,...,224,739,963,341,59,35,196,136,2010,"MVP-11,AS,NBA2"
4,5,Blake Griffin,24,LAC,PF,80,80,2863,718,1359,...,192,565,757,309,92,51,224,265,1930,"MVP-3,AS,NBA2"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,479,Dexter Pittman,25,ATL,C,2,0,3,0,1,...,3,0,3,0,0,0,0,0,0,
608,480,Chris Smith,26,NYK,PG,2,0,2,0,0,...,0,0,0,0,0,0,0,0,0,
609,481,D.J. White,27,CHA,PF,2,0,10,0,1,...,0,2,2,0,1,0,0,1,0,
610,482,Royce White,22,SAC,PF,3,0,9,0,1,...,0,0,0,0,0,0,0,2,0,
