In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time


## Get the data on rookies and its stats

In [2]:
url = "https://www.basketball-reference.com/leagues/NBA_2024_rookies-season-stats.html"

headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

pageTree = requests.get(url, headers=headers)
soup = BeautifulSoup(pageTree.content, 'html.parser')

table = soup.find("table", id="rookies")

data = pd.read_html(str(table), header=1)[0]

data = data.rename(columns={"MP.1": "MP/G", 'PTS.1': "PTS/G", "TRB.1": "TRB/G", "AST.1": "AST/G"})

# Remove mid-table header rows
data = data[data['Rk'].notna()]
data = data[~data['Rk'].str.contains('Rk')]

data

Unnamed: 0,Rk,Player,Debut,Age,Yrs,G,MP,FG,FGA,3P,...,TOV,PF,PTS,FG%,3P%,FT%,MP/G,PTS/G,TRB/G,AST/G
0,1,Ibou Badji,"Dec 26, '23, POR vs. SAC",21,1,1,2,0,0,0,...,0,0,0,,,,2.0,0.0,0.0,0.0
1,2,Amari Bailey,"Nov 12, '23, CHO @ NYK",19,1,2,11,2,6,0,...,1,0,5,.333,.000,.500,5.5,2.5,1.0,0.0
2,3,Emoni Bates,"Oct 28, '23, CLE vs. IND",20,1,9,70,6,22,5,...,1,3,18,.273,.294,.500,7.8,2.0,1.0,0.3
3,4,Jules Bernard,"Dec 8, '23, WAS @ BRK",24,1,2,10,1,5,0,...,1,1,2,.200,.000,,5.0,1.0,0.5,0.5
4,5,Onuralp Bitim,"Nov 4, '23, CHI @ DEN",24,1,1,3,0,0,0,...,0,0,0,,,,3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,74,Jarace Walker,"Oct 25, '23, IND vs. WAS",20,1,8,84,12,28,3,...,3,11,30,.429,.273,1.000,10.5,3.8,1.5,1.4
80,75,Cason Wallace,"Oct 25, '23, OKC @ CHI",20,1,28,612,75,136,29,...,15,46,191,.551,.453,.800,21.9,6.8,2.1,1.5
81,76,Victor Wembanyama,"Oct 25, '23, SAS vs. DAL",20,1,26,779,179,413,35,...,87,64,477,.433,.278,.771,30.0,18.3,10.6,2.8
82,77,Cam Whitmore,"Oct 25, '23, HOU @ ORL",19,1,6,46,13,27,2,...,2,4,34,.481,.182,.500,7.7,5.7,2.0,0.3


In [3]:
def extract_rookies_data(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_rookies-season-stats.html"

    headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    }

    pageTree = requests.get(url, headers=headers)
    soup = BeautifulSoup(pageTree.content, 'html.parser')

    table = soup.find("table", id="rookies")

    rookies = pd.read_html(str(table), header=1)[0]

    rookies = rookies.rename(columns={"MP.1": "MP/G", 'PTS.1': "PTS/G", "TRB.1": "TRB/G", "AST.1": "AST/G"})

    # Remove mid-table header rows
    rookies = rookies[rookies['Rk'].notna()]
    rookies = rookies[~rookies['Rk'].str.contains('Rk')]

    return rookies

In [13]:
def extract_rookies_data(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_rookies-season-stats.html"

    try:
        pageTree = requests.get(url, headers=headers)

        pageTree.raise_for_status()

        soup = BeautifulSoup(pageTree.content, 'html.parser')
        table = soup.find("table", id="rookies")

        if table is not None:
            rookies = pd.read_html(str(table), header=1)[0]

            rookies = rookies.rename(columns={"MP.1": "MP/G", 'PTS.1': "PTS/G", "TRB.1": "TRB/G", "AST.1": "AST/G"})

            # Remove mid-table header rows
            rookies = rookies[rookies['Rk'].notna()]
            rookies = rookies[~rookies['Rk'].str.contains('Rk')]

            return rookies
        
        else:
            print(f"No 'rookies' table found for the year {year}")
            return pd.DataFrame()  

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None  
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # or handle the general exception as needed
    
rookies = extract_rookies_data(2019)
rookies

Unnamed: 0,Rk,Player,Debut,Age,Yrs,G,MP,FG,FGA,3P,...,TOV,PF,PTS,FG%,3P%,FT%,MP/G,PTS/G,TRB/G,AST/G
0,1,Jaylen Adams,"Oct 17, '18, ATL @ NYK",22,1,34,428,38,110,25,...,28,45,108,.345,.338,.778,12.6,3.2,1.8,1.9
1,2,Deng Adel,"Jan 19, '19, CLE @ DEN",21,1,19,194,11,36,6,...,6,13,32,.306,.261,1.000,10.2,1.7,1.0,0.3
2,3,DeVaughn Akoon-Purcell,"Oct 23, '18, DEN vs. SAC",25,1,7,22,3,10,0,...,2,4,7,.300,.000,.500,3.1,1.0,0.6,0.9
3,4,Rawle Alkins,"Dec 17, '18, CHI @ OKC",21,1,10,120,13,39,3,...,8,7,37,.333,.250,.667,12.0,3.7,2.6,1.3
4,5,Grayson Allen,"Oct 22, '18, UTA vs. MEM",23,1,38,416,67,178,32,...,33,47,211,.376,.323,.750,10.9,5.6,0.6,0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,101,Thomas Welsh,"Oct 17, '18, DEN @ LAC",22,1,11,36,7,13,3,...,1,3,18,.538,.429,.500,3.3,1.6,0.4,0.5
111,102,Johnathan Williams,"Oct 22, '18, LAL vs. SAS",23,1,24,372,65,110,0,...,16,62,157,.591,.000,.563,15.5,6.5,4.1,0.5
112,103,Kenrich Williams,"Oct 17, '18, NOP @ HOU",24,1,46,1079,107,279,52,...,36,95,279,.384,.333,.684,23.5,6.1,4.8,1.8
113,104,Robert Williams,"Oct 22, '18, BOS vs. ORL",21,1,32,283,36,51,0,...,10,36,81,.706,,.600,8.8,2.5,2.5,0.2


In [11]:
years = [year for year in range(2010, 2024)]
table = []

for year in tqdm(years):
    print(f"Extracting data for {year}")
    roy_awards = extract_roy_awards(year)
    if roy_awards is not None:
        roy_awards['year'] = year
        table.append(roy_awards)

    time.sleep(3)

table

  0%|          | 0/14 [00:00<?, ?it/s]

Extracting data for 2010


  7%|▋         | 1/14 [00:03<00:44,  3.43s/it]

Extracting data for 2011


 14%|█▍        | 2/14 [00:06<00:40,  3.40s/it]

Extracting data for 2012


 21%|██▏       | 3/14 [00:10<00:37,  3.41s/it]

Extracting data for 2013


 29%|██▊       | 4/14 [00:13<00:34,  3.41s/it]

Extracting data for 2014


 36%|███▌      | 5/14 [00:16<00:30,  3.39s/it]

Extracting data for 2015


 43%|████▎     | 6/14 [00:20<00:27,  3.40s/it]

Extracting data for 2016


 50%|█████     | 7/14 [00:23<00:23,  3.41s/it]

Extracting data for 2017


 57%|█████▋    | 8/14 [00:27<00:20,  3.39s/it]

Extracting data for 2018


 64%|██████▍   | 9/14 [00:30<00:16,  3.40s/it]

Extracting data for 2019


 71%|███████▏  | 10/14 [00:33<00:13,  3.38s/it]

Extracting data for 2020


 79%|███████▊  | 11/14 [00:37<00:10,  3.36s/it]

Extracting data for 2021


 86%|████████▌ | 12/14 [00:40<00:06,  3.37s/it]

Extracting data for 2022


 93%|█████████▎| 13/14 [00:43<00:03,  3.36s/it]

Extracting data for 2023


100%|██████████| 14/14 [00:47<00:00,  3.38s/it]


[  Rank            Player  Age   Tm  First  Pts Won  Pts Max  Share   G    MP  \
 0    1      Tyreke Evans   20  SAC   67.0    491.0      615  0.798  72  37.2   
 1    2     Stephen Curry   21  GSW   43.0    391.0      615  0.636  80  36.2   
 2    3  Brandon Jennings   20  MIL   12.0    204.0      615  0.332  82  32.6   
 3    4   Darren Collison   22  NOH    1.0     17.0      615  0.028  76  27.8   
 4   5T       Jonny Flynn   20  MIN    0.0      2.0      615  0.003  81  28.9   
 5   5T        Taj Gibson   24  CHI    0.0      2.0      615  0.003  82  26.9   
 
    ...  TRB  AST  STL  BLK    FG%    3P%    FT%   WS  WS/48  year  
 0  ...  5.3  5.8  1.5  0.4  0.458  0.255  0.748  5.4  0.097  2010  
 1  ...  4.5  5.9  1.9  0.2  0.462  0.437  0.885  4.7  0.077  2010  
 2  ...  3.4  5.7  1.3  0.2  0.371  0.374  0.817  4.2  0.075  2010  
 3  ...  2.5  5.7  1.0  0.1  0.477  0.400  0.851  2.9  0.067  2010  
 4  ...  2.4  4.4  1.0  0.0  0.417  0.358  0.826  0.1  0.002  2010  
 5  ...  7.5  0.9

In [4]:
data.columns
data.dtypes

Rk        object
Player    object
Debut     object
Age       object
Yrs       object
G         object
MP        object
FG        object
FGA       object
3P        object
3PA       object
FT        object
FTA       object
ORB       object
TRB       object
AST       object
STL       object
BLK       object
TOV       object
PF        object
PTS       object
FG%       object
3P%       object
FT%       object
MP/G      object
PTS/G     object
TRB/G     object
AST/G     object
dtype: object

In [5]:
pd.set_option('display.max_rows', None)
data = data.rename(columns={"MP.1": "MP/G", 'PTS.1': "PTS/G", "TRB.1": "TRB/G", "AST.1": "AST/G"})

# Remove mid-table header rows
data = data[data['Rk'].notna()]
data = data[~data['Rk'].str.contains('Rk')]



## Get the advanced stats of the year

In [7]:
url2 = "https://www.basketball-reference.com/leagues/NBA_2024_advanced.html"

pageTree = requests.get(url2)
soup = BeautifulSoup(pageTree.content, 'html.parser')

table = soup.find("table", id="advanced_stats")

advanced_stats = pd.read_html(str(table))[0]

advanced_stats.drop(['Unnamed: 19', 'Unnamed: 24'], axis=1, inplace=True)
advanced_stats = advanced_stats[~advanced_stats['Rk'].str.contains('Rk')]

advanced_stats



Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,1,Precious Achiuwa,C,24,TOR,21,387,16.2,0.533,0.278,...,13.9,21.3,0.2,0.5,0.7,0.086,-0.6,0.4,-0.2,0.2
1,2,Bam Adebayo,C,26,MIA,18,603,21.3,0.578,0.01,...,11.4,28.3,0.8,0.9,1.7,0.135,1.1,1.0,2.1,0.6
2,3,Ochai Agbaji,SG,23,UTA,29,623,10.3,0.575,0.57,...,10.2,13.1,0.4,0.3,0.7,0.055,-1.7,-0.5,-2.2,0.0
3,4,Santi Aldama,PF,23,MEM,21,526,14.7,0.545,0.488,...,9.4,20.6,0.2,0.7,0.9,0.085,0.5,0.4,0.9,0.4
4,5,Nickeil Alexander-Walker,SG,25,MIN,27,624,9.5,0.539,0.653,...,14.3,14.2,0.1,1.1,1.2,0.089,-2.0,2.1,0.1,0.3
5,6,Grayson Allen,SG,28,PHO,22,731,12.4,0.655,0.578,...,13.1,13.9,1.1,0.6,1.6,0.107,-0.4,-0.6,-1.0,0.2
6,7,Jarrett Allen,C,25,CLE,24,696,18.8,0.69,0.0,...,16.3,16.8,1.5,1.0,2.5,0.169,0.2,1.0,1.2,0.6
7,8,Jose Alvarado,PG,25,NOP,15,254,13.3,0.544,0.548,...,13.1,17.9,0.1,0.4,0.5,0.09,-1.2,1.5,0.4,0.2
8,9,Kyle Anderson,PF,30,MIN,27,621,12.0,0.511,0.135,...,18.2,15.4,0.1,1.2,1.3,0.102,-1.8,2.8,0.9,0.5
9,10,Giannis Antetokounmpo,PF,29,MIL,27,940,29.0,0.644,0.094,...,13.8,33.6,3.0,1.4,4.4,0.224,5.5,1.6,7.1,2.2


In [8]:
advanced_stats.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr',
       'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
       'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [17]:
def extract_advanced_stats(year):
    """
    Extracts advanced stats from basketball-reference for the year
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"

    try:
        pageTree = requests.get(url, headers=headers)

        pageTree.raise_for_status()

        soup = BeautifulSoup(pageTree.content, 'html.parser')
        table = soup.find("table", id="advanced_stats")

        if table is not None:
            advanced_stats = pd.read_html(str(table))[0]

            #Drop unwanted columns
            advanced_stats.drop(['Unnamed: 19', 'Unnamed: 24'], axis=1, inplace=True)
            advanced_stats = advanced_stats[~advanced_stats['Rk'].str.contains('Rk')]

            return advanced_stats

        else:
            print(f"No 'advanced_stats' table found for the year {year}")
            return pd.DataFrame()
        
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None  
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # or handle the general exception as needed


In [18]:
years = [year for year in range(2010, 2024)]
table = []

for year in tqdm(years):
    print(f"Extracting advanced_stats for {year}")
    advanced_stats = extract_advanced_stats(year)
    
    if advanced_stats is not None:
        advanced_stats['year'] = year
        table.append(advanced_stats)

    time.sleep(3)

df = pd.concat(table)
df

  0%|          | 0/14 [00:00<?, ?it/s]

Extracting advanced_stats for 2010


  7%|▋         | 1/14 [00:04<00:59,  4.60s/it]

Extracting advanced_stats for 2011


 14%|█▍        | 2/14 [00:09<00:53,  4.49s/it]

Extracting advanced_stats for 2012


 21%|██▏       | 3/14 [00:13<00:48,  4.37s/it]

Extracting advanced_stats for 2013


 29%|██▊       | 4/14 [00:17<00:44,  4.42s/it]

Extracting advanced_stats for 2014


 36%|███▌      | 5/14 [00:22<00:40,  4.48s/it]

Extracting advanced_stats for 2015


 43%|████▎     | 6/14 [00:26<00:35,  4.44s/it]

Extracting advanced_stats for 2016


 50%|█████     | 7/14 [00:31<00:31,  4.46s/it]

Extracting advanced_stats for 2017


 57%|█████▋    | 8/14 [00:35<00:26,  4.46s/it]

Extracting advanced_stats for 2018


 64%|██████▍   | 9/14 [00:40<00:22,  4.46s/it]

Extracting advanced_stats for 2019


 71%|███████▏  | 10/14 [00:44<00:17,  4.46s/it]

Extracting advanced_stats for 2020


 79%|███████▊  | 11/14 [00:48<00:13,  4.43s/it]

Extracting advanced_stats for 2021


 86%|████████▌ | 12/14 [00:53<00:09,  4.61s/it]

Extracting advanced_stats for 2022


 93%|█████████▎| 13/14 [00:58<00:04,  4.62s/it]

Extracting advanced_stats for 2023


100%|██████████| 14/14 [01:03<00:00,  4.51s/it]


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,year
0,1,Arron Afflalo,SG,24,DEN,82,2221,10.9,.576,.426,...,14.0,2.8,1.4,4.3,.092,-0.2,-0.2,-0.4,0.9,2010
1,2,Alexis Ajinça,C,21,CHA,6,30,6.3,.479,.000,...,19.3,-0.1,0.0,0.0,-0.013,-6.3,1.0,-5.3,0.0,2010
2,3,LaMarcus Aldridge,PF,24,POR,78,2922,18.2,.535,.014,...,22.9,5.5,3.3,8.8,.145,1.4,-0.2,1.2,2.3,2010
3,4,Joe Alexander,SF,23,CHI,8,29,2.8,.273,.167,...,11.3,0.0,0.0,0.0,.030,-9.1,0.9,-8.3,0.0,2010
4,5,Malik Allen,PF,31,DEN,51,456,5.9,.431,.052,...,14.0,-0.3,0.3,0.1,.009,-4.7,-1.0,-5.7,-0.4,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,535,Thaddeus Young,PF,34,TOR,54,795,14.1,.573,.172,...,13.5,0.7,1.1,1.8,.109,-1.8,1.9,0.1,0.4,2023
701,536,Trae Young,PG,24,ATL,73,2541,22.0,.573,.331,...,32.6,5.3,1.4,6.7,.126,5.3,-2.0,3.3,3.4,2023
702,537,Omer Yurtseven,C,24,MIA,9,83,16.7,.675,.259,...,18.0,0.2,0.1,0.3,.159,-2.5,-1.5,-3.9,0.0,2023
703,538,Cody Zeller,C,30,MIA,15,217,16.4,.659,.034,...,18.1,0.4,0.3,0.7,.147,-2.0,-0.7,-2.8,0.0,2023


## Get ROY voting data

In [9]:
url3 = "https://www.basketball-reference.com/awards/awards_2023.html"

pageTree = requests.get(url3)
soup = BeautifulSoup(pageTree.content, 'html.parser')

table = soup.find("table", id="roy")

roy_voting = pd.read_html(str(table), header=1)[0]

roy_voting


Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Paolo Banchero,20,ORL,98.0,494.0,500,0.988,72,33.8,20.0,6.9,3.7,0.8,0.5,0.427,0.298,0.738,2.4,0.047
1,2,Jalen Williams,21,OKC,0.0,241.0,500,0.482,75,30.3,14.1,4.5,3.3,1.4,0.5,0.521,0.356,0.812,5.6,0.119
2,3,Walker Kessler,21,UTA,2.0,114.0,500,0.228,74,23.0,9.2,8.4,0.9,0.4,2.3,0.72,0.333,0.516,7.1,0.2
3,4,Bennedict Mathurin,20,IND,0.0,27.0,500,0.054,78,28.5,16.7,4.1,1.5,0.6,0.2,0.434,0.323,0.828,1.8,0.038
4,5,Keegan Murray,22,SAC,0.0,21.0,500,0.042,80,29.8,12.2,4.6,1.2,0.8,0.5,0.453,0.411,0.765,4.3,0.087
5,6,Jaden Ivey,20,DET,0.0,3.0,500,0.006,74,31.1,16.3,3.9,5.2,0.8,0.2,0.416,0.343,0.747,0.0,-0.001


In [14]:
def extract_roy_awards(year):
    """
    Extracts rookie of the year votes and statistics from basketball-reference for a specific year
    """
    
    url = f"https://www.basketball-reference.com/awards/awards_{year}.html"

    headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    }
    
    try:
        pageTree = requests.get(url, headers=headers)

        pageTree.raise_for_status()

        soup = BeautifulSoup(pageTree.content, 'html.parser')
        table = soup.find("table", id="roy")

        if table is not None:
            roy_voting = pd.read_html(str(table), header=1)[0]
            return roy_voting
        
        else:
            print(f"No 'roy' table found for the year {year}")
            return pd.DataFrame()  

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None  
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # or handle the general exception as needed


In [16]:
def historical_roy_awards(start_year, end_year):

    years = [year for year in range(start_year, end_year)]
    table = []

    for year in tqdm(years):
        print(f"Extracting ROY data for {year}")
        roy_awards = extract_roy_awards(year)
        if roy_awards is not None:
            roy_awards['year'] = year
            table.append(roy_awards)

        time.sleep(3)

    roy = pd.concat(table)

    return roy

roy_awards = historical_roy_awards(1979, 2024)
roy_awards


  0%|          | 0/45 [00:00<?, ?it/s]

Extracting ROY data for 1979


  2%|▏         | 1/45 [00:03<02:30,  3.43s/it]

Extracting ROY data for 1980


  4%|▍         | 2/45 [00:06<02:24,  3.37s/it]

Extracting ROY data for 1981


  7%|▋         | 3/45 [00:10<02:20,  3.36s/it]

Extracting ROY data for 1982


  9%|▉         | 4/45 [00:13<02:17,  3.35s/it]

Extracting ROY data for 1983


 11%|█         | 5/45 [00:16<02:15,  3.38s/it]

Extracting ROY data for 1984


 13%|█▎        | 6/45 [00:20<02:11,  3.37s/it]

Extracting ROY data for 1985


 16%|█▌        | 7/45 [00:23<02:06,  3.34s/it]

Extracting ROY data for 1986


 18%|█▊        | 8/45 [00:26<02:04,  3.36s/it]

Extracting ROY data for 1987


 20%|██        | 9/45 [00:30<02:00,  3.35s/it]

Extracting ROY data for 1988


 22%|██▏       | 10/45 [00:33<01:57,  3.36s/it]

Extracting ROY data for 1989


 24%|██▍       | 11/45 [00:37<01:54,  3.38s/it]

Extracting ROY data for 1990


 27%|██▋       | 12/45 [00:40<01:51,  3.38s/it]

Extracting ROY data for 1991


 29%|██▉       | 13/45 [00:43<01:49,  3.41s/it]

Extracting ROY data for 1992


 31%|███       | 14/45 [00:47<01:45,  3.40s/it]

Extracting ROY data for 1993


 33%|███▎      | 15/45 [00:50<01:43,  3.44s/it]

Extracting ROY data for 1994


 36%|███▌      | 16/45 [00:54<01:39,  3.43s/it]

Extracting ROY data for 1995


 38%|███▊      | 17/45 [00:57<01:36,  3.44s/it]

Extracting ROY data for 1996


 40%|████      | 18/45 [01:01<01:32,  3.42s/it]

Extracting ROY data for 1997


 42%|████▏     | 19/45 [01:04<01:29,  3.43s/it]

Extracting ROY data for 1998


 44%|████▍     | 20/45 [01:07<01:25,  3.41s/it]

Extracting ROY data for 1999


 47%|████▋     | 21/45 [01:11<01:21,  3.40s/it]

Extracting ROY data for 2000


 49%|████▉     | 22/45 [01:14<01:17,  3.39s/it]

Extracting ROY data for 2001


 51%|█████     | 23/45 [01:18<01:14,  3.40s/it]

Extracting ROY data for 2002


 53%|█████▎    | 24/45 [01:21<01:11,  3.40s/it]

Extracting ROY data for 2003


 56%|█████▌    | 25/45 [01:24<01:08,  3.40s/it]

Extracting ROY data for 2004


 58%|█████▊    | 26/45 [01:28<01:04,  3.39s/it]

Extracting ROY data for 2005


 60%|██████    | 27/45 [01:31<01:00,  3.38s/it]

Extracting ROY data for 2006


 62%|██████▏   | 28/45 [01:34<00:57,  3.37s/it]

Extracting ROY data for 2007


 64%|██████▍   | 29/45 [01:38<00:53,  3.36s/it]

Extracting ROY data for 2008


 67%|██████▋   | 30/45 [01:41<00:50,  3.37s/it]

Extracting ROY data for 2009


 69%|██████▉   | 31/45 [01:45<00:47,  3.38s/it]

Extracting ROY data for 2010


 71%|███████   | 32/45 [01:48<00:43,  3.37s/it]

Extracting ROY data for 2011


 73%|███████▎  | 33/45 [01:51<00:40,  3.38s/it]

Extracting ROY data for 2012


 76%|███████▌  | 34/45 [01:55<00:37,  3.39s/it]

Extracting ROY data for 2013


 78%|███████▊  | 35/45 [01:58<00:34,  3.40s/it]

Extracting ROY data for 2014


 80%|████████  | 36/45 [02:02<00:31,  3.48s/it]

Extracting ROY data for 2015


 82%|████████▏ | 37/45 [02:06<00:28,  3.57s/it]

Extracting ROY data for 2016


 84%|████████▍ | 38/45 [02:09<00:24,  3.53s/it]

Extracting ROY data for 2017


 87%|████████▋ | 39/45 [02:12<00:20,  3.48s/it]

Extracting ROY data for 2018


 89%|████████▉ | 40/45 [02:16<00:17,  3.45s/it]

Extracting ROY data for 2019


 91%|█████████ | 41/45 [02:19<00:13,  3.44s/it]

Extracting ROY data for 2020


 93%|█████████▎| 42/45 [02:23<00:10,  3.42s/it]

Extracting ROY data for 2021


 96%|█████████▌| 43/45 [02:26<00:06,  3.40s/it]

Extracting ROY data for 2022


 98%|█████████▊| 44/45 [02:29<00:03,  3.45s/it]

Extracting ROY data for 2023


100%|██████████| 45/45 [02:33<00:00,  3.41s/it]


Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year
0,1,Phil Ford,22,KCK,62.0,62.0,66,0.939,79,34.5,...,2.3,8.6,2.2,0.1,0.465,,0.813,5.7,0.101,1979
1,2,Reggie Theus,21,CHI,4.0,4.0,66,0.061,82,33.6,...,2.8,5.2,1.1,0.2,0.480,,0.761,3.2,0.056,1979
0,1,Larry Bird,23,BOS,63.0,63.0,66,0.955,82,36.0,...,10.4,4.5,1.7,0.6,0.474,0.406,0.836,11.2,0.182,1980
1,2,Magic Johnson,20,LAL,3.0,3.0,66,0.045,77,36.3,...,7.7,7.3,2.4,0.5,0.530,0.226,0.810,10.5,0.180,1980
0,1,Darrell Griffith,22,UTA,19.0,19.0,69,0.275,81,35.4,...,3.6,2.4,1.3,0.5,0.464,0.192,0.716,0.4,0.006,1981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2,Jalen Williams,21,OKC,0.0,241.0,500,0.482,75,30.3,...,4.5,3.3,1.4,0.5,0.521,0.356,0.812,5.6,0.119,2023
2,3,Walker Kessler,21,UTA,2.0,114.0,500,0.228,74,23.0,...,8.4,0.9,0.4,2.3,0.720,0.333,0.516,7.1,0.200,2023
3,4,Bennedict Mathurin,20,IND,0.0,27.0,500,0.054,78,28.5,...,4.1,1.5,0.6,0.2,0.434,0.323,0.828,1.8,0.038,2023
4,5,Keegan Murray,22,SAC,0.0,21.0,500,0.042,80,29.8,...,4.6,1.2,0.8,0.5,0.453,0.411,0.765,4.3,0.087,2023


## Get the standings data

In [100]:
url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html#all_confs_standings_E"

pageTree = requests.get(url)
soup = BeautifulSoup(pageTree.content, 'html.parser')

table_east = soup.find("table", id="confs_standings_E")
table_west = soup.find("table", id="confs_standings_W")

east_standings = pd.read_html(str(table_east))[0]
east_standings['Conference'] = "East"
east_standings.rename(columns={'Eastern Conference': 'team'}, inplace=True)

west_standings = pd.read_html(str(table_west))[0]
west_standings.rename(columns={'Western Conference': 'team'}, inplace=True)
west_standings['Conference'] = "West"

standings = pd.concat([east_standings, west_standings], axis=0).reset_index(drop=True)

regex_pattern = r'(.+?)\s*\((\d+)\)'
standings[['TEAM_NAME', 'SEED']] = standings['team'].str.extract(regex_pattern)


## Get team abbreviation
a_tags = soup.select('table.stats_table a')
team_abbreviation = [a['href'].split('/')[2] for a in a_tags]
team_names = [a.text for a in a_tags]
df_teams = pd.DataFrame({'TEAM_ABBREVIATION': team_abbreviation, 'TEAM_NAME': team_names})
df_teams.drop_duplicates(inplace=True)

standings = pd.merge(standings, df_teams, on="TEAM_NAME")
standings.drop(columns=['team'], inplace=True)
standings


Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference,TEAM_NAME,SEED,TEAM_ABBREVIATION
0,21,6,0.778,—,118.9,109.8,10.4,East,Boston Celtics,1,BOS
1,21,7,0.75,0.5,124.4,119.0,3.95,East,Milwaukee Bucks,2,MIL
2,19,8,0.704,2.0,122.3,111.0,10.51,East,Philadelphia 76ers,3,PHI
3,16,11,0.593,5.0,113.0,110.4,2.46,East,Orlando Magic,4,ORL
4,16,11,0.593,5.0,114.7,111.4,2.86,East,New York Knicks,5,NYK
5,16,12,0.571,5.5,112.9,111.8,-0.2,East,Miami Heat,6,MIA
6,16,13,0.552,6.0,112.1,112.4,0.73,East,Cleveland Cavaliers,7,CLE
7,14,13,0.519,7.0,127.1,126.1,0.01,East,Indiana Pacers,8,IND
8,13,14,0.481,8.0,115.6,115.3,0.5,East,Brooklyn Nets,9,BRK
9,12,15,0.444,9.0,123.1,122.9,0.68,East,Atlanta Hawks,10,ATL


In [22]:
def extract_standings(year):
    """
    Extracts the NBA standings from basketball-reference for a specific year
    """
    
    url = f"https://www.basketball-reference.com/awards/awards_{year}.html"

    headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    }
    
    try:
        url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html#all_confs_standings_E"

        pageTree = requests.get(url, headers=headers)

        pageTree.raise_for_status()
        
        soup = BeautifulSoup(pageTree.content, 'html.parser')

        table_east = soup.find("table", id="confs_standings_E")
        table_west = soup.find("table", id="confs_standings_W")

        if table_east is not None:
            east_standings = pd.read_html(str(table_east))[0]
            east_standings['Conference'] = "East"
            east_standings.rename(columns={'Eastern Conference': 'team'}, inplace=True)
        
        elif table_west is not None:
            west_standings = pd.read_html(str(table_west))[0]
            west_standings.rename(columns={'Western Conference': 'team'}, inplace=True)
            west_standings['Conference'] = "West"

        else:
            print(f"No 'standings' table found for the year {year}")
            return pd.DataFrame()
        
        standings = pd.concat([east_standings, west_standings], axis=0).reset_index(drop=True)

        regex_pattern = r'(.+?)\s*\((\d+)\)'
        standings[['TEAM_NAME', 'SEED']] = standings['team'].str.extract(regex_pattern)

        ## Get team abbreviation from the a tags in the page
        a_tags = soup.select('table.stats_table a')
        
        if a_tags is not None:
            team_abbreviation = [a['href'].split('/')[2] for a in a_tags]
            team_names = [a.text for a in a_tags]
            df_teams = pd.DataFrame({'TEAM_ABBREVIATION': team_abbreviation, 'TEAM_NAME': team_names})
            df_teams.drop_duplicates(inplace=True)

            standings = pd.merge(standings, df_teams, on="TEAM_NAME")

            return standings

        else:
            print(f"No 'a_tags' found in page for the year {year}")
            return standings

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None  
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # or handle the general exception as needed

In [23]:
years = [year for year in range(2010, 2024)]
table = []

for year in tqdm(years, desc="Processing years"):
    print(f"Extracting standings data for {year}")
    standings = extract_standings(year)
    if standings is not None:
        standings['year'] = year
        table.append(standings)

    time.sleep(3)

historical_standings = pd.concat(table)

Processing years:   0%|          | 0/14 [00:00<?, ?it/s]

Extracting standings data for 2010
An error occurred: local variable 'west_standings' referenced before assignment


Processing years:   7%|▋         | 1/14 [00:03<00:42,  3.24s/it]

Extracting standings data for 2011
An error occurred: local variable 'west_standings' referenced before assignment


Processing years:   7%|▋         | 1/14 [00:06<01:18,  6.07s/it]


KeyboardInterrupt: 

In [71]:
a_tags = soup.select('table.stats_table a')
team_abbreviation = [a['href'].split('/')[2] for a in a_tags]
team_names = [a.text for a in a_tags]
df_teams = pd.DataFrame({'TEAM_ABBREVIATION': team_abbreviation, 'TEAM_NAME': team_names})
df_teams.drop_duplicates(inplace=True)
df_teams

Unnamed: 0,TEAM_ABBREVIATION,TEAM_NAME
0,BOS,Boston Celtics
1,MIL,Milwaukee Bucks
2,PHI,Philadelphia 76ers
3,ORL,Orlando Magic
4,NYK,New York Knicks
5,MIA,Miami Heat
6,CLE,Cleveland Cavaliers
7,IND,Indiana Pacers
8,BRK,Brooklyn Nets
9,ATL,Atlanta Hawks


In [96]:
url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html#all_confs_standings_E%22"

pageTree = requests.get(url)
soup = BeautifulSoup(pageTree.content, 'html.parser')

table_east = soup.find("table", id="confs_standings_E")
table_west = soup.find("table", id="confs_standings_W")

east_standings = pd.read_html(str(table_east))[0]
east_standings['Conference'] = "East"
east_standings.rename(columns={'Eastern Conference': 'Team Name'}, inplace=True)

west_standings = pd.read_html(str(table_west))[0]
west_standings.rename(columns={'Western Conference': 'Team Name'}, inplace=True)
west_standings['Conference'] = "West"

standings = pd.concat([east_standings, west_standings], axis=0).reset_index(drop=True)

In [97]:
standings

Unnamed: 0,Team Name,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference
0,Boston Celtics (1),21,6,0.778,—,118.9,109.8,10.4,East
1,Milwaukee Bucks (2),21,7,0.75,0.5,124.4,119.0,3.95,East
2,Philadelphia 76ers (3),19,8,0.704,2.0,122.3,111.0,10.51,East
3,Orlando Magic (4),16,11,0.593,5.0,113.0,110.4,2.46,East
4,New York Knicks (5),16,11,0.593,5.0,114.7,111.4,2.86,East
5,Miami Heat (6),16,12,0.571,5.5,112.9,111.8,-0.2,East
6,Cleveland Cavaliers (7),16,13,0.552,6.0,112.1,112.4,0.73,East
7,Indiana Pacers (8),14,13,0.519,7.0,127.1,126.1,0.01,East
8,Brooklyn Nets (9),13,14,0.481,8.0,115.6,115.3,0.5,East
9,Atlanta Hawks (10),12,15,0.444,9.0,123.1,122.9,0.68,East


In [20]:
url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html#all_confs_standings_E"



[<a href="/teams/BOS/2024.html">Boston Celtics</a>,
 <a href="/teams/MIL/2024.html">Milwaukee Bucks</a>,
 <a href="/teams/PHI/2024.html">Philadelphia 76ers</a>,
 <a href="/teams/MIA/2024.html">Miami Heat</a>,
 <a href="/teams/ORL/2024.html">Orlando Magic</a>,
 <a href="/teams/CLE/2024.html">Cleveland Cavaliers</a>,
 <a href="/teams/NYK/2024.html">New York Knicks</a>,
 <a href="/teams/IND/2024.html">Indiana Pacers</a>,
 <a href="/teams/BRK/2024.html">Brooklyn Nets</a>,
 <a href="/teams/CHI/2024.html">Chicago Bulls</a>,
 <a href="/teams/TOR/2024.html">Toronto Raptors</a>,
 <a href="/teams/ATL/2024.html">Atlanta Hawks</a>,
 <a href="/teams/CHO/2024.html">Charlotte Hornets</a>,
 <a href="/teams/WAS/2024.html">Washington Wizards</a>,
 <a href="/teams/DET/2024.html">Detroit Pistons</a>,
 <a href="/teams/MIN/2024.html">Minnesota Timberwolves</a>,
 <a href="/teams/OKC/2024.html">Oklahoma City Thunder</a>,
 <a href="/teams/DEN/2024.html">Denver Nuggets</a>,
 <a href="/teams/LAC/2024.html">Los A

In [105]:
filter_roy = ['Player', 
           'Tm', 
           'Share', 
           'G', 
           'MP',
           'PTS',
           'TRB',
           'AST',
           'STL',
           'BLK',
           'FG%',
           '3P%',
           'FT%',
           'WS',
           'WS/48']

roy = roy_voting[filter_roy]
roy

Unnamed: 0,Player,Tm,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,Paolo Banchero,ORL,0.988,72,33.8,20.0,6.9,3.7,0.8,0.5,0.427,0.298,0.738,2.4,0.047
1,Jalen Williams,OKC,0.482,75,30.3,14.1,4.5,3.3,1.4,0.5,0.521,0.356,0.812,5.6,0.119
2,Walker Kessler,UTA,0.228,74,23.0,9.2,8.4,0.9,0.4,2.3,0.72,0.333,0.516,7.1,0.2
3,Bennedict Mathurin,IND,0.054,78,28.5,16.7,4.1,1.5,0.6,0.2,0.434,0.323,0.828,1.8,0.038
4,Keegan Murray,SAC,0.042,80,29.8,12.2,4.6,1.2,0.8,0.5,0.453,0.411,0.765,4.3,0.087
5,Jaden Ivey,DET,0.006,74,31.1,16.3,3.9,5.2,0.8,0.2,0.416,0.343,0.747,0.0,-0.001


In [104]:
roy_voting

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Paolo Banchero,20,ORL,98.0,494.0,500,0.988,72,33.8,20.0,6.9,3.7,0.8,0.5,0.427,0.298,0.738,2.4,0.047
1,2,Jalen Williams,21,OKC,0.0,241.0,500,0.482,75,30.3,14.1,4.5,3.3,1.4,0.5,0.521,0.356,0.812,5.6,0.119
2,3,Walker Kessler,21,UTA,2.0,114.0,500,0.228,74,23.0,9.2,8.4,0.9,0.4,2.3,0.72,0.333,0.516,7.1,0.2
3,4,Bennedict Mathurin,20,IND,0.0,27.0,500,0.054,78,28.5,16.7,4.1,1.5,0.6,0.2,0.434,0.323,0.828,1.8,0.038
4,5,Keegan Murray,22,SAC,0.0,21.0,500,0.042,80,29.8,12.2,4.6,1.2,0.8,0.5,0.453,0.411,0.765,4.3,0.087
5,6,Jaden Ivey,20,DET,0.0,3.0,500,0.006,74,31.1,16.3,3.9,5.2,0.8,0.2,0.416,0.343,0.747,0.0,-0.001


In [126]:
pd.set_option('display.max_columns', None)

roy_votes = extract_roy_awards(2022)
advanced_stats = extract_advanced_stats(2022)
standings = extract_team_standings(2022)

#Merge dataframes and remove duplicate columns
master_table = pd.merge(roy_votes, advanced_stats, how='left', on='Player', suffixes=('', '_remove'))
master_table.drop([column for column in master_table.columns if 'remove' in column], axis=1, inplace=True)


#Merge team standings to the master_table
master_table = pd.merge(master_table, standings, how='left', on='Tm', suffixes=('', '_remove'))
master_table.drop([column for column in master_table.columns if 'remove' in column], axis=1, inplace=True)
master_table

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Rk,Pos,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,OBPM,DBPM,BPM,VORP,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference,TEAM_NAME,SEED
0,1,Scottie Barnes,20,TOR,48.0,378.0,500,0.756,74,35.4,15.3,7.5,3.5,1.1,0.7,0.492,0.301,0.735,6.6,0.122,32,PF,16.3,0.552,0.207,0.231,7.7,15.8,11.5,14.7,1.5,2.1,11.7,19.0,3.7,2.9,0.5,0.4,0.9,1.9,11,16,0.407,10.0,112.2,114.1,-1.54,East,Toronto Raptors,12
1,2,Evan Mobley,20,CLE,43.0,363.0,500,0.726,69,33.8,15.0,8.3,2.5,0.8,1.7,0.508,0.25,0.663,5.2,0.107,384,PF,16.1,0.549,0.111,0.304,6.9,19.8,13.5,11.6,1.2,4.4,12.4,20.4,1.5,3.7,-0.9,1.5,0.6,1.5,16,13,0.552,6.0,112.1,112.4,0.73,East,Cleveland Cavaliers,7
2,3,Cade Cunningham,20,DET,9.0,153.0,500,0.306,64,32.6,17.4,5.5,5.6,1.2,0.7,0.416,0.314,0.845,-0.5,-0.011,124,SG,13.1,0.504,0.352,0.163,2.9,16.2,9.2,29.1,1.8,1.9,17.5,27.5,-2.4,1.9,-1.1,-0.5,-1.6,0.2,2,26,0.071,19.5,109.0,120.7,-11.08,East,Detroit Pistons,15
3,4,Jalen Green,19,HOU,0.0,3.0,500,0.006,67,31.9,17.3,3.4,2.6,0.7,0.3,0.426,0.343,0.797,0.7,0.015,205,SG,12.5,0.547,0.48,0.247,1.7,10.0,5.8,13.1,1.0,0.8,11.3,23.7,0.5,0.2,-0.5,-2.4,-2.9,-0.5,13,12,0.52,7.0,111.4,108.4,3.08,West,Houston Rockets,8
4,5,Franz Wagner,20,ORL,0.0,2.0,500,0.004,79,30.7,15.2,4.5,2.9,0.9,0.4,0.468,0.354,0.863,4.0,0.079,559,SF,14.7,0.559,0.278,0.225,3.6,11.9,7.7,15.7,1.4,1.3,10.0,21.2,2.1,1.9,-0.4,-0.3,-0.7,0.8,16,11,0.593,5.0,113.0,110.4,2.46,East,Orlando Magic,4
5,6,Herbert Jones,23,NOP,0.0,1.0,500,0.002,78,29.9,9.5,3.8,2.1,1.7,0.8,0.476,0.337,0.84,4.6,0.096,294,PF,12.3,0.573,0.293,0.283,4.5,9.6,7.0,10.0,2.7,2.5,13.4,13.7,2.1,2.5,-2.0,1.5,-0.5,0.9,17,12,0.586,5.0,116.1,113.5,2.21,West,New Orleans Pelicans,7


In [112]:
roy_votes

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Scottie Barnes,20,TOR,48.0,378.0,500,0.756,74,35.4,15.3,7.5,3.5,1.1,0.7,0.492,0.301,0.735,6.6,0.122
1,2,Evan Mobley,20,CLE,43.0,363.0,500,0.726,69,33.8,15.0,8.3,2.5,0.8,1.7,0.508,0.25,0.663,5.2,0.107
2,3,Cade Cunningham,20,DET,9.0,153.0,500,0.306,64,32.6,17.4,5.5,5.6,1.2,0.7,0.416,0.314,0.845,-0.5,-0.011
3,4,Jalen Green,19,HOU,0.0,3.0,500,0.006,67,31.9,17.3,3.4,2.6,0.7,0.3,0.426,0.343,0.797,0.7,0.015
4,5,Franz Wagner,20,ORL,0.0,2.0,500,0.004,79,30.7,15.2,4.5,2.9,0.9,0.4,0.468,0.354,0.863,4.0,0.079
5,6,Herbert Jones,23,NOP,0.0,1.0,500,0.002,78,29.9,9.5,3.8,2.1,1.7,0.8,0.476,0.337,0.84,4.6,0.096


In [122]:
standings

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference,TEAM_NAME,SEED,TEAM_ABBREVIATION
0,21,6,0.778,—,118.9,109.8,10.4,East,Boston Celtics,1,BOS
1,21,7,0.75,0.5,124.4,119.0,3.95,East,Milwaukee Bucks,2,MIL
2,19,8,0.704,2.0,122.3,111.0,10.51,East,Philadelphia 76ers,3,PHI
3,16,11,0.593,5.0,113.0,110.4,2.46,East,Orlando Magic,4,ORL
4,16,11,0.593,5.0,114.7,111.4,2.86,East,New York Knicks,5,NYK
5,16,12,0.571,5.5,112.9,111.8,-0.2,East,Miami Heat,6,MIA
6,16,13,0.552,6.0,112.1,112.4,0.73,East,Cleveland Cavaliers,7,CLE
7,14,13,0.519,7.0,127.1,126.1,0.01,East,Indiana Pacers,8,IND
8,13,14,0.481,8.0,115.6,115.3,0.5,East,Brooklyn Nets,9,BRK
9,12,15,0.444,9.0,123.1,122.9,0.68,East,Atlanta Hawks,10,ATL


In [11]:
url = "https://www.basketball-reference.com/leagues/NBA_1981_standings.html"

pageTree = requests.get(url)
soup = BeautifulSoup(pageTree.content, 'html.parser')

table_east = soup.find("table", id="divs_standings_E")
table_west = soup.find("table", id="divs_standings_W")

east_standings = pd.read_html(str(table_east))[0]
east_standings['Conference'] = "East"
east_standings.rename(columns={'Eastern Conference': 'TEAM_NAME'}, inplace=True)

west_standings = pd.read_html(str(table_west))[0]
west_standings.rename(columns={'Western Conference': 'TEAM_NAME'}, inplace=True)
west_standings['Conference'] = "West"

standings = pd.concat([east_standings, west_standings], axis=0).reset_index(drop=True)
standings['TEAM_NAME'] = standings['TEAM_NAME'].str.replace('*', '', regex=False)

a_tags = soup.select('table.stats_table a')
        
if a_tags is not None:
    team_abbreviation = [a['href'].split('/')[2] for a in a_tags]
    team_names = [a.text for a in a_tags]
    df_teams = pd.DataFrame({'TEAM_ABBREVIATION': team_abbreviation, 'TEAM_NAME': team_names})
    df_teams.drop_duplicates(inplace=True)

standings = pd.merge(standings, df_teams, how='inner', on="TEAM_NAME")
standings

Unnamed: 0,TEAM_NAME,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference,TEAM_ABBREVIATION
0,Boston Celtics,62,20,0.756,—,109.9,104.0,6.05,East,BOS
1,Philadelphia 76ers,62,20,0.756,—,111.7,103.8,7.76,East,PHI
2,New York Knicks,50,32,0.61,12.0,107.9,106.3,2.0,East,NYK
3,Washington Bullets,39,43,0.476,23.0,105.6,105.6,0.42,East,WSB
4,New Jersey Nets,24,58,0.293,38.0,106.9,113.0,-5.15,East,NJN
5,Milwaukee Bucks,60,22,0.732,—,113.1,105.9,7.14,East,MIL
6,Chicago Bulls,45,37,0.549,15.0,109.0,107.0,2.34,East,CHI
7,Indiana Pacers,44,38,0.537,16.0,107.6,106.2,1.72,East,IND
8,Atlanta Hawks,31,51,0.378,29.0,104.9,108.0,-2.37,East,ATL
9,Cleveland Cavaliers,28,54,0.341,32.0,105.7,110.6,-4.15,East,CLE


In [4]:
table_east

<table class="suppress_all sortable stats_table" data-cols-to-freeze=",1" id="divs_standings_E">
<caption>Division Standings Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr>
<th aria-label="Eastern Conference" class="poptip sort_default_asc left" data-stat="team_name" scope="col">Eastern Conference</th>
<th aria-label="Wins" class="poptip right" data-stat="wins" data-tip="Wins" scope="col">W</th>
<th aria-label="Losses" class="poptip right" data-stat="losses" data-tip="Losses" scope="col">L</th>
<th aria-label="Win-Loss Percentage" class="poptip right" data-stat="win_loss_pct" data-tip="Win-Loss Percentage" scope="col">W/L%</th>
<th aria-label="GB" class="poptip sort_default_asc right" data-stat="gb" data-tip="Games Behind" scope="col">GB</th>
<th aria-label="Points Per Game" class="poptip right" data-stat="pts_per_g" data-tip="Points Per Game" scope="col">PS/G</th>
<th aria-label="Opponent Points Per Game" class="poptip right" data-sta

In [14]:
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    }

def extract_advanced_stats(year):
    """
    Extracts advanced stats from basketball-reference for the year
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"

    try:
        pageTree = requests.get(url, headers=headers)

        pageTree.raise_for_status()

        soup = BeautifulSoup(pageTree.content, 'html.parser')
        table = soup.find("table", id="advanced_stats")

        if table is not None:
            advanced_stats = pd.read_html(str(table))[0]

            #Drop unwanted columns
            advanced_stats.drop(['Unnamed: 19', 'Unnamed: 24'], axis=1, inplace=True)
            advanced_stats = advanced_stats[~advanced_stats['Rk'].str.contains('Rk')]

            #Remove the star for players in the Hall of Fame
            advanced_stats['Player'] = advanced_stats['Player'].str.replace('*', '')

            return advanced_stats

        else:
            print(f"No 'advanced_stats' table found for the year {year}")
            return pd.DataFrame()
                  
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None  
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # or handle the general exception as needed 

stats = extract_advanced_stats(1993)
stats[stats['Player'].str.contains('Shaquille')]


  advanced_stats['Player'] = advanced_stats['Player'].str.replace('*', '')


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
291,252,Shaquille O'Neal,C,20,ORL,81,3071,22.9,0.584,0.002,...,15.9,27.0,4.5,6.0,10.4,0.163,2.0,1.5,3.5,4.3
