In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
def extract_player_stats_pg(year):
    """Scrape the per-game player table of one NBA season from basketball-reference.com.

    Args:
        year: Season identifier inserted into the page URL and stored in the
            ``Season`` column of the result.

    Returns:
        A DataFrame with one row per selected table row, columns named after the
        table header cells plus ``Season``. If the HTTP status is not 200 the
        integer status code is returned instead, so callers must check the type.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    if response.status_code != 200:
        return response.status_code

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so results could differ between environments.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'per_game_stats'})
    header = [cell.text for cell in table.find('thead').find_all('th')]

    # Only rows carrying one of these classes are kept; other rows in the body are skipped.
    rows = table.find('tbody').find_all('tr', {'class': ['full_table', 'italic_text partial_table']})
    players = [[stat.text for stat in row.find_all(['td', 'th'])] for row in rows]

    df_player_stats_pg = pd.DataFrame(players)
    df_player_stats_pg.columns = header

    # Empty cells become '0' so the column can be parsed as numeric; note that this
    # makes a missing value indistinguishable from a true zero.
    df_player_stats_pg = df_player_stats_pg.replace('', '0')

    def _numeric_or_original(column):
        # Same effect as the deprecated pd.to_numeric(..., errors='ignore'):
        # convert when the whole column parses, otherwise keep it unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df_player_stats_pg = df_player_stats_pg.apply(_numeric_or_original)

    # Drop leading/trailing asterisks that the site attaches to some player names.
    df_player_stats_pg['Player'] = df_player_stats_pg['Player'].str.strip('*')

    df_player_stats_pg['Season'] = year

    return df_player_stats_pg
        
    

In [3]:
# Smoke-test the per-game scraper on the 1984 season; the returned frame is the cell's output.
extract_player_stats_pg(1984)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,1,Kareem Abdul-Jabbar,C,36,LAL,80,80,32.8,9.0,15.5,...,2.1,5.2,7.3,2.6,0.7,1.8,2.8,2.6,21.5,1984
1,2,Alvan Adams,C,29,PHO,70,13,20.7,3.8,8.3,...,1.7,2.9,4.6,3.1,1.0,0.4,1.7,2.8,9.6,1984
2,3,Mark Aguirre,SF,24,DAL,79,79,36.7,11.7,22.3,...,2.0,3.9,5.9,4.5,1.0,0.3,3.6,3.1,29.5,1984
3,4,Danny Ainge,SG,24,BOS,71,3,16.3,2.3,5.1,...,0.4,1.2,1.6,2.3,0.6,0.1,1.0,2.0,5.4,1984
4,5,J.J. Anderson,SF,23,UTA,48,0,6.5,1.1,2.7,...,0.8,0.5,1.3,0.5,0.3,0.2,0.4,0.6,2.5,1984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,306,Randy Wittman,SG,24,ATL,78,1,13.7,2.1,4.1,...,0.2,0.7,0.9,0.9,0.2,0.0,0.4,1.1,4.5,1984
339,307,Al Wood,SG,25,SEA,81,81,27.6,5.8,11.7,...,1.2,2.2,3.4,2.0,0.8,0.4,1.6,2.6,14.3,1984
340,308,Mike Woodson,SG,25,KCK,71,12,25.9,5.5,11.5,...,0.9,1.6,2.5,2.5,1.2,0.4,1.6,2.5,14.5,1984
341,309,Orlando Woolridge,SF,24,CHI,75,74,33.9,7.6,14.5,...,1.7,3.2,4.9,1.8,0.9,0.8,2.5,3.4,19.3,1984


In [4]:
def extract_player_stats_totals(year):
    """Scrape the season-totals player table of one NBA season from basketball-reference.com.

    Args:
        year: Season identifier inserted into the page URL and stored in the
            ``Season`` column of the result.

    Returns:
        A DataFrame with one row per selected table row, columns named after the
        table header cells plus ``Season``. If the HTTP status is not 200 the
        integer status code is returned instead, so callers must check the type.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    if response.status_code != 200:
        return response.status_code

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so results could differ between environments.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'totals_stats'})
    header = [cell.text for cell in table.find('thead').find_all('th')]

    # Only rows carrying one of these classes are kept; other rows in the body are skipped.
    rows = table.find('tbody').find_all('tr', {'class': ['full_table', 'italic_text partial_table']})
    players = [[stat.text for stat in row.find_all(['td', 'th'])] for row in rows]

    df_player_stats_totals = pd.DataFrame(players)
    df_player_stats_totals.columns = header

    # Empty cells become '0' so the column can be parsed as numeric; note that this
    # makes a missing value indistinguishable from a true zero.
    df_player_stats_totals = df_player_stats_totals.replace('', '0')

    def _numeric_or_original(column):
        # Same effect as the deprecated pd.to_numeric(..., errors='ignore'):
        # convert when the whole column parses, otherwise keep it unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df_player_stats_totals = df_player_stats_totals.apply(_numeric_or_original)

    # Drop leading/trailing asterisks that the site attaches to some player names.
    df_player_stats_totals['Player'] = df_player_stats_totals['Player'].str.strip('*')

    df_player_stats_totals['Season'] = year

    return df_player_stats_totals

In [5]:
# Smoke-test the season-totals scraper on the 1984 season; the returned frame is the cell's output.
extract_player_stats_totals(1984)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,1,Kareem Abdul-Jabbar,C,36,LAL,80,80,2622,716,1238,...,169,418,587,211,55,143,221,211,1717,1984
1,2,Alvan Adams,C,29,PHO,70,13,1452,269,582,...,118,201,319,219,73,31,117,195,670,1984
2,3,Mark Aguirre,SF,24,DAL,79,79,2900,925,1765,...,161,308,469,358,80,22,285,246,2330,1984
3,4,Danny Ainge,SG,24,BOS,71,3,1154,166,361,...,29,87,116,162,41,4,70,143,384,1984
4,5,J.J. Anderson,SF,23,UTA,48,0,311,55,130,...,38,25,63,22,15,9,20,28,122,1984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,306,Randy Wittman,SG,24,ATL,78,1,1071,160,318,...,14,57,71,71,17,0,32,82,350,1984
339,307,Al Wood,SG,25,SEA,81,81,2236,467,945,...,94,181,275,166,64,32,126,207,1160,1984
340,308,Mike Woodson,SG,25,KCK,71,12,1838,389,816,...,62,113,175,175,83,28,115,174,1027,1984
341,309,Orlando Woolridge,SF,24,CHI,75,74,2544,570,1086,...,130,239,369,136,71,60,188,253,1444,1984


In [6]:
def extract_player_stats_advanced(year):
    """Scrape the advanced-metrics player table of one NBA season from basketball-reference.com.

    Args:
        year: Season identifier inserted into the page URL and stored in the
            ``Season`` column of the result.

    Returns:
        A DataFrame with one row per selected table row, columns named after the
        table header cells (minus the blank spacer columns) plus ``Season``. If the
        HTTP status is not 200 the integer status code is returned instead, so
        callers must check the type.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    if response.status_code != 200:
        return response.status_code

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so results could differ between environments.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'advanced_stats'})
    header = [cell.text for cell in table.find('thead').find_all('th')]

    # Only rows carrying one of these classes are kept; other rows in the body are skipped.
    rows = table.find('tbody').find_all('tr', {'class': ['full_table', 'italic_text partial_table']})
    players = [[stat.text for stat in row.find_all(['td', 'th'])] for row in rows]

    df_player_stats_advanced = pd.DataFrame(players)
    df_player_stats_advanced.columns = header

    # Columns whose header is a lone non-breaking space carry no data name; remove them.
    df_player_stats_advanced = df_player_stats_advanced.drop(columns = '\xa0')

    # Empty cells become '0' so the column can be parsed as numeric; note that this
    # makes a missing value indistinguishable from a true zero.
    df_player_stats_advanced = df_player_stats_advanced.replace('', '0')

    def _numeric_or_original(column):
        # Same effect as the deprecated pd.to_numeric(..., errors='ignore'):
        # convert when the whole column parses, otherwise keep it unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df_player_stats_advanced = df_player_stats_advanced.apply(_numeric_or_original)

    # Drop leading/trailing asterisks that the site attaches to some player names.
    df_player_stats_advanced['Player'] = df_player_stats_advanced['Player'].str.strip('*')

    df_player_stats_advanced['Season'] = year

    return df_player_stats_advanced

In [7]:
# Smoke-test the advanced-metrics scraper on the 1984 season; the returned frame is the cell's output.
extract_player_stats_advanced(1984)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Season
0,1,Kareem Abdul-Jabbar,C,36,LAL,80,2622,21.3,0.608,0.001,...,25.1,5.9,3.1,8.9,0.163,2.8,0.2,3.0,3.3,1984
1,2,Alvan Adams,C,29,PHO,70,1452,16.6,0.513,0.007,...,21.7,1.5,1.8,3.3,0.109,0.7,0.9,1.6,1.3,1984
2,3,Mark Aguirre,SF,24,DAL,79,2900,23.5,0.572,0.032,...,33.4,7.0,2.0,9.0,0.149,5.0,-1.2,3.7,4.1,1984
3,4,Danny Ainge,SG,24,BOS,71,1154,10.4,0.498,0.061,...,16.2,0.5,1.3,1.8,0.076,-2.2,0.4,-1.8,0.0,1984
4,5,J.J. Anderson,SF,23,UTA,48,311,11.6,0.427,0.023,...,20.8,-0.2,0.3,0.1,0.013,-2.1,-1.0,-3.1,-0.1,1984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,306,Randy Wittman,SG,24,ATL,78,1071,9.3,0.517,0.016,...,14.9,0.6,0.6,1.2,0.054,-2.0,-0.9,-3.0,-0.3,1984
339,307,Al Wood,SG,25,SEA,81,2236,14.9,0.545,0.022,...,22.1,2.7,1.7,4.4,0.094,-0.2,-0.7,-0.9,0.6,1984
340,308,Mike Woodson,SG,25,KCK,71,1838,15.9,0.541,0.010,...,23.3,2.6,1.6,4.2,0.110,0.4,0.1,0.5,1.2,1984
341,309,Orlando Woolridge,SF,24,CHI,75,2544,15.9,0.567,0.002,...,23.8,3.0,2.3,5.3,0.100,0.7,-0.6,0.1,1.4,1984


In [8]:
def extract_mvp_votes(year):
    """Scrape the MVP voting table of one season from basketball-reference.com.

    Args:
        year: Season identifier inserted into the awards page URL and stored in
            the ``Season`` column of the result.

    Returns:
        A DataFrame with columns ``Player``, ``Votes``, ``Share``, ``MaxVotes`` and
        ``Season``, one row per row of the ``mvp`` table body. If the HTTP status
        is not 200 the integer status code is returned instead, so callers must
        check the type.
    """
    url = f'https://www.basketball-reference.com/awards/awards_{year}.html#mvp'
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    if response.status_code != 200:
        return response.status_code

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so results could differ between environments.
    soup = BeautifulSoup(response.text, 'html.parser')
    table_body = soup.find('table', {'id': 'mvp'}).find('tbody')

    mvp_votes = []
    for row in table_body.find_all('tr'):
        # The player name is the link text inside the row's first <td>.
        player = row.find('td').find('a').text
        votes = row.find('td', {'data-stat': 'points_won'}).text
        share = row.find('td', {'data-stat': 'award_share'}).text
        # Named max_votes rather than `max` so the builtin is not shadowed.
        max_votes = row.find('td', {'data-stat': 'points_max'}).text
        # int(float(...)) also accepts a vote count rendered with a decimal point.
        mvp_votes.append([player, int(float(votes)), float(share), int(max_votes)])

    df_mvp_votes = pd.DataFrame(mvp_votes, columns = ['Player', 'Votes', 'Share', 'MaxVotes'])

    df_mvp_votes['Season'] = year

    return df_mvp_votes

In [9]:
# Smoke-test the MVP-vote scraper on the 1984 season; the returned frame is the cell's output.
extract_mvp_votes(1984)

Unnamed: 0,Player,Votes,Share,MaxVotes,Season
0,Larry Bird,652,0.858,760,1984
1,Bernard King,373,0.491,760,1984
2,Magic Johnson,305,0.401,760,1984
3,Kareem Abdul-Jabbar,153,0.201,760,1984
4,Isiah Thomas,115,0.151,760,1984
5,Julius Erving,98,0.129,760,1984
6,Adrian Dantley,88,0.116,760,1984
7,Sidney Moncrief,70,0.092,760,1984
8,Jeff Ruland,38,0.05,760,1984
9,Moses Malone,35,0.046,760,1984


In [10]:
def get_team_record(team, year):
    """Fetch a team's season record string from its basketball-reference.com season page.

    Args:
        team: Team abbreviation used in the page URL.
        year: Season identifier used in the page URL.

    Returns:
        The second whitespace-separated token of the first ``<p>`` inside the team
        summary block, with surrounding commas stripped. Callers split it on '-',
        so it is presumably of the form 'wins-losses' (confirm against the live
        page). If the HTTP status is not 200 the integer status code is returned
        instead, so callers must check the type.
    """
    url = f'https://www.basketball-reference.com/teams/{team}/{year}.html'
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    if response.status_code != 200:
        return response.status_code

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so results could differ between environments.
    soup = BeautifulSoup(response.text, 'html.parser')
    page_body = soup.find('div', {'data-template': 'Partials/Teams/Summary'})
    record = page_body.find('p').text.split()[1].strip(',')
    return record
    

In [62]:
def get_season_records(df):
    """Build a per-team table of win percentage and games played for one season.

    Args:
        df: Player-stats frame with at least the columns ``Season`` and ``Tm``.
            The season is taken from the first unique ``Season`` value.

    Returns:
        A DataFrame with one row per distinct team in ``df`` (the aggregate
        pseudo-team ``'TOT'`` excluded) and the columns ``Tm``, ``%W`` (wins
        divided by games, rounded to 3 decimals) and ``GT`` (wins + losses).

    Note:
        ``get_team_record`` is called once per team, i.e. one HTTP request each.
    """
    year = df['Season'].unique()[0]
    # Filter instead of list.remove('TOT'): remove() raised ValueError whenever the
    # frame had no 'TOT' row (e.g. a pre-filtered frame).
    teams = [team for team in df['Tm'].unique() if team != 'TOT']

    df_season_records = pd.DataFrame(teams, columns = ['Tm'])
    record_text = df_season_records['Tm'].apply(get_team_record, args = (year,))
    wins = record_text.apply(lambda record: record.split('-')[0]).astype(int)
    losses = record_text.apply(lambda record: record.split('-')[1]).astype(int)
    games = wins + losses

    df_season_records['%W'] = round(wins / games, 3)
    df_season_records['GT'] = games
    return df_season_records

In [49]:
# Scrape the 1984 per-game table and look up the record of every team appearing in it.
# NOTE(review): the saved output of this cell shows a `PCT` column, while the current
# get_season_records writes `%W` — the output looks stale; re-run the cell to refresh it.
get_season_records(extract_player_stats_pg(1984))

Unnamed: 0,Tm,PCT,GT
0,LAL,0.659,82
1,PHO,0.5,82
2,DAL,0.524,82
3,BOS,0.756,82
4,UTA,0.549,82
5,DEN,0.463,82
6,MIL,0.61,82
7,DET,0.598,82
8,CLE,0.341,82
9,HOU,0.354,82


In [50]:
# Inputs for the merge experiment: df1 = 1984 per-game player stats,
# df2 = per-team records derived from the teams present in df1.
df1 = extract_player_stats_pg(1984)
df2 = get_season_records(df1)

In [63]:
def fillna_tot_team(series, df):
    """Fill the missing team-record fields of a player's aggregate ('TOT') row.

    Meant to be used row-wise (``df.apply(..., axis = 1, args = (df,))``). Rows
    whose ``%W`` is already set are returned untouched. Otherwise ``%W`` becomes
    the games-weighted average of ``%W`` over the same player's non-'TOT' rows in
    ``df`` (divided by the row's own ``G``, rounded to 3 decimals) and ``GT``
    becomes the largest ``GT`` among those rows.

    Args:
        series: One row of ``df``; it is modified in place and returned.
        df: Frame holding all rows, with columns ``Player``, ``Tm``, ``G``,
            ``%W`` and ``GT``.

    Returns:
        The (possibly updated) row.
    """
    if not pd.isna(series['%W']):
        return series

    same_player = df['Player'] == series['Player']
    real_team = df['Tm'] != 'TOT'
    team_rows = df[same_player & real_team]

    weighted_pct_sum = (team_rows['G'] * team_rows['%W']).sum()
    series['%W'] = round(weighted_pct_sum / series['G'], 3)
    series['GT'] = team_rows['GT'].max()
    return series

In [57]:
# Left join keeps every player row; rows whose Tm has no entry in df2 (such as 'TOT',
# which get_season_records leaves out) end up with NaN in the record columns.
df3 = pd.merge(left = df1, right = df2, how = 'left', on = 'Tm')

In [58]:
# Row-wise pass that fills the NaN record fields left by the merge; the whole frame is
# passed as an extra argument so each row can look up the same player's other rows.
df4 = df3.apply(fillna_tot_team, axis = 1, args = (df3,))

In [59]:
# Spot-check one player who has a 'TOT' row plus per-team rows to verify the filled values.
df4[df4['Player'] == 'Keith Edmonson']

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,Season,PCT,GT
75,72,Keith Edmonson,SG,23,TOT,55,0,11.3,2.9,5.8,...,1.6,0.6,0.5,0.1,1.1,1.5,7.5,1984,0.454,82.0
76,72,Keith Edmonson,SG,23,SAS,40,0,13.0,3.4,6.9,...,1.8,0.7,0.6,0.2,1.1,1.7,8.7,1984,0.451,82.0
77,72,Keith Edmonson,SG,23,DEN,15,0,6.7,1.5,3.1,...,1.2,0.5,0.3,0.1,1.1,1.1,4.3,1984,0.463,82.0


In [72]:
def extract_player_stats(year, mvp = True, team_stats = True, advanced = False, ranks = False):
    """Assemble one season's player table from the individual scrapers.

    Per-game and totals tables are always joined (inner join on their shared
    columns; the remaining columns get ``_pg`` / ``_tot`` suffixes). A ``%GS``
    column (``GS`` / ``G``, 3 decimals) is always added.

    Args:
        year: Season to scrape.
        mvp: If true, left-join MVP voting on ``Player``/``Season``; players
            without votes get 0 in ``Votes``, ``Share`` and ``MaxVotes``.
        team_stats: If true, add ``%W`` and ``GT`` per team (filled for 'TOT'
            rows via ``fillna_tot_team``) and ``%G`` = ``G`` / ``GT``.
        advanced: If true, inner-join the advanced table and drop its ``MP``.
        ranks: If true, add a dense ``<column>_rank`` for every column from
            ``MP_pg`` up to the last stat column available for the chosen
            options (``Season`` excluded), using pandas' default rank order.

    Returns:
        The combined DataFrame.
    """
    pg_tot_keys = ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'FG%', '3P%', '2P%', 'eFG%', 'FT%', 'Season']
    adv_keys = ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'Season']

    per_game = extract_player_stats_pg(year)
    totals = extract_player_stats_totals(year)
    stats = pd.merge(per_game, totals, how = 'inner', on = pg_tot_keys, suffixes = ['_pg', '_tot'])

    if advanced:
        adv = extract_player_stats_advanced(year)
        stats = pd.merge(stats, adv, how = 'inner', on = adv_keys)
        # Minutes are already present as MP_pg / MP_tot.
        stats = stats.drop(columns = 'MP')

    if team_stats:
        season_records = get_season_records(stats)
        stats = pd.merge(stats, season_records, how = 'left', on = 'Tm')
        stats = stats.apply(fillna_tot_team, axis = 1, args = (stats,))
        stats['GT'] = stats['GT'].astype(int)
        stats['%G'] = (stats['G'] / stats['GT']).round(3)

    if ranks:
        # The ranked slice always starts at MP_pg; its right edge depends on which
        # optional column groups were appended above.
        if team_stats:
            last_ranked = '%W'
        elif advanced:
            last_ranked = 'VORP'
        else:
            last_ranked = 'PTS_tot'
        rank_columns = (
            stats.loc[:, 'MP_pg':last_ranked]
            .rank(method = 'dense')
            .drop(columns = 'Season')
            .add_suffix('_rank')
        )
        stats = stats.join(rank_columns)

    stats['%GS'] = (stats['GS'] / stats['G']).round(3)

    if mvp:
        mvp_votes = extract_mvp_votes(year)
        stats = pd.merge(stats, mvp_votes, how = 'left', on = ['Player', 'Season'])
        stats = stats.fillna({'Votes': 0, 'Share': 0, 'MaxVotes': 0})
        vote_counts = ['Votes', 'MaxVotes']
        stats[vote_counts] = stats[vote_counts].astype(int)

    return stats

In [70]:
# Sanity-check the combined frame for the 2000 season with rank columns enabled:
# .info() lists every column with its non-null count and dtype.
extract_player_stats(2000, ranks = True).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496 entries, 0 to 495
Data columns (total 98 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rk            496 non-null    int64  
 1   Player        496 non-null    object 
 2   Pos           496 non-null    object 
 3   Age           496 non-null    int64  
 4   Tm            496 non-null    object 
 5   G             496 non-null    int64  
 6   GS            496 non-null    int64  
 7   MP_pg         496 non-null    float64
 8   FG_pg         496 non-null    float64
 9   FGA_pg        496 non-null    float64
 10  FG%           496 non-null    float64
 11  3P_pg         496 non-null    float64
 12  3PA_pg        496 non-null    float64
 13  3P%           496 non-null    float64
 14  2P_pg         496 non-null    float64
 15  2PA_pg        496 non-null    float64
 16  2P%           496 non-null    float64
 17  eFG%          496 non-null    float64
 18  FT_pg         496 non-null    

In [71]:
def extract_player_stats_seasons(years, mvp = True, team_stats = True, advanced = False, ranks = False):
    """Scrape several seasons and stack them into one frame.

    Args:
        years: Iterable of seasons; each one is passed to ``extract_player_stats``.
        mvp, team_stats, advanced, ranks: Forwarded unchanged to
            ``extract_player_stats`` for every season.

    Returns:
        The per-season frames concatenated in iteration order with a fresh
        0..n-1 index.
    """
    season_frames = [
        extract_player_stats(season, mvp, team_stats, advanced, ranks)
        for season in years
    ]
    return pd.concat(season_frames, ignore_index = True)

In [74]:
# Build the full dataset: seasons 1980 through 2021 (range's end is exclusive) with advanced
# metrics and rank columns; mvp and team_stats keep their default of True.
# Slow cell: every season triggers several page downloads plus one per team.
final_df = extract_player_stats_seasons(range(1980, 2022), advanced = True, ranks = True)

In [81]:
# Persist the assembled table. The target directory is created first so the cell also
# works on a fresh checkout where `dataframes/` does not exist yet (to_pickle itself
# does not create missing directories).
output_path = Path('dataframes/complete_stats.pkl')
output_path.parent.mkdir(parents = True, exist_ok = True)
final_df.to_pickle(output_path)