In [1]:
# GET ALL PLAYER STATS

import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict
import pickle
import requests
from unidecode import unidecode
import requests

def fix_name(full_name):
    first_name = full_name.split(' ')[0]
    if first_name == 'Peja':
        return 'Peja Stojakovic'
    elif first_name == 'Metta':
        return 'Ron Artest'
    else:
        return unidecode(full_name)

# 2008-2009 season: Allstar in 2009
# We are getting data from 2008-2009 to 2022-2023 season
FIRST_SEASON, LAST_SEASON = 2009, 2023

frames = {}
for y in range(FIRST_SEASON, LAST_SEASON + 1):
    year = y     
    frames["url" + str(y)] = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"

# read in data
num = 1
year = 2009
pl_frames = {}

for i in frames:
    pl_frames["pl" + str(num)] = pd.read_html(frames[i])[0]
    pl_frames["pl" + str(num)]['Season'] = year
    num += 1
    year += 1

# creating the data frame with all tables gathered from webpages
NBA_Player_DF = pd.concat(pl_frames)
NBA_Player_DF = NBA_Player_DF.drop_duplicates()
NBA_Player_DF['Player'] = NBA_Player_DF['Player'].apply(fix_name)
NBA_Player_DF.drop(NBA_Player_DF[NBA_Player_DF['Rk'] == 'Rk'].index, inplace = True)
NBA_Player_DF.reset_index(drop=True, inplace=True)

# export data

compression_opts = dict(method='zip',archive_name='NBA_Player_Stats.csv')  
NBA_Player_DF.to_csv('NBA_Player_Stats.zip', index=False,compression=compression_opts)

In [2]:
# GET ALLSTAR PLAYERS



# this dictionary will map players to a set containing all the years in which they were 
# selected for an all-star game, either initially or as a replacement
all_star_appearances = defaultdict(set)

# rows to ignore when iterating the roster tables
ignore_fields = set(['Team Totals', 'Reserves'])

# unidecode doesn't catch the accented c in Peja's last name (Stojakovic), fix it
# also overwrite any instance of Metta World Peace to Ron Artest


for year in range(FIRST_SEASON, LAST_SEASON + 1):

    print('Scraping ASG {} data...'.format(year))

    # will store all the all-stars for this year
    all_stars = set([])
    
    req = requests.Session()
    html = req.get('https://www.basketball-reference.com/allstar/NBA_{}.html'.format(year)).content
    soup = BeautifulSoup(html, 'html.parser')
    
    try:
        s1, s2 = soup.findAll('table')[1:3]
    except:
        print(soup)

    df1 = pd.read_html(str(s1))[0]
    df2 = pd.read_html(str(s2))[0]

    # get the all-stars from teams 1 and 2
    for df in [df1, df2]:
        for i, row in df.iterrows():
            if pd.notnull(row[0]) and row[0] not in ignore_fields:
                player = row[0]
                all_stars.add(fix_name(player))

    # gets all li elements in the page
    s3 = soup.findAll('li') 

    for s in s3:
        if 'Did not play' in str(s):
            for player in [name.get_text() for name in s.findAll('a')]: # all the injured players and their replacements
                all_stars.add(fix_name(player))
            break

    # update the appearances dictionary
    for player in all_stars:
        all_star_appearances[player].add(year)

sorted_all_star_appearances = sorted([(player, sorted(list(appearances))) for player, appearances in all_star_appearances.items()], key = lambda x : -len(x[1]))

print('\nAll all-star appearances since 2008 (sorted by number of appearances):\n')

for player, appearances in sorted_all_star_appearances:
    print('{}: {}'.format(player, appearances))


Scraping ASG 2009 data...
Scraping ASG 2010 data...
Scraping ASG 2011 data...
Scraping ASG 2012 data...
Scraping ASG 2013 data...
Scraping ASG 2014 data...
Scraping ASG 2015 data...
Scraping ASG 2016 data...
Scraping ASG 2017 data...
Scraping ASG 2018 data...
Scraping ASG 2019 data...
Scraping ASG 2020 data...
Scraping ASG 2021 data...
Scraping ASG 2022 data...
Scraping ASG 2023 data...

<!DOCTYPE html>

<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
<link href="https://cdn.ssref.net/req/202303021" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName


All all-star appearances since 2008 (sorted by number of appearances):

LeBron James: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
Kevin Durant: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2021, 2022, 2023]
Chris Paul: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2020, 2021, 2022, 2023]
James Harden: [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
Dwyane Wade: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2019]
Russell Westbrook: [2011, 2012, 2013, 2015, 2016, 2017, 2018, 2019, 2020]
Stephen Curry: [2014, 2015, 2016, 2017, 2018, 2019, 2021, 2022, 2023]
Chris Bosh: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]
Kobe Bryant: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]
Carmelo Anthony: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
Anthony Davis: [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
Dirk Nowitzki: [2009, 2010, 2011, 2012, 2014, 2015, 2019]
LaMarcus Aldridge: [2012, 2013, 2014, 2015, 20

In [8]:
from collections import defaultdict

# print(sorted_all_star_appearances)
all_star_appearances = list()

for allstar in sorted_all_star_appearances:
    player = allstar[0]
    years = allstar[1]
    for y in years:
        if y != 2008:
            all_star_appearances.append((player, y))
        
print(len(all_star_appearances))

player_stats = pd.read_csv("NBA_Player_Stats_2009-2023.csv")

# print(player_stats)


player_stats["All Star"] = 0

all_star_player_index = list()


player_stats['Player_Season'] = list(zip(player_stats['Player'], player_stats['Season']))
player_stats.loc[player_stats['Player_Season'].isin(all_star_appearances), 'All Star'] = 1
player_stats.drop('Player_Season', axis=1, inplace=True)
print(player_stats["All Star"].value_counts())
print(player_stats)

# player_stats.to_csv('player_stats_with_allstar.csv')

NameError: name 'sorted_all_star_appearances' is not defined

In [9]:
import pandas as pd
import numpy as np


player_stats_as = pd.read_csv("player_stats_with_allstar.csv")

aggregated_data = player_stats_as.groupby(['Player', 'Season'], as_index=False).agg({
    'All Star': lambda x: x.mode()[0],
    'Age': lambda x: x.mode()[0],
    'Pos': lambda x: x.mode()[0],
    'Tm': lambda x: player_stats_as.loc[x.index, 'G'].value_counts().index[0],
    'G': 'sum',
    'GS' : 'sum',
    'MP' : 'sum',
    'FG' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'FGA' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'FG%' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    '3P' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    '3PA' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    '3P%' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    '2P' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    '2PA' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    '2P%' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'eFG%' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'FT' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'FTA' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'FT%' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'ORB' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'DRB' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'TRB' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'AST' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'STL' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'BLK' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'TOV' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'PF' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
    'PTS' : lambda x: np.average(x, weights=player_stats_as.loc[x.index, 'G']),
})

print(aggregated_data)

            Player  Season  All Star  Age Pos  Tm   G  GS    MP   FG  ...  \
0       A.J. Green    2023         0   23  SG  29  29   0   9.8  1.7  ...   
1     A.J. Hammons    2017         0   24   C  22  22   0   7.4  0.8  ...   
2      A.J. Lawson    2023         0   22  SG  13  26   0  11.9  1.0  ...   
3       A.J. Price    2010         0   23  PG  56  56   2  15.4  2.6  ...   
4       A.J. Price    2011         0   24  PG  50  50   0  15.9  2.3  ...   
...            ...     ...       ...  ...  ..  ..  ..  ..   ...  ...  ...   
7469     Ömer Aşık    2014         0   27   C  48  48  19  20.2  2.1  ...   
7470     Ömer Aşık    2015         0   28   C  76  76  76  26.1  2.8  ...   
7471     Ömer Aşık    2016         0   29   C  68  68  64  17.3  1.5  ...   
7472     Ömer Aşık    2017         0   30   C  31  31  19  15.5  1.0  ...   
7473     Ömer Aşık    2018         0   31   C  18  36   0  34.0  0.5  ...   

         FT%       ORB       DRB       TRB       AST       STL       BLK  \

In [12]:
aggregated_data = aggregated_data.dropna()
print(aggregated_data)
aggregated_data.to_csv('aggregated_data.csv')

                  Player  Season  All Star  Age Pos  Tm   G  GS    MP   FG  \
0             A.J. Green    2023         0   23  SG  29  29   0   9.8  1.7   
1           A.J. Hammons    2017         0   24   C  22  22   0   7.4  0.8   
3             A.J. Price    2010         0   23  PG  56  56   2  15.4  2.6   
4             A.J. Price    2011         0   24  PG  50  50   0  15.9  2.3   
5             A.J. Price    2012         0   25  PG  44  44   1  12.9  1.3   
...                  ...     ...       ...  ...  ..  ..  ..  ..   ...  ...   
7459  Zydrunas Ilgauskas    2011         0   35   C  72  72  51  15.9  2.3   
7462        Álex Abrines    2017         0   23  SG  68  68   6  15.5  2.0   
7463        Álex Abrines    2018         0   24  SG  75  75   8  15.1  1.5   
7464        Álex Abrines    2019         0   25  SG  31  31   2  19.0  1.8   
7468           Ömer Aşık    2013         0   26   C  82  82  82  30.0  4.0   

      ...    FT%  ORB  DRB   TRB  AST  STL  BLK  TOV   PF   PTS