In [1]:
#import required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
url = 'https://www.basketball-reference.com/leagues/NBA_2023.html'

per_game = requests.get(url)

#create BeautifulSoup object
soup = BeautifulSoup(per_game.content, 'html.parser')

#locate correct table
table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="per_game-team") 
rows = table.findAll(lambda tag: tag.name=='tr')

#create DataFrame
df = pd.read_html(str(table))[0]

#clean DataFrame
df = df.drop(index=30)
df.insert(2, "Year", 2023, True)
df = df.drop(columns=["G","Rk"])
df = df.reset_index(drop=True)

#add advanced stats to DataFrame
table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="advanced-team") 
rows = table.findAll(lambda tag: tag.name=='tr')
df_adv = pd.read_html(str(table))[0]
df_adv.columns = df_adv.columns.droplevel()
df_adv = df_adv.drop(index=30)
df_adv['Team'] = df_adv['Team'].str.replace('*', '')
df_adv = df_adv.drop(columns=["Rk", "L", "PW", "PL", "Unnamed: 17_level_1", "Unnamed: 22_level_1", "Unnamed: 27_level_1", "Arena"])
df_adv = df_adv.reset_index(drop=True)

df_team = pd.merge(df, df_adv, on='Team')

  df_adv['Team'] = df_adv['Team'].str.replace('*', '')


In [3]:
df_team.columns.values[40] = "Op_eFG%"
df_team.insert(46, "Playoff_W", 0)

In [4]:
#write data to csv
df_team.to_csv('team_per_game_2023.csv')

In [5]:
url = 'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'
url_adv = 'https://www.basketball-reference.com/leagues/NBA_2023_advanced.html'

per_game = requests.get(url)
adv = requests.get(url_adv)

#create BeautifulSoup object
soup = BeautifulSoup(per_game.content, 'html.parser')

#locate correct table
table = soup.find("table", class_="sortable stats_table")
rows = table.findAll(lambda tag: tag.name=='tr')

#create DataFrame
df = pd.read_html(str(table))[0]

#clean DataFrame
df = df.drop(index=30)
df = df.drop(columns=["G","Rk", "Tm", "GS"])
df = df.reset_index(drop=True)

per_game = requests.get(url)

#create BeautifulSoup object
soup = BeautifulSoup(adv.content, 'html.parser')

#add advanced stats to DataFrame
table = soup.find("table", class_="sortable stats_table") 
rows = table.findAll(lambda tag: tag.name=='tr')
df_adv = pd.read_html(str(table))[0]
df_adv = df_adv.drop(columns=["Rk", "G", "Tm", "MP", "Pos", "Age", "Unnamed: 19", "Unnamed: 24"])
df_adv = df_adv.reset_index(drop=True)

df_player = pd.merge(df, df_adv, on='Player')

In [6]:
#drop junk rows
df_player = df_player.drop(df_player[df_player.Player == "Player"].index)

# reformat non-latin Player characters
df_player = df_player.replace({'Player': {'Luka DonÄiÄ‡': 'Luka Doncic', 'Alperen Şengün': 'Alperen Sengun', 
'Boban Marjanović': 'Boban Marjanovic', 'Bojan Bogdanović': 'Bojan Bogdanovic', 'Bogdan Bogdanović': 'Bogdan Bogdanovic', 
'Dāvis Bertāns': 'Davis Bertans', 'Dario Šarić': 'Dario Saric', 'Dennis Schröder': 'Dennis Schroeder', 'Goran DragiÄ‡': 'Goran Dragic', 
'Jonas ValanÄiÅ«nas': 'Jonas Valanciunas', 'Juancho HernangÃ³mez': 'Juan Hernangomez', 'Jusuf NurkiÄ‡': 'Jusuf Nurkic', 
'Kristaps PorziÅ†Ä£is': 'Kristaps Porzingis', 'Moussa DiabatÃ©': 'Moussa Diabate', 'Nikola JokiÄ‡': 'Nikola Jokic', 
'Nikola VuÄeviÄ‡': 'Nikola Vucevic', 'Nikola JoviÄ‡': 'Nikola Jovic', 'ThÃ©o Maledon': 'Theo Maledon', 'Vlatko ÄŒanÄar': 'Vlatko Cancar', 
'Willy HernangÃ³mez': 'Willy Hernangomez'}})

# drop duplicate players (players traded mid-season. NOTE: this approach should be altered if significant 
# players are traded later on in the season)
df_player = df_player.drop_duplicates(subset="Player", keep='first').reset_index(drop=True)

In [23]:
df_player.loc[[396]]

Unnamed: 0,Player,Pos,Age,MP,FG,FGA,FG%,3P,3PA,3P%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
396,Dennis Schröder,PG,29,27.2,3.7,8.3,0.443,1.0,3.0,0.349,...,15.9,17.3,0.2,0.2,0.4,0.036,-3.3,-1.5,-4.8,-0.4


In [8]:
url = 'https://hoopshype.com/salaries/players/'

salary = requests.get(url)

#create BeautifulSoup object
soup = BeautifulSoup(salary.content, 'html.parser')
print(soup.table)

#locate correct table
table = soup.find("table", class_="hh-salaries-ranking-table hh-salaries-table-sortable responsive")
rows = table.findAll(lambda tag: tag.name=='tr')

#create DataFrame
df_salary = pd.read_html(str(table))[0]


<table class="hh-salaries-ranking-table hh-salaries-table-sortable responsive">
<thead>
<tr class="table-index">
<td class="rank"></td>
<td class="name">Player</td>
<td class="hh-salaries-sorted">
							2022/23						</td>
<td class="">
							2023/24						</td>
<td class="">
							2024/25						</td>
<td class="">
							2025/26						</td>
<td class="">
							2026/27						</td>
<td class="">
							2027/28						</td>
</tr>
</thead>
<tbody>
<tr>
<td class="rank">
						1.
					</td>
<td class="name">
<a href="https://hoopshype.com/player/stephen-curry/salary/">
								Stephen Curry							</a>
</td>
<td class="hh-salaries-sorted" data-value="48070014" style="color:black">
							$48,070,014						</td>
<td class="" data-value="51915615" style="color:black">
							$51,915,615						</td>
<td class="" data-value="55761217" style="color:black">
							$55,761,217						</td>
<td class="" data-value="59606817" style="color:black">
							$59,606,817						</td>
<td class="" data-value="0" st

In [9]:
# drop junk column
df_salary = df_salary.drop(columns="Unnamed: 0")

# convert salaries to int
for column in df_salary.columns:
    if column != "Player":
        df_salary[column] = df_salary[column].str.replace('[$,]', '').astype(int)


  df_salary[column] = df_salary[column].str.replace('[$,]', '').astype(int)


In [10]:
df_salary

Unnamed: 0,Player,2022/23,2023/24,2024/25,2025/26,2026/27,2027/28
0,Stephen Curry,48070014,51915615,55761217,59606817,0,0
1,John Wall,47345760,6802950,0,0,0,0
2,Russell Westbrook,47063478,0,0,0,0,0
3,LeBron James,44474988,46698737,50434636,0,0,0
4,Kevin Durant,44119845,47649433,51179020,54708608,0,0
...,...,...,...,...,...,...,...
534,Demetrius Jackson,92857,92857,0,0,0,0
535,Olivier Sarr,90665,0,0,0,0,0
536,Quenton Jackson,50000,0,0,0,0,0
537,DJ Steward,50000,0,0,0,0,0


In [11]:
# merge player dataframes
df_player_salary = pd.merge(df_player, df_salary, on='Player')
df_player_salary

Unnamed: 0,Player,Pos,Age,MP,FG,FGA,FG%,3P,3PA,3P%,...,OBPM,DBPM,BPM,VORP,2022/23,2023/24,2024/25,2025/26,2026/27,2027/28
0,Precious Achiuwa,C,23,20.4,3.0,7.7,.391,0.4,2.3,.179,...,-2.1,-1.8,-3.9,-0.1,2840160,4379526,6275861,0,0,0
1,Steven Adams,C,29,26.5,3.5,6.0,.591,0.0,0.0,.000,...,-0.6,0.7,0.1,0.4,17926829,12600000,12600000,0,0,0
2,Bam Adebayo,C,25,35.3,8.3,15.5,.535,0.0,0.3,.100,...,-0.3,0.4,0.1,0.6,30351780,32600060,34848340,37096620,0,0
3,Ochai Agbaji,SG,22,7.9,1.1,2.6,.405,0.3,1.3,.238,...,-3.9,-2.7,-6.6,-0.1,3918360,4114200,4310280,6383525,8879483,0
4,Nickeil Alexander-Walker,SG,24,15.5,2.5,5.1,.489,1.1,2.7,.420,...,0.0,0.5,0.5,0.3,5009633,7073602,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,Christian Wood,PF,27,27.8,6.5,12.0,.538,1.7,4.4,.393,...,1.5,-0.1,1.4,0.8,14317459,0,0,0,0,0
414,Delon Wright,PG,30,19.9,1.9,4.7,.394,0.6,2.4,.235,...,0.3,6.4,6.8,0.3,7804878,8195122,0,0,0,0
415,Thaddeus Young,PF,34,17.2,2.2,4.1,.543,0.1,0.7,.182,...,-0.9,1.2,0.2,0.3,8000000,8000000,0,0,0,0
416,Trae Young,PG,24,35.6,8.6,20.9,.414,2.3,7.3,.316,...,5.4,-2.3,3.1,1.4,37096500,40064220,43031940,45999660,48967380,0


In [13]:
# write data to csv
df_player_salary.to_csv('player_per_game_salary_2023.csv')
df_player.to_csv('player_per_game_2023.csv')
df_salary.to_csv('player_salary_2023.csv')