In [1]:
#import required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from difflib import SequenceMatcher


In [2]:
#request html content, both for regular season and playoffs
url_base_reg = 'https://www.basketball-reference.com/leagues/NBA_{}.html'
url_base_play = 'https://www.basketball-reference.com/playoffs/NBA_{}.html'
url_base_payroll = 'https://hoopshype.com/salaries/{}/'

years = [year for year in range(2013, 2023)]

teams = ["Boston Celtics", "Brooklyn Nets", "New York Knicks", "Philadelphia 76ers", "Toronto Raptors", "Chicago Bulls",
"Cleveland Cavaliers", "Detroit Pistons", "Indiana Pacers", "Milwaukee Bucks", "Atlanta Hawks", "Charlotte Hornets",
"Miami Heat", "Orlando Magic", "Washington Wizards", "Denver Nuggets", "Minnesota Timberwolves", "Oklahoma City Thunder",
"Portland Trail Blazers", "Utah Jazz", "Golden State Warriors", "Los Angeles Clippers", "Los Angeles Lakers", "Phoenix Suns",
"Sacramento Kings", "Dallas Mavericks", "Houston Rockets", "Memphis Grizzlies", "New Orleans Pelicans", "San Antonio Spurs", "Charlotte Bobcats", "New Orleans Hornets"]

df_main = pd.DataFrame()

cpi = pd.read_csv('data/cpi-u_2013-22.csv')
bri = pd.read_csv('data/total_league_revenues.csv')

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [3]:
for i in years:

    url_reg = url_base_reg.format(i)

    per_game = requests.get(url_reg)

    #create BeautifulSoup object
    soup = BeautifulSoup(per_game.content, 'html.parser')

    #locate correct table
    table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="per_game-team") 
    rows = table.findAll(lambda tag: tag.name=='tr')

    #create DataFrame
    df = pd.read_html(str(table))[0]

    #clean DataFrame
    df = df.drop(index=30)
    df['Team'] = df['Team'].str.replace('*', '')
    df.insert(2, "Year", i, True)
    df = df.drop(columns=['Rk', 'G', 'MP'])


    #add advanced stats to DataFrame
    table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="advanced-team") 
    rows = table.findAll(lambda tag: tag.name=='tr')
    df_adv = pd.read_html(str(table))[0]
    
    df_adv.columns = df_adv.columns.droplevel()
    df_adv = df_adv.drop(index=30)
    df_adv['Team'] = df_adv['Team'].str.replace('*', '')
    df_adv = df_adv.drop(columns=["Rk", "L", "PW", "PL", "Unnamed: 17_level_1", "Unnamed: 22_level_1", "Unnamed: 27_level_1", "Arena"])


    #add playoff wins to DataFrame
    url_play = url_base_play.format(i)
    playoff_wins = requests.get(url_play)
    soup = BeautifulSoup(playoff_wins.content, 'html.parser')
    table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="advanced-team")
    df_play = pd.read_html(str(table))[0]
    
    df_play.columns = df_play.columns.droplevel()
    df_play = df_play.drop(index=16)
    df_play = df_play.rename(columns={"Tm": "Team", "W" : "Playoff_W"})
    df_play = df_play[["Team", "Playoff_W"]]


    #add payroll data
    url_payroll = url_base_payroll.format(f'{i-1}-{i}')
    payrolls = requests.get(url_payroll)
    soup = BeautifulSoup(payrolls.content, 'html.parser')
    table = soup.find("table", class_="hh-salaries-ranking-table hh-salaries-table-sortable responsive")
    rows = table.findAll(lambda tag: tag.name=='tr')
    df_payroll = pd.read_html(str(table))[0]

    df_payroll = df_payroll.iloc[:, [1,2]]
    df_payroll[f'{i-1}/{i-2000}'] = df_payroll[f'{i-1}/{i-2000}'].str.replace('[$,]', '').astype(int)
    df_payroll = df_payroll.rename(columns={f'{i-1}/{i-2000}': 'Payroll'})

    team_replace = {row: team for row in df_payroll['Team'].to_list() for team in teams if similar(row, team)>=0.53}

    for index, team in team_replace.items():
        df_payroll.loc[df_payroll['Team'] == index, 'Team'] = team
            
    # adjust payroll for inflation, basketball-related-income
    df_payroll['Adjusted Payroll'] = df_payroll['Payroll'] / (cpi[cpi['Year'] == (i - 1)]['Annual'].values[0] 
    + bri[bri['Year'] == (i - 1)]['Index'].values[0])


    # merge all datasets
    df = pd.merge(df, df_adv, on='Team')
    df = pd.merge(df, df_play, on='Team')
    df = pd.merge(df, df_payroll, on='Team')

    df_main = pd.concat([df_main, df])
    df_main = df_main.reset_index(drop=True)


  df['Team'] = df['Team'].str.replace('*', '')
  df_adv['Team'] = df_adv['Team'].str.replace('*', '')
  df_payroll[f'{i-1}/{i-2000}'] = df_payroll[f'{i-1}/{i-2000}'].str.replace('[$,]', '').astype(int)
  df['Team'] = df['Team'].str.replace('*', '')
  df_adv['Team'] = df_adv['Team'].str.replace('*', '')
  df_payroll[f'{i-1}/{i-2000}'] = df_payroll[f'{i-1}/{i-2000}'].str.replace('[$,]', '').astype(int)
  df['Team'] = df['Team'].str.replace('*', '')
  df_adv['Team'] = df_adv['Team'].str.replace('*', '')
  df_payroll[f'{i-1}/{i-2000}'] = df_payroll[f'{i-1}/{i-2000}'].str.replace('[$,]', '').astype(int)
  df['Team'] = df['Team'].str.replace('*', '')
  df_adv['Team'] = df_adv['Team'].str.replace('*', '')
  df_payroll[f'{i-1}/{i-2000}'] = df_payroll[f'{i-1}/{i-2000}'].str.replace('[$,]', '').astype(int)
  df['Team'] = df['Team'].str.replace('*', '')
  df_adv['Team'] = df_adv['Team'].str.replace('*', '')
  df_payroll[f'{i-1}/{i-2000}'] = df_payroll[f'{i-1}/{i-2000}'].str.replace('[$,]', '').as

In [4]:
df_main

Unnamed: 0,Team,Year,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,...,FT/FGA,eFG%,TOV%,DRB%,FT/FGA.1,Attend.,Attend./G,Playoff_W,Payroll,Adjusted Payroll
0,Denver Nuggets,2013,40.7,85.2,0.478,6.4,18.5,0.343,34.4,66.6,...,0.216,0.493,14.3,71.8,0.193,730616.0,17820.0,2.0,63444254,172431.041284
1,Houston Rockets,2013,38.1,82.7,0.461,10.6,28.9,0.366,27.5,53.8,...,0.232,0.502,13.5,75.2,0.196,683564.0,16672.0,2.0,48048265,130587.276916
2,Oklahoma City Thunder,2013,38.1,79.3,0.481,7.3,19.4,0.377,30.8,60.0,...,0.280,0.469,13.5,73.4,0.197,746323.0,18203.0,5.0,68924100,187324.360888
3,San Antonio Spurs,2013,39.1,81.4,0.481,8.1,21.5,0.376,31.1,59.9,...,0.204,0.480,13.7,74.9,0.179,755700.0,18432.0,15.0,69838600,189809.821387
4,Miami Heat,2013,38.4,77.4,0.496,8.7,22.1,0.396,29.6,55.4,...,0.224,0.487,14.8,73.0,0.200,819290.0,19983.0,16.0,80427933,218589.885784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Golden State Warriors,2022,40.5,86.4,0.469,14.3,39.4,0.364,26.2,47.0,...,0.181,0.509,13.0,78.7,0.201,740624.0,18064.0,16.0,178980766,349607.695551
151,Miami Heat,2022,39.6,84.8,0.467,13.6,35.8,0.379,26.0,49.0,...,0.204,0.524,13.8,78.0,0.209,804761.0,19628.0,11.0,140840240,275106.833252
152,Philadelphia 76ers,2022,39.4,84.5,0.466,11.6,31.8,0.364,27.8,52.7,...,0.232,0.524,12.1,76.8,0.192,846867.0,20655.0,6.0,148922969,290895.033976
153,Toronto Raptors,2022,40.6,91.3,0.445,11.9,34.2,0.349,28.7,57.1,...,0.177,0.535,14.4,75.6,0.199,547343.0,13350.0,2.0,134896484,263496.743047


In [5]:
#write data to csv
df_main.to_csv('data/playoff_per_game_2013-22.csv')