In [2]:
import requests
import pandas as pd
from pathlib import Path
import io # To fix the FutureWarning

# Season to scrape and the Spotrac team-cap page for that season.
YEAR = "2024"
TARGET_URL = f"https://www.spotrac.com/nba/cap/_/year/{YEAR}"

# The project root is taken to be two directories above the current working
# directory; the output file lives under <root>/data/raw/.
PROJECT_ROOT = Path.cwd().resolve().parent.parent
RAW_SALARY_FILE = PROJECT_ROOT.joinpath("data", "raw", "raw_team_caps.csv")

# Echo the resolved settings so a wrong working directory is obvious at a glance.
print(f"Project Root: {PROJECT_ROOT}")
print(f"Target URL: {TARGET_URL}")
print(f"Target File: {RAW_SALARY_FILE}")

Project Root: /Users/macychen/VSCodeProjects/Analysis-of-NBA-Contract-Valuation-Bias
Target URL: https://www.spotrac.com/nba/cap/_/year/2024
Target File: /Users/macychen/VSCodeProjects/Analysis-of-NBA-Contract-Valuation-Bias/data/raw/raw_team_caps.csv


In [33]:
# Fetch the Spotrac team-cap page and parse its first HTML table into
# `all_teams_df`. On any handled failure a message is printed and
# `all_teams_df` stays an empty DataFrame.

# Send a desktop-browser User-Agent instead of the default python-requests one.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Names applied to the scraped table when its column count matches.
CAP_TABLE_COLUMNS = ['rank', 'team', 'record', 'active_players', 'avg_team_age',
                     'total_cap_used', 'remaining_cap_space', 'active_cap',
                     'active_top_3', 'dead_cap']

all_teams_df = pd.DataFrame()

try:
    # Bound the request: without a timeout an unresponsive server would hang
    # the kernel indefinitely. Timeouts are RequestException subclasses.
    response = requests.get(TARGET_URL, headers=headers, timeout=30)
    response.raise_for_status()

    # Wrap the HTML in io.StringIO to avoid the pandas FutureWarning
    tables = pd.read_html(io.StringIO(response.text))
except requests.exceptions.RequestException as e:
    print(f"Error fetching page: {e}")
except ValueError as e:
    # pd.read_html raises ValueError (it does not return an empty list) when
    # the page contains no parsable <table>.
    print(f"No tables found. ({e})")
else:
    if tables:
        # Directly assign the first table to the DataFrame
        all_teams_df = tables[0]

        # Rename columns only when the layout matches what we expect;
        # otherwise say so, because the next cell relies on these names.
        if len(all_teams_df.columns) == len(CAP_TABLE_COLUMNS):
            all_teams_df.columns = CAP_TABLE_COLUMNS
        else:
            print(
                f"Unexpected column count ({len(all_teams_df.columns)}); "
                f"columns left unchanged: {list(all_teams_df.columns)}"
            )
    else:
        print("No tables found.")

all_teams_df

Unnamed: 0,rank,team,record,active_players,avg_team_age,total_cap_used,remaining_cap_space,active_cap,active_top_3,dead_cap
0,1.0,DET DET,44-38,15.0,25.0,"$141,808,220","$-1,220,220","$133,989,678","$56,559,037","$7,818,542"
1,2.0,ORL ORL,41-41,15.0,25.3,"$152,959,238","$-12,371,238","$152,959,238","$63,257,000",-
2,3.0,UTA UTA,17-65,15.0,23.9,"$154,114,022","$-13,526,022","$142,648,013","$86,906,400","$11,466,009"
3,4.0,SAS SAS,34-48,15.0,25.2,"$162,809,380","$-22,221,380","$155,231,927","$83,196,166","$7,577,453"
4,5.0,OKC OKC,68-14,15.0,24.2,"$166,001,694","$-25,413,694","$162,209,242","$82,859,950","$3,792,452"
5,6.0,CHA CHA,19-63,15.0,25.2,"$167,423,486","$-26,835,486","$152,436,862","$80,445,913","$14,986,624"
6,7.0,MEM MEM,48-34,15.0,24.6,"$167,814,933","$-27,226,933","$155,850,296","$95,988,718","$8,014,853"
7,8.0,CHI CHI,39-43,15.0,24.6,"$168,006,873","$-27,418,873","$157,180,244","$59,395,348","$8,739,110"
8,9.0,PHI PHI,24-58,15.0,26.6,"$170,340,647","$-29,752,647","$169,548,876","$135,768,738","$791,771"
9,10.0,IND IND,50-32,15.0,25.7,"$171,232,577","$-30,644,577","$166,839,774","$104,281,300","$2,530,538"


In [None]:
# Derive the cleaned team-cap table from the scraped frame in a single chain.
team_caps_df = (
    all_teams_df
    # The rank column is not carried forward.
    .drop(columns='rank')
    # Team cells read like 'DET DET'; keep only the first whitespace-separated token.
    .assign(team=lambda frame: frame['team'].str.split().str[0])
    # Discard the final two rows.
    # NOTE(review): presumably non-team summary rows on the page -- confirm.
    .iloc[:-2]
)

team_caps_df

Unnamed: 0,team,record,active_players,avg_team_age,total_cap_used,remaining_cap_space,active_cap,active_top_3,dead_cap
0,DET,44-38,15.0,25.0,"$141,808,220","$-1,220,220","$133,989,678","$56,559,037","$7,818,542"
1,ORL,41-41,15.0,25.3,"$152,959,238","$-12,371,238","$152,959,238","$63,257,000",-
2,UTA,17-65,15.0,23.9,"$154,114,022","$-13,526,022","$142,648,013","$86,906,400","$11,466,009"
3,SAS,34-48,15.0,25.2,"$162,809,380","$-22,221,380","$155,231,927","$83,196,166","$7,577,453"
4,OKC,68-14,15.0,24.2,"$166,001,694","$-25,413,694","$162,209,242","$82,859,950","$3,792,452"
5,CHA,19-63,15.0,25.2,"$167,423,486","$-26,835,486","$152,436,862","$80,445,913","$14,986,624"
6,MEM,48-34,15.0,24.6,"$167,814,933","$-27,226,933","$155,850,296","$95,988,718","$8,014,853"
7,CHI,39-43,15.0,24.6,"$168,006,873","$-27,418,873","$157,180,244","$59,395,348","$8,739,110"
8,PHI,24-58,15.0,26.6,"$170,340,647","$-29,752,647","$169,548,876","$135,768,738","$791,771"
9,IND,50-32,15.0,25.7,"$171,232,577","$-30,644,577","$166,839,774","$104,281,300","$2,530,538"


In [5]:
# Load the raw player-performance table from <project root>/data/raw.
# The path is built from PROJECT_ROOT (set in the configuration cell) instead
# of a hard-coded absolute path, so the notebook is not tied to one machine.
df = pd.read_csv(PROJECT_ROOT / "data" / "raw" / "Player_Performance_raw.csv")

# Keep only the first row encountered for each TEAM_ID.
df_unique = df.drop_duplicates(subset=['TEAM_ID'], keep='first')
df_unique

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,E_OFF_RATING,OFF_RATING,sp_work_OFF_RATING,E_DEF_RATING,DEF_RATING,sp_work_DEF_RATING,E_NET_RATING,...,EFG_PCT_RANK,TS_PCT_RANK,USG_PCT_RANK,E_USG_PCT_RANK,E_PACE_RANK,PACE_RANK,sp_work_PACE_RANK,PIE_RANK,FGM_PG_RANK,FGA_PG_RANK
0,2544,LeBron James,1610612747,113.3,112.7,112.7,113.0,114.0,114.0,0.3,...,153,132,21,20,345,333,333,9,7,21
1,101108,Chris Paul,1610612759,113.5,114.2,114.2,114.4,115.0,115.0,-0.9,...,232,227,442,444,310,346,346,186,252,237
2,200768,Kyle Lowry,1610612755,104.2,103.8,103.8,108.9,109.6,109.6,-4.8,...,451,409,549,549,436,439,439,435,462,423
3,200782,P.J. Tucker,1610612752,105.1,106.1,106.1,126.1,127.4,127.4,-21.0,...,32,43,566,565,554,565,565,553,476,482
4,201142,Kevin Durant,1610612756,116.0,116.5,116.5,116.7,117.4,117.4,-0.7,...,86,45,29,28,405,422,422,36,6,22
5,201143,Al Horford,1610612738,119.2,119.6,119.6,106.5,108.1,108.1,12.8,...,236,285,444,451,490,524,524,259,228,207
6,201144,Mike Conley,1610612750,114.5,115.5,115.5,108.0,109.0,109.0,6.5,...,277,246,429,429,482,515,515,253,293,262
7,201145,Jeff Green,1610612745,109.9,112.2,112.2,115.2,116.1,116.1,-5.3,...,49,38,409,419,238,347,347,307,374,395
8,201566,Russell Westbrook,1610612743,115.5,115.7,115.7,114.6,116.2,116.2,0.9,...,375,396,95,99,81,130,130,238,119,108
9,201567,Kevin Love,1610612748,108.9,108.7,108.7,112.7,112.3,112.3,-3.9,...,474,481,98,104,122,127,127,145,391,330


In [10]:
# Load the raw salary-cap CSV from <project root>/data/raw, using PROJECT_ROOT
# (set in the configuration cell) rather than a hard-coded absolute path.
# NOTE(review): this read currently fails with
#   ParserError: Expected 10 fields in line 4, saw 11
# i.e. at least one row has more comma-separated fields than the header.
# Possibly unquoted commas inside a value -- inspect the file before relying
# on this cell. Also note that this rebinds `df` from the previous cell.
df = pd.read_csv(PROJECT_ROOT / "data" / "raw" / "raw_salary_caps.csv")

ParserError: Error tokenizing data. C error: Expected 10 fields in line 4, saw 11


In [9]:
# Look up nba_api's static team list. Each entry is a dict with keys such as
# 'id', 'full_name', 'abbreviation', 'nickname', 'city', 'state', 'year_founded'.
from nba_api.stats.static import teams

team_directory = teams.get_teams()
team_directory

[{'id': 1610612737,
  'full_name': 'Atlanta Hawks',
  'abbreviation': 'ATL',
  'nickname': 'Hawks',
  'city': 'Atlanta',
  'state': 'Georgia',
  'year_founded': 1949},
 {'id': 1610612738,
  'full_name': 'Boston Celtics',
  'abbreviation': 'BOS',
  'nickname': 'Celtics',
  'city': 'Boston',
  'state': 'Massachusetts',
  'year_founded': 1946},
 {'id': 1610612739,
  'full_name': 'Cleveland Cavaliers',
  'abbreviation': 'CLE',
  'nickname': 'Cavaliers',
  'city': 'Cleveland',
  'state': 'Ohio',
  'year_founded': 1970},
 {'id': 1610612740,
  'full_name': 'New Orleans Pelicans',
  'abbreviation': 'NOP',
  'nickname': 'Pelicans',
  'city': 'New Orleans',
  'state': 'Louisiana',
  'year_founded': 2002},
 {'id': 1610612741,
  'full_name': 'Chicago Bulls',
  'abbreviation': 'CHI',
  'nickname': 'Bulls',
  'city': 'Chicago',
  'state': 'Illinois',
  'year_founded': 1966},
 {'id': 1610612742,
  'full_name': 'Dallas Mavericks',
  'abbreviation': 'DAL',
  'nickname': 'Mavericks',
  'city': 'Dallas',