# Spotrac Scraping

In [6]:
from io import StringIO
from pathlib import Path

import pandas as pd
import requests

# We're just testing, but let's practice using our config-style variables
SEASON = "2024-25"
TARGET_URL = f"https://www.spotrac.com/nba/contracts"

# This is the path we eventually want to write to
# (We need to go up two parent directories from `notebooks/Tyler/`)
PROJECT_ROOT = Path().resolve().parent.parent
RAW_SALARY_FILE = PROJECT_ROOT / "data" / "raw" / "raw_player_salaries.csv"

print(f"Project Root: {PROJECT_ROOT}")
print(f"Target URL: {TARGET_URL}")
print(f"Target File: {RAW_SALARY_FILE}")

Project Root: C:\Users\tyler\School\Learn Statistics\STA 160\Project
Target URL: https://www.spotrac.com/nba/contracts
Target File: C:\Users\tyler\School\Learn Statistics\STA 160\Project\data\raw\raw_player_salaries.csv


In [7]:
# Set a browser-like User-Agent so the site doesn't block us
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

try:
    response = requests.get(TARGET_URL, headers=headers)
    response.raise_for_status()  # This will raise an error if the request failed
    print("Success! Got the webpage.")
    html_content = response.text
except requests.exceptions.RequestException as e:
    print(f"Error fetching page: {e}")
    html_content = None

Success! Got the webpage.


In [8]:
# pd.read_html returns a LIST of all tables found on the page
# We need to inspect this list to find the one we want

if html_content:
    try:
        all_tables = pd.read_html(html_content)
        print(f"Found {len(all_tables)} tables on the page.")
        
        # This is a guess; we might need to look. 
        # Often the main table is the first one.
        salary_df_raw = all_tables[0] 
        
        print("--- Head of the first table found: ---")
        display(salary_df_raw.head())
        
    except ValueError as e:
        print(f"No tables found or error parsing: {e}")
        salary_df_raw = None
else:
    print("No HTML content to parse.")

Found 1 tables on the page.
--- Head of the first table found: ---


  all_tables = pd.read_html(html_content)


Unnamed: 0,RK,Player,Pos,Team Currently With,Age At Signing,Start,End,Yrs,Value,AAV
0,1,Jayson Tatum,PF,BOS BOS,26,2025,2029,5,"$313,933,410","$62,786,682"
1,2,Jaylen Brown,SF,BOS BOS,26,2024,2028,5,"$285,393,640","$57,078,728"
2,3,Nikola Jokic,C,DEN DEN,27,2023,2027,5,"$276,122,630","$55,224,526"
3,4,Shai Gilgeous-Alexander,PG,OKC OKC,26,2027,2030,4,"$273,302,400","$68,325,600"
4,T5,Cade Cunningham,PG,DET DET,22,2025,2029,5,"$269,085,780","$53,817,156"


In [None]:
if salary_df_raw is not None:
    try:
        
        final_df = salary_df_raw[['Player', 'AAV']].copy()
        
        # Rename them to match our contract
        final_df.rename(columns={
            'Player': 'Player_Name',
            'Base Salary': 'Salary'
        }, inplace=True)

        print("--- Cleaned and Finalized DataFrame: ---")
        display(final_df.head())

        print(f"size: {len(final_df)}")

    except KeyError as e:
        print(f"Error: A column name was wrong. You need to inspect the raw table.")
        print(f"Raw columns are: {salary_df_raw.columns}")
        final_df = None
else:
    print("Can't inspect, raw DataFrame is empty.")



--- Cleaned and Finalized DataFrame: ---


Unnamed: 0,Player_Name,AAV
0,Jayson Tatum,"$62,786,682"
1,Jaylen Brown,"$57,078,728"
2,Nikola Jokic,"$55,224,526"
3,Shai Gilgeous-Alexander,"$68,325,600"
4,Cade Cunningham,"$53,817,156"


size: 100


# Spotrac is not feasible. Let's try other sites.