In [38]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import sys
import io



 

In [19]:
start = 2023 
end = 2025 
seasons_list = [str(year) for year in range(start, end)]
seasons_list

['2023', '2024']

### Collect Roster Avg Stats Per Season

In [420]:
team_names = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHO': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

In [389]:
# team_names = {
#     'DET': 'Detroit Pistons',
#     'BOS': 'Boston Celtics',
#     'PHO': 'Phoenix Suns',

# }

In [139]:
# Convert height to inches (Ht format 'ft-in' to inches)
def height_to_inches(height):
    if pd.isna(height):
        return None
    feet, inches = map(int, height.split('-'))
    return feet * 12 + inches

In [183]:
def clean_roster(roster, team_names):

    roster['ht'] = roster['Ht'].apply(height_to_inches)
    
    roster['birth_year'] = pd.to_datetime(roster['Birth Date']).dt.year

    roster['age'] = roster['Season']- roster['birth_year'] -1

    roster['exp'] = roster['Exp'].replace('R', 0)

    roster['exp'] = roster['exp'].round().astype(int)

    roster['team'] = team_names

    roster.drop(columns=['College','No.','Player','Pos','Birth Date','Birth','birth_year'], inplace=True)

    return roster

In [225]:
# https://www.basketball-reference.com/teams/DAL/2025.html

def scrape_season(season,team_names):
    all_data = pd.DataFrame()

    for team in team_names:

        url = f"https://www.basketball-reference.com/teams/{team}/{season}.html"

        response = requests.get(url)

        print(response)
        print(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'roster'})

            if table:
                df = pd.read_html(io.StringIO(str(table)))[0]

                df['Season'] = season
                
                df = clean_roster(df,team)

                all_data = pd.concat([all_data, df], ignore_index=True)

            else:
            
                print(f"No table found for {season}")

        else:
            print(f"Failed to retrieve data for {season}")

        time.sleep(4)

    return all_data

In [185]:
roster = scrape_season(2025,team_names)

<Response [200]>
https://www.basketball-reference.com/teams/ATL/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/BOS/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/BRK/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/CHO/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/CHI/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/CLE/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/DAL/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/DEN/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/DET/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/GSW/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/HOU/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/IND/2025.html
<Response [200]>
https://www.basketball-reference.com/teams/LAC/2025.html
<Response [200]>
https://www.basketbal

In [186]:
roster.sample(5)

Unnamed: 0,Ht,Wt,Exp,Season,ht,age,exp,team
294,6-5,215.0,4,2025,77,26,4,MEM
212,6-11,256.0,3,2025,83,29,3,HOU
47,6-5,185.0,1,2025,77,20,1,BRK
94,6-7,221.0,7,2025,79,34,7,CHI
116,6-7,230.0,8,2025,79,31,8,CLE


In [187]:
roster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ht      589 non-null    object 
 1   Wt      493 non-null    float64
 2   Exp     589 non-null    object 
 3   Season  589 non-null    int64  
 4   ht      589 non-null    int64  
 5   age     589 non-null    int64  
 6   exp     589 non-null    int64  
 7   team    589 non-null    object 
dtypes: float64(1), int64(4), object(3)
memory usage: 36.9+ KB


In [188]:
def team_avg_roster(roster):

    avg_roster = roster.groupby(['team', 'Season']).agg(
        avg_age=('age', 'mean'),
        avg_exp=('exp', 'mean'),
        avg_weight=('Wt', 'mean'),
        avg_height=('ht', 'mean')
    ).reset_index()

    return avg_roster

In [189]:
avg_roster = team_avg_roster(roster)

In [196]:
avg_roster['team_full_name'] = avg_roster['team'].map(team_names)

avg_roster = avg_roster[['Season','team', 'team_full_name', 'avg_age', 'avg_exp', 'avg_weight', 'avg_height']]

In [198]:
avg_roster.sample(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height
9,2025,GSW,Golden State Warriors,26.714286,4.761905,213.0,78.238095
20,2025,OKC,Oklahoma City Thunder,24.0,2.8125,204.833333,78.5
28,2025,UTA,Utah Jazz,24.722222,3.777778,220.5,78.833333
29,2025,WAS,Washington Wizards,24.941176,3.941176,219.5,79.470588
0,2025,ATL,Atlanta Hawks,25.333333,3.52381,215.947368,79.0


### Append pre-season odds

In [126]:
# https://www.basketball-reference.com/leagues/NBA_2025_preseason_odds.html

def scrape_preseason_odds(season):
    all_data = pd.DataFrame()


    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_preseason_odds.html"

    response = requests.get(url)

    print(response)
    print(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'NBA_preseason_odds'})

        if table:
            df = pd.read_html(io.StringIO(str(table)))[0]

            all_data = pd.concat([all_data, df], ignore_index=True)

        else:
        
            print(f"No table found for {season}")

    else:
        print(f"Failed to retrieve data for {season}")

    return all_data

In [127]:
preseason_odds = scrape_preseason_odds(2025)

<Response [200]>
https://www.basketball-reference.com/leagues/NBA_2025_preseason_odds.html


In [205]:
avg_n_odds = pd.merge(avg_roster, preseason_odds, left_on='team_full_name', right_on='Team', how='left')

avg_n_odds.drop(columns='Team', inplace=True)


In [206]:
avg_n_odds.sample(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds
26,2025,SAS,San Antonio Spurs,24.3,3.8,207.0,78.55,10000
8,2025,DET,Detroit Pistons,24.444444,3.055556,209.4,78.555556,100000
24,2025,POR,Portland Trail Blazers,24.4,3.05,208.736842,79.3,100000
12,2025,LAC,Los Angeles Clippers,27.0,5.666667,216.470588,77.952381,3000
13,2025,LAL,Los Angeles Lakers,25.7,4.4,216.941176,79.4,3000


### Append salary stats

In [None]:
def clean_salaries(salaries):

    salaries.rename(columns={"Unnamed: 1":"player_name"}, inplace=True)
    
    salaries['Salary'] = salaries['Salary'].replace({'\$': '', ',': ''}, regex=True).astype(int)

    salaries = salaries.sort_values(by='Salary', ascending=False)

    salaries = salaries.drop(columns=['Rk'])

    return salaries

In [421]:
# https://www.basketball-reference.com/teams/CHO/2025.html

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def scrape_season(season,team_names):
    all_data = pd.DataFrame()

    driver = webdriver.Chrome() 

    for team in team_names:

        url = f"https://www.basketball-reference.com/teams/{team}/{season}.html"

        driver.get(url)

        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        div = soup.find('div', {'id': 'div_salaries2'})
        
        if div:
            table = div.find('table', {'id': 'salaries2'})
            if table:
                df = pd.read_html(str(table))[0]
                df['Season'] = season
                df['team']= team
                df = clean_salaries(df)
                all_data = pd.concat([all_data, df], ignore_index=True)
            else:
                print(f"No table found inside the div for {team} in {season}")
        else:
            print(f"No div with id 'div_salaries2' found for {team} in {season}")

    driver.quit()  # Close the browser session

    return all_data

In [422]:
salaries = scrape_season(2025, team_names)

  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(

In [423]:
salaries.sample(5)

Unnamed: 0,player_name,Salary,Season,team
59,Nick Smith Jr.,2587200,2025,CHO
326,Jared McCain,4020360,2025,PHI
339,E.J. Liddell,2120693,2025,PHO
246,Delon Wright,2087519,2025,MIL
136,Draymond Green,24107143,2025,GSW


#### Finding the number of top players per team per season

In [424]:
total_salary = salaries.groupby(['team', 'Season'])['Salary'].sum().reset_index(name='total_salary')

salaries = salaries.merge(total_salary, on=['team', 'Season'], how='left')

salaries['salary_percentage'] = (salaries['Salary'] / salaries['total_salary']) * 100

In [435]:
salaries[salaries['team']=='LAL'].sort_values('salary_percentage', ascending=False).head(20)

Unnamed: 0,player_name,Salary,Season,team,total_salary,salary_percentage
194,LeBron James,48728845,2025,LAL,187028846,26.054187
195,Anthony Davis,43219440,2025,LAL,187028846,23.108435
196,D'Angelo Russell,18692307,2025,LAL,187028846,9.994344
197,Rui Hachimura,17000000,2025,LAL,187028846,9.089507
198,Austin Reaves,12976362,2025,LAL,187028846,6.938161
199,Gabe Vincent,11000000,2025,LAL,187028846,5.881446
200,Jarred Vanderbilt,10714286,2025,LAL,187028846,5.728681
201,Max Christie,7142857,2025,LAL,187028846,3.819121
202,Jalen Hood-Schifino,3879840,2025,LAL,187028846,2.074461
203,Dalton Knecht,3819120,2025,LAL,187028846,2.041995


In [426]:
def count_top_players(df,threshold=20):
    # based on the teams salaries we are trying to identify the numbers of stars/top players on the team (a good example is the Phoenix Suns in 2025)

    above = df[df['salary_percentage'] > threshold].shape[0]
    below = df[df['salary_percentage'] <= threshold].shape[0]
    
    return pd.Series({'top_players': above, 'not_top_players': below})


In [427]:
salaries[salaries['team']=='PHO'].groupby(['team', 'Season']).apply(count_top_players).reset_index()

  salaries[salaries['team']=='PHO'].groupby(['team', 'Season']).apply(count_top_players).reset_index()


Unnamed: 0,team,Season,top_players,not_top_players
0,PHO,2025,3,10


In [428]:
top_players = salaries.groupby(['team', 'Season']).apply(count_top_players).reset_index()

  top_players = salaries.groupby(['team', 'Season']).apply(count_top_players).reset_index()


In [429]:
team_salary_stats = salaries.groupby(['team', 'Season']).agg(
    highest_salary=('Salary', 'max'),
    median_salary=('Salary', 'median'),
    total_salary=('Salary', 'sum'),
    
).reset_index()

In [430]:
team_salary_stats = pd.merge(team_salary_stats, top_players, on=['team', 'Season'], how="left")

In [431]:
team_salary_stats.sample(3)

Unnamed: 0,team,Season,highest_salary,median_salary,total_salary,top_players,not_top_players
10,HOU,2025,42846615,9249960.0,165026987,1,14
29,WAS,2025,29651786,6692959.0,158877689,0,16
18,NOP,2025,36725670,5159854.0,163702985,3,8


In [433]:
avg_odds_salary_players = pd.merge(avg_n_odds, team_salary_stats,on=['team', 'Season'], how='left')


In [437]:
avg_odds_salary_players.sample(10)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players
2,2025,BRK,Brooklyn Nets,24.947368,3.947368,214.111111,79.052632,100000,40338144,4041249.0,165717974,1,14
5,2025,CLE,Cleveland Cavaliers,25.75,4.2,208.263158,77.95,5000,36725670,8500000.0,161102772,2,11
15,2025,MIA,Miami Heat,26.7,5.05,218.75,78.0,4000,48798677,4655040.0,184058374,1,12
18,2025,NOP,New Orleans Pelicans,25.75,3.15,210.941176,78.4,5000,36725670,5159854.0,163702985,3,8
29,2025,WAS,Washington Wizards,24.941176,3.941176,219.5,79.470588,100000,29651786,6692959.0,158877689,0,16
17,2025,MIN,Minnesota Timberwolves,26.210526,4.578947,220.533333,79.263158,850,49205800,5287710.0,205560682,3,11
1,2025,BOS,Boston Celtics,26.318182,4.727273,220.315789,79.318182,300,49700000,3256341.5,201620656,1,15
25,2025,SAC,Sacramento Kings,26.0,4.611111,212.071429,78.388889,6600,40500000,8000000.0,167640684,2,11
14,2025,MEM,Memphis Grizzlies,25.05,3.15,213.266667,78.9,4000,36725670,4858705.5,177014145,1,15
20,2025,OKC,Oklahoma City Thunder,24.0,2.8125,204.833333,78.5,900,35859950,6669000.0,159241956,1,14


In [436]:
avg_odds_salary_players.to_csv('data/2025_team_data.csv')

### Other

In [None]:

def scrape_all_seasons():
    all_data = pd.DataFrame()

    for season in range(start, end):  
        data = scrape_season(str(season))

        if data is not None:
            data['Season'] = season
            all_data = pd.concat([all_data, data], ignore_index=True)

            print(f"Scraping data for season {season}")

    return all_data

result = scrape_all_seasons()

result['make_playoffs'] = result['Team'].apply(lambda x: 1 if '*' in x else 0)
result['Team'] = result['Team'].str.replace('*', '')

result.head(10)


In [None]:
def scrape_season(season):
    all_data = pd.DataFrame()

    url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'totals-team'})

        if table:
            df = pd.read_html(io.StringIO(str(table)))[0]

            df['Season'] = season

            all_data = pd.concat([all_data, df], ignore_index=True)
        else:
            print(f"No table found for {season}")
    else:
        print(f"Failed to retrieve data for {season}")

    time.sleep(4)

    return all_data

def scrape_all_seasons():
    all_data = pd.DataFrame()

    for season in range(start, end):  
        data = scrape_season(str(season))

        if data is not None:
            data['Season'] = season
            all_data = pd.concat([all_data, data], ignore_index=True)

            print(f"Scraping data for season {season}")

    return all_data

result = scrape_all_seasons()

result['make_playoffs'] = result['Team'].apply(lambda x: 1 if '*' in x else 0)
result['Team'] = result['Team'].str.replace('*', '')

result.head(10)


In [21]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      60 non-null     float64
 1   Team    62 non-null     object 
 2   G       62 non-null     int64  
 3   MP      62 non-null     int64  
 4   FG      62 non-null     int64  
 5   FGA     62 non-null     int64  
 6   FG%     62 non-null     float64
 7   3P      62 non-null     int64  
 8   3PA     62 non-null     int64  
 9   3P%     62 non-null     float64
 10  2P      62 non-null     int64  
 11  2PA     62 non-null     int64  
 12  2P%     62 non-null     float64
 13  FT      62 non-null     int64  
 14  FTA     62 non-null     int64  
 15  FT%     62 non-null     float64
 16  ORB     62 non-null     int64  
 17  DRB     62 non-null     int64  
 18  TRB     62 non-null     int64  
 19  AST     62 non-null     int64  
 20  STL     62 non-null     int64  
 21  BLK     62 non-null     int64  
 22  TOV 

In [None]:
def scrape_season_table(season, table_id):
    all_data = pd.DataFrame()

    url = f'https://www.basketball-reference.com/awards/awards_{season}.html'

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)

    try:
        table = driver.find_element(By.ID, table_id)
        table_html = table.get_attribute('outerHTML')
        driver.quit()

        df = pd.read_html(StringIO(table_html), header=[1])[0]

        df['Season'] = season
        df['award_type'] = table_id

        all_data = pd.concat([all_data, df], ignore_index=True)
    except Exception as e:
        print(f"No table found for {table_id} in {season}")
        print(e)
        driver.quit()
        return None

    time.sleep(5)

    return all_data

def scrape_all_seasons_tables(seasons, table_ids):
    all_data = pd.DataFrame()

    total_tasks = len(seasons) * len(table_ids)
    task_count = 0

    for season in reversed(seasons):
        for table_id in table_ids:
            data = scrape_season_table(season, table_id)

            if data is not None:
                all_data = pd.concat([all_data, data], ignore_index=True)

            task_count += 1
            completion_percentage = (task_count / total_tasks) * 100
            sys.stdout.write(f"\rScraping: [{'#' * int(completion_percentage // 2)}{' ' * (50 - int(completion_percentage // 2))}] {completion_percentage:.2f}%")
            sys.stdout.flush()

    sys.stdout.flush()

    print("Columns before removing 'Unnamed':", all_data.columns)

    all_data.columns = all_data.columns.map(str)
    all_data = all_data.loc[:, ~all_data.columns.str.contains('^Unnamed')]

    print("Columns after removing 'Unnamed':", all_data.columns)

    if isinstance(all_data.columns, pd.MultiIndex):
        all_data.columns = [' '.join(col).strip() for col in all_data.columns.values]

    print("Columns after flattening MultiIndex:", all_data.columns)
    
    return all_data

seasons_list = [str(year) for year in range(start, end)]
table_ids = ['mvp', 'roy', 'dpoy', 'smoy', 'mip', 'clutch_poy', 'leading_all_nba', 'leading_all_defense', 'leading_all_rookie', 'coy']

result = scrape_all_seasons_tables(seasons_list, table_ids)

all_award_voting = result

#save
all_award_voting.to_csv('all_award_voting.csv', index=False, encoding="utf-8-sig")