In [38]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import sys

import io


In [19]:
# Season window shared by the league-wide scrapers below.
# `end` is exclusive because the seasons are produced with range(start, end).
start, end = 2023, 2025
seasons_list = list(map(str, range(start, end)))
seasons_list

['2023', '2024']

### Collect Roster Avg Stats Per Season

In [103]:
# Maps each team abbreviation to the franchise's full name.
# The keys (not the values) are what scrape_season(season, team_names) iterates
# over and substitutes into
# https://www.basketball-reference.com/teams/{team}/{season}.html,
# so they must match the abbreviations used in that site's URLs.
team_names = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHO': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

In [84]:
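# Optional debugging override (disabled): uncomment to replace the full
# team_names mapping with a two-team subset for a quick test scrape.
# team_names = {
#     'BOS': 'Boston Celtics',
#     'BRK': 'Brooklyn Nets',

# }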

In [85]:
# Convert a roster height string in 'ft-in' form (e.g. '6-7') to total inches
def height_to_inches(height):
    """Return the height in inches, or None when the value is missing.

    Args:
        height: A 'feet-inches' string such as '6-7', or a missing value
            (None / NaN).

    Returns:
        int number of inches, or None if `height` is missing.

    Raises:
        ValueError: If the string does not split into exactly two integers.
    """
    if not pd.isna(height):
        feet, inches = (int(part) for part in height.split('-'))
        return 12 * feet + inches
    return None

In [86]:
def clean_roster(roster, team):
    """Turn a raw team roster table into numeric, model-ready columns.

    The input frame is left untouched; all work happens on a copy, so the
    cell that calls this function can be re-run without errors caused by
    columns that were already converted or dropped.

    Args:
        roster: Roster DataFrame containing at least the columns 'Ht',
            'Birth Date', 'Exp', 'Season', 'College', 'No.', 'Player',
            'Pos' and 'Birth'. 'Season' may be an int or a numeric string.
        team: Team identifier written into the new 'Team' column.

    Returns:
        A new DataFrame in which 'Ht' is in inches, 'Exp' is an integer
        (rookies, marked 'R', become 0), 'Season' is numeric, 'Age' and
        'Team' are added, and the descriptive columns are removed.

    Raises:
        KeyError: If any expected column is missing.
    """
    cleaned = roster.copy()

    cleaned['Ht'] = cleaned['Ht'].apply(height_to_inches)

    birth_year = pd.to_datetime(cleaned['Birth Date']).dt.year

    # Coerce Season so that string seasons ('2025') work in the subtraction.
    cleaned['Season'] = pd.to_numeric(cleaned['Season'])

    # Approximate age: season year minus birth year, minus one.
    cleaned['Age'] = cleaned['Season'] - birth_year - 1

    # 'R' marks a rookie. Replace it with the string '0' and convert explicitly:
    # this avoids relying on the implicit object-dtype downcasting of
    # Series.replace and on calling .round() on a column of strings.
    cleaned['Exp'] = pd.to_numeric(cleaned['Exp'].replace('R', '0')).round().astype(int)

    cleaned['Team'] = team

    return cleaned.drop(
        columns=['College', 'No.', 'Player', 'Pos', 'Birth Date', 'Birth']
    )

In [87]:
# https://www.basketball-reference.com/teams/DAL/2025.html

def scrape_season(season, team_names):
    """Scrape and clean the roster table of every team for one season.

    For each key in `team_names` the page
    https://www.basketball-reference.com/teams/{team}/{season}.html is
    downloaded, its table with id 'roster' is parsed, tagged with the season
    and passed through clean_roster. Teams whose page cannot be fetched or has
    no roster table are reported and skipped, so one failure does not discard
    the teams already collected.

    NOTE(review): a later cell in this notebook defines another function named
    `scrape_season` that takes only `season`; once that cell has run it
    shadows this one.

    Args:
        season: Season year used in the URL and stored in the 'Season' column.
        team_names: Iterable of team abbreviations (iterating a dict yields
            its keys).

    Returns:
        One DataFrame with the cleaned rosters of all successfully scraped
        teams, or an empty DataFrame if none could be scraped.
    """
    team_frames = []

    for team in team_names:

        url = f"https://www.basketball-reference.com/teams/{team}/{season}.html"

        try:
            # A timeout keeps a stalled connection from hanging the whole cell.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {team} in {season}: {exc}")
            time.sleep(4)
            continue

        print(response)
        print(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'roster'})

            if table is not None:
                df = pd.read_html(io.StringIO(str(table)))[0]

                df['Season'] = season

                team_frames.append(clean_roster(df, team))

            else:
                print(f"No table found for {team} in {season}")

        else:
            print(f"Failed to retrieve data for {team} in {season} (HTTP {response.status_code})")

        # Pause between requests to stay polite to the server.
        time.sleep(4)

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not team_frames:
        return pd.DataFrame()

    return pd.concat(team_frames, ignore_index=True)

In [None]:
# Scrape the 2025 roster page of every team in team_names. scrape_season
# sleeps between requests, so this cell takes a while to finish.
roster = scrape_season(2025,team_names)

In [108]:
roster.sample(5)

Unnamed: 0,Ht,Wt,Exp,Season,Age,Team
242,80,214.0,6,2025,25,LAL
75,73,200.0,6,2025,29,CHI
156,80,250.0,4,2025,23,DET
220,82,206.0,3,2025,22,IND
440,73,195.0,6,2025,29,POR


In [109]:
roster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549 entries, 0 to 548
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ht      549 non-null    int64  
 1   Wt      459 non-null    float64
 2   Exp     549 non-null    int64  
 3   Season  549 non-null    int64  
 4   Age     549 non-null    int64  
 5   Team    549 non-null    object 
dtypes: float64(1), int64(4), object(1)
memory usage: 25.9+ KB


In [110]:
def team_avg_roster(roster):
    """Average the player-level roster columns per team and season.

    Args:
        roster: DataFrame with the columns 'Team', 'Season', 'Age', 'Exp',
            'Wt' and 'Ht', one row per player.

    Returns:
        DataFrame with one row per (Team, Season) pair and the columns
        'Team', 'Season', 'avg_age', 'avg_exp', 'avg_weight', 'avg_height'.
    """
    # Output column name -> (source column, aggregation).
    column_means = {
        'avg_age': ('Age', 'mean'),
        'avg_exp': ('Exp', 'mean'),
        'avg_weight': ('Wt', 'mean'),
        'avg_height': ('Ht', 'mean'),
    }

    per_team_season = roster.groupby(['Team', 'Season'])
    return per_team_season.agg(**column_means).reset_index()

In [111]:
# Collapse the per-player roster rows into one row of averages per (Team, Season).
avg_roster = team_avg_roster(roster)

In [112]:
avg_roster

Unnamed: 0,Team,Season,avg_age,avg_exp,avg_weight,avg_height
0,ATL,2025,25.333333,3.52381,215.947368,79.0
1,BOS,2025,26.318182,4.727273,220.315789,79.318182
2,BRK,2025,24.947368,3.947368,214.111111,79.052632
3,CHI,2025,24.952381,3.619048,213.5,78.47619
4,CLE,2025,25.75,4.2,208.263158,77.95
5,DAL,2025,26.571429,5.0,216.263158,79.142857
6,DEN,2025,25.714286,4.047619,226.071429,79.285714
7,DET,2025,24.444444,3.055556,209.4,78.555556
8,GSW,2025,26.714286,4.761905,213.0,78.238095
9,HOU,2025,25.5,3.888889,220.666667,78.666667


In [None]:

def scrape_all_seasons():
    """Scrape every season in range(start, end) and stack the results.

    Calls scrape_season(str(season)) with a single argument for each season,
    overwrites the 'Season' column with the integer season and concatenates
    the non-empty results. Reads the module-level `start` and `end`.

    NOTE(review): this cell sits above the cell that defines the one-argument
    `scrape_season(season)`. On a fresh kernel run top to bottom, the name
    still refers to the two-argument roster scraper and this call fails with
    a TypeError -- confirm the intended cell order.

    Returns:
        DataFrame with the rows of all seasons that returned data, or an
        empty DataFrame if no season returned any.
    """
    season_frames = []

    for season in range(start, end):
        print(f"Scraping data for season {season}")
        data = scrape_season(str(season))

        # scrape_season signals failure with an empty frame, not None, so an
        # `is not None` check alone never skips a failed season.
        if data is None or data.empty:
            continue

        data['Season'] = season
        season_frames.append(data)

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not season_frames:
        return pd.DataFrame()

    return pd.concat(season_frames, ignore_index=True)

result = scrape_all_seasons()

# A '*' in the team name is treated as the playoff marker: record it as a 0/1
# flag, then strip it from the name. regex=False makes '*' a literal character
# regardless of the pandas version's default for `regex`.
# NOTE(review): the scraped table may also contain a non-team summary row
# (e.g. a league average) -- confirm whether it should be dropped here.
result['make_playoffs'] = result['Team'].str.contains('*', regex=False).astype(int)
result['Team'] = result['Team'].str.replace('*', '', regex=False)

result.head(10)


In [None]:
def scrape_season(season):
    """Scrape the league-wide team totals table for one season.

    Downloads https://www.basketball-reference.com/leagues/NBA_{season}.html,
    parses the table with id 'totals-team' and adds a 'Season' column. Any
    failure (network error, non-200 response, missing table) is printed and
    reported to the caller as an empty DataFrame.

    NOTE(review): an earlier cell defines a different function with the same
    name, `scrape_season(season, team_names)`; running this cell shadows it.

    Args:
        season: Season year used in the URL and stored in the 'Season' column.

    Returns:
        DataFrame with the team totals for the season, or an empty DataFrame
        on failure.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'
    season_totals = pd.DataFrame()

    try:
        # A timeout keeps a stalled connection from hanging the whole cell.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for {season}: {exc}")
        response = None

    if response is not None:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'totals-team'})

            if table is not None:
                season_totals = pd.read_html(io.StringIO(str(table)))[0]
                season_totals['Season'] = season
            else:
                print(f"No table found for {season}")
        else:
            print(f"Failed to retrieve data for {season}")

    # Pause after every attempt to stay polite to the server.
    time.sleep(4)

    return season_totals

def scrape_all_seasons():
    """Scrape every season in range(start, end) and stack the results.

    Calls scrape_season(str(season)) with a single argument for each season,
    overwrites the 'Season' column with the integer season and concatenates
    the non-empty results. Reads the module-level `start` and `end`.

    NOTE(review): a function with this name is also defined in an earlier
    cell of the notebook; this definition replaces it when the cell runs.

    Returns:
        DataFrame with the rows of all seasons that returned data, or an
        empty DataFrame if no season returned any.
    """
    season_frames = []

    for season in range(start, end):
        print(f"Scraping data for season {season}")
        data = scrape_season(str(season))

        # scrape_season signals failure with an empty frame, not None, so an
        # `is not None` check alone never skips a failed season.
        if data is None or data.empty:
            continue

        data['Season'] = season
        season_frames.append(data)

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not season_frames:
        return pd.DataFrame()

    return pd.concat(season_frames, ignore_index=True)

result = scrape_all_seasons()

# A '*' in the team name is treated as the playoff marker: record it as a 0/1
# flag, then strip it from the name. regex=False makes '*' a literal character
# regardless of the pandas version's default for `regex`.
# NOTE(review): result.info() reports fewer non-null 'Rk' values than rows, so
# the table appears to include non-team summary rows -- confirm whether they
# should be dropped here.
result['make_playoffs'] = result['Team'].str.contains('*', regex=False).astype(int)
result['Team'] = result['Team'].str.replace('*', '', regex=False)

result.head(10)


In [21]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      60 non-null     float64
 1   Team    62 non-null     object 
 2   G       62 non-null     int64  
 3   MP      62 non-null     int64  
 4   FG      62 non-null     int64  
 5   FGA     62 non-null     int64  
 6   FG%     62 non-null     float64
 7   3P      62 non-null     int64  
 8   3PA     62 non-null     int64  
 9   3P%     62 non-null     float64
 10  2P      62 non-null     int64  
 11  2PA     62 non-null     int64  
 12  2P%     62 non-null     float64
 13  FT      62 non-null     int64  
 14  FTA     62 non-null     int64  
 15  FT%     62 non-null     float64
 16  ORB     62 non-null     int64  
 17  DRB     62 non-null     int64  
 18  TRB     62 non-null     int64  
 19  AST     62 non-null     int64  
 20  STL     62 non-null     int64  
 21  BLK     62 non-null     int64  
 22  TOV 

In [None]:
def scrape_season_table(season, table_id):
    """Fetch one award table from a season's awards page using headless Chrome.

    Opens https://www.basketball-reference.com/awards/awards_{season}.html,
    locates the element whose id is `table_id`, parses it with pandas (using
    the second header row) and tags the rows with the season and award type.

    The browser is always closed, and the function always pauses for five
    seconds afterwards, whether or not the table was found.

    Args:
        season: Season year used in the URL and stored in the 'Season' column.
        table_id: HTML id of the award table; also stored in 'award_type'.

    Returns:
        DataFrame with the table's rows, or None if the page could not be
        loaded or the table could not be found or parsed.
    """
    # NOTE(review): `webdriver`, `Service`, `ChromeDriverManager` and `By` are
    # not imported in the notebook's visible import cell -- confirm they are
    # imported before this cell runs.
    url = f'https://www.basketball-reference.com/awards/awards_{season}.html'

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        table_html = driver.find_element(By.ID, table_id).get_attribute('outerHTML')

        # Use io.StringIO: only the `io` module is imported in this notebook,
        # so the bare name `StringIO` would raise NameError.
        df = pd.read_html(io.StringIO(table_html), header=[1])[0]

        df['Season'] = season
        df['award_type'] = table_id
    except Exception as e:
        # Best effort: report the problem and let the caller skip this table.
        print(f"No table found for {table_id} in {season}")
        print(e)
        return None
    finally:
        # Runs on success and failure alike: release the browser exactly once
        # and keep the delay between page loads even when a table is missing.
        driver.quit()
        time.sleep(5)

    return df

def scrape_all_seasons_tables(seasons, table_ids):
    """Scrape every (season, table id) combination and stack the results.

    Seasons are processed in reverse order. For each combination
    scrape_season_table is called; None results are skipped. A text progress
    bar is written to stdout while scraping. Afterwards the column labels are
    normalised to flat strings and columns whose label starts with 'Unnamed'
    are removed.

    Args:
        seasons: Sequence of seasons (must support len() and reversed()).
        table_ids: Sequence of HTML table ids to fetch for each season.

    Returns:
        DataFrame with all scraped rows, or an empty DataFrame if nothing
        was scraped.
    """
    frames = []

    total_tasks = len(seasons) * len(table_ids)
    task_count = 0

    for season in reversed(seasons):
        for table_id in table_ids:
            data = scrape_season_table(season, table_id)

            if data is not None:
                frames.append(data)

            task_count += 1
            completion_percentage = (task_count / total_tasks) * 100
            filled = int(completion_percentage // 2)
            sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
            sys.stdout.flush()

    if task_count:
        # Finish the progress-bar line so later output starts on a new line.
        sys.stdout.write("\n")
        sys.stdout.flush()

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a frame inside the loop.
    all_data = pd.concat(frames, ignore_index=True)

    print("Columns before removing 'Unnamed':", all_data.columns)

    # Flatten a MultiIndex first: mapping the labels to str beforehand would
    # turn the tuples into strings and make this branch unreachable.
    # Placeholder 'Unnamed...' levels are left out of the joined label so a
    # real column is not discarded by the filter below.
    if isinstance(all_data.columns, pd.MultiIndex):
        all_data.columns = [
            ' '.join(
                str(level) for level in col if not str(level).startswith('Unnamed')
            ).strip()
            for col in all_data.columns.values
        ]

    print("Columns after flattening MultiIndex:", all_data.columns)

    all_data.columns = all_data.columns.map(str)
    all_data = all_data.loc[:, ~all_data.columns.str.startswith('Unnamed')]

    print("Columns after removing 'Unnamed':", all_data.columns)

    return all_data

# Seasons (as strings) and the ids of the award tables to pull for each one.
seasons_list = list(map(str, range(start, end)))
table_ids = [
    'mvp',
    'roy',
    'dpoy',
    'smoy',
    'mip',
    'clutch_poy',
    'leading_all_nba',
    'leading_all_defense',
    'leading_all_rookie',
    'coy',
]

# `result` is rebound here, replacing whatever an earlier cell stored in it.
result = scrape_all_seasons_tables(seasons_list, table_ids)
all_award_voting = result

# Persist the combined award voting table. 'utf-8-sig' writes a byte-order mark.
all_award_voting.to_csv('all_award_voting.csv', encoding="utf-8-sig", index=False)