In [38]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import sys
import io

In [1061]:
start = 2010
end = 2014

end += 1

seasons_list = [year for year in range(start, end)]
seasons_list

[2010, 2011, 2012, 2013, 2014]

In [507]:
team_names = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHO': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

In [950]:
team_conferences = {
    'ATL': 'EAST',
    'BOS': 'EAST',
    'BRK': 'EAST',
    'CHO': 'EAST',
    'CHI': 'EAST',
    'CLE': 'EAST',
    'DAL': 'WEST',
    'DEN': 'WEST',
    'DET': 'EAST',
    'GSW': 'WEST',
    'HOU': 'WEST',
    'IND': 'EAST',
    'LAC': 'WEST',
    'LAL': 'WEST',
    'MEM': 'WEST',
    'MIA': 'EAST',
    'MIL': 'EAST',
    'MIN': 'WEST',
    'NOP': 'WEST',
    'NYK': 'EAST',
    'OKC': 'WEST',
    'ORL': 'EAST',
    'PHI': 'EAST',
    'PHO': 'WEST',
    'POR': 'WEST',
    'SAC': 'WEST',
    'SAS': 'WEST',
    'TOR': 'EAST',
    'UTA': 'WEST',
    'WAS': 'EAST'
}

In [489]:
# team_names = {
#     # 'DET': 'Detroit Pistons',
#     # 'BOS': 'Boston Celtics',
#     # 'PHO': 'Phoenix Suns',
#     'MEM': 'Memphis Grizzlies',

# }

### Collect Roster Average Stats Per Season

In [139]:
# Convert height to inches (Ht format 'ft-in' to inches)
def height_to_inches(height):
    if pd.isna(height):
        return None
    feet, inches = map(int, height.split('-'))
    return feet * 12 + inches

In [183]:
def clean_roster(roster, team_names):

    roster['ht'] = roster['Ht'].apply(height_to_inches)
    
    roster['birth_year'] = pd.to_datetime(roster['Birth Date']).dt.year

    roster['age'] = roster['Season']- roster['birth_year'] -1

    roster['exp'] = roster['Exp'].replace('R', 0)

    roster['exp'] = roster['exp'].round().astype(int)

    roster['team'] = team_names

    roster.drop(columns=['College','No.','Player','Pos','Birth Date','Birth','birth_year'], inplace=True)

    return roster

In [899]:
# https://www.basketball-reference.com/teams/DAL/2025.html

def scrape_roster(season,team_names):
    all_data = pd.DataFrame()

    for team in team_names:

        url = f"https://www.basketball-reference.com/teams/{team}/{season}.html"

        response = requests.get(url)

        # print(response)
        # print(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'roster'})

            if table:
                df = pd.read_html(io.StringIO(str(table)))[0]

                df['Season'] = season
                
                df = clean_roster(df,team)

                all_data = pd.concat([all_data, df], ignore_index=True)

            else:
            
                print(f"No table found for {season}")

        else:
            print(f"Failed to retrieve data for {season}")

        time.sleep(4)

    return all_data

In [None]:
roster = scrape_roster(2025,team_names)

In [186]:
roster.sample(5)

Unnamed: 0,Ht,Wt,Exp,Season,ht,age,exp,team
294,6-5,215.0,4,2025,77,26,4,MEM
212,6-11,256.0,3,2025,83,29,3,HOU
47,6-5,185.0,1,2025,77,20,1,BRK
94,6-7,221.0,7,2025,79,34,7,CHI
116,6-7,230.0,8,2025,79,31,8,CLE


In [474]:
def team_avg_roster(roster):

    avg_roster = roster.groupby(['team', 'Season']).agg(
        avg_age=('age', 'mean'),
        avg_exp=('exp', 'mean'),
        avg_weight=('Wt', 'mean'),
        avg_height=('ht', 'mean')
    ).reset_index()


    avg_roster['team_full_name'] = avg_roster['team'].map(team_names)

    avg_roster = avg_roster[['Season','team', 'team_full_name', 'avg_age', 'avg_exp', 'avg_weight', 'avg_height']]
    
    return avg_roster

In [196]:
avg_roster = team_avg_roster(roster)

In [198]:
avg_roster.sample(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height
9,2025,GSW,Golden State Warriors,26.714286,4.761905,213.0,78.238095
20,2025,OKC,Oklahoma City Thunder,24.0,2.8125,204.833333,78.5
28,2025,UTA,Utah Jazz,24.722222,3.777778,220.5,78.833333
29,2025,WAS,Washington Wizards,24.941176,3.941176,219.5,79.470588
0,2025,ATL,Atlanta Hawks,25.333333,3.52381,215.947368,79.0


### Append pre-season odds

In [876]:
# https://www.basketball-reference.com/leagues/NBA_2025_preseason_odds.html

def scrape_preseason_odds(season):
    all_data = pd.DataFrame()


    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_preseason_odds.html"

    response = requests.get(url)

    # print(response)
    # print(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'NBA_preseason_odds'})

        if table:
            df = pd.read_html(io.StringIO(str(table)))[0]

            all_data = pd.concat([all_data, df], ignore_index=True)

        else:
        
            print(f"No table found for {season}")

    else:
        print(f"Failed to retrieve data for {season}")

    return all_data

In [None]:
preseason_odds = scrape_preseason_odds(2025)

In [205]:
avg_n_odds = pd.merge(avg_roster, preseason_odds, left_on='team_full_name', right_on='Team', how='left')

avg_n_odds.drop(columns='Team', inplace=True)


In [206]:
avg_n_odds.sample(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds
26,2025,SAS,San Antonio Spurs,24.3,3.8,207.0,78.55,10000
8,2025,DET,Detroit Pistons,24.444444,3.055556,209.4,78.555556,100000
24,2025,POR,Portland Trail Blazers,24.4,3.05,208.736842,79.3,100000
12,2025,LAC,Los Angeles Clippers,27.0,5.666667,216.470588,77.952381,3000
13,2025,LAL,Los Angeles Lakers,25.7,4.4,216.941176,79.4,3000


### Append salary stats

In [969]:
def clean_salaries(salaries):
    
    salaries = salaries.head(25).copy()

    salaries.rename(columns={"Unnamed: 1":"player_name"}, inplace=True)
    
    salaries.loc[:, 'Salary'] = salaries['Salary'].fillna(0)

    salaries.loc[:, 'Salary'] = salaries['Salary'].replace({r'\$': '', ',': ''}, regex=True).astype(int)

    salaries = salaries.sort_values(by='Salary', ascending=False)

    salaries = salaries.drop(columns=['Rk'])

    return salaries

In [491]:
# https://www.basketball-reference.com/teams/CHO/2025.html

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def scrape_salaries(
        season,
        team_names,
        driver,
    ):

    all_data = pd.DataFrame()

    for team in team_names:

        url = f"https://www.basketball-reference.com/teams/{team}/{season}.html"

        driver.get(url)

        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        div = soup.find('div', {'id': 'div_salaries2'})
        
        if div:
            table = div.find('table', {'id': 'salaries2'})
            if table:
                df = pd.read_html(io.StringIO(str(table)))[0]
                df['Season'] = season
                df['team']= team
                df = clean_salaries(df)
                all_data = pd.concat([all_data, df], ignore_index=True)
            else:
                print(f"No table found inside the div for {team} in {season}")
        else:
            print(f"No div with id 'div_salaries2' found for {team} in {season}")
            
    return all_data

In [None]:
driver = webdriver.Chrome() 

salaries = scrape_salaries(
    2024, 
    team_names,
    driver,
)

driver.quit()

#### Finding the number of top players per team per season

In [497]:
def count_top_players(salaries,threshold=20):
    # based on the teams salaries we are trying to identify the numbers of stars/top players on the team (a good example is the Phoenix Suns in 2025)

    above = salaries[salaries['salary_percentage'] > threshold].shape[0]
    below = salaries[salaries['salary_percentage'] <= threshold].shape[0]
    
    return pd.Series({'top_players': above, 'not_top_players': below})

In [498]:
def find_top_players(salaries):
    total_salary = salaries.groupby(['team', 'Season'])['Salary'].sum().reset_index(name='total_salary')

    salaries = salaries.merge(total_salary, on=['team', 'Season'], how='left')

    salaries['salary_percentage'] = (salaries['Salary'] / salaries['total_salary']) * 100

    top_players = salaries.groupby(['team', 'Season']).apply(count_top_players).reset_index()
    
    return top_players

In [435]:
salaries[salaries['team']=='LAL'].sort_values('salary_percentage', ascending=False).head(20)

Unnamed: 0,player_name,Salary,Season,team,total_salary,salary_percentage
194,LeBron James,48728845,2025,LAL,187028846,26.054187
195,Anthony Davis,43219440,2025,LAL,187028846,23.108435
196,D'Angelo Russell,18692307,2025,LAL,187028846,9.994344
197,Rui Hachimura,17000000,2025,LAL,187028846,9.089507
198,Austin Reaves,12976362,2025,LAL,187028846,6.938161
199,Gabe Vincent,11000000,2025,LAL,187028846,5.881446
200,Jarred Vanderbilt,10714286,2025,LAL,187028846,5.728681
201,Max Christie,7142857,2025,LAL,187028846,3.819121
202,Jalen Hood-Schifino,3879840,2025,LAL,187028846,2.074461
203,Dalton Knecht,3819120,2025,LAL,187028846,2.041995


In [430]:
team_salary_stats = salaries.groupby(['team', 'Season']).agg(
    highest_salary=('Salary', 'max'),
    median_salary=('Salary', 'median'),
    total_salary=('Salary', 'sum'),
    
).reset_index()

top_players = find_top_players(salaries)

team_salary_stats = pd.merge(team_salary_stats, top_players, on=['team', 'Season'], how="left")

In [431]:
team_salary_stats.sample(3)

Unnamed: 0,team,Season,highest_salary,median_salary,total_salary,top_players,not_top_players
10,HOU,2025,42846615,9249960.0,165026987,1,14
29,WAS,2025,29651786,6692959.0,158877689,0,16
18,NOP,2025,36725670,5159854.0,163702985,3,8


In [433]:
avg_odds_salary_players = pd.merge(avg_n_odds, team_salary_stats,on=['team', 'Season'], how='left')


In [437]:
avg_odds_salary_players.sample(10)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players
2,2025,BRK,Brooklyn Nets,24.947368,3.947368,214.111111,79.052632,100000,40338144,4041249.0,165717974,1,14
5,2025,CLE,Cleveland Cavaliers,25.75,4.2,208.263158,77.95,5000,36725670,8500000.0,161102772,2,11
15,2025,MIA,Miami Heat,26.7,5.05,218.75,78.0,4000,48798677,4655040.0,184058374,1,12
18,2025,NOP,New Orleans Pelicans,25.75,3.15,210.941176,78.4,5000,36725670,5159854.0,163702985,3,8
29,2025,WAS,Washington Wizards,24.941176,3.941176,219.5,79.470588,100000,29651786,6692959.0,158877689,0,16
17,2025,MIN,Minnesota Timberwolves,26.210526,4.578947,220.533333,79.263158,850,49205800,5287710.0,205560682,3,11
1,2025,BOS,Boston Celtics,26.318182,4.727273,220.315789,79.318182,300,49700000,3256341.5,201620656,1,15
25,2025,SAC,Sacramento Kings,26.0,4.611111,212.071429,78.388889,6600,40500000,8000000.0,167640684,2,11
14,2025,MEM,Memphis Grizzlies,25.05,3.15,213.266667,78.9,4000,36725670,4858705.5,177014145,1,15
20,2025,OKC,Oklahoma City Thunder,24.0,2.8125,204.833333,78.5,900,35859950,6669000.0,159241956,1,14


In [436]:
avg_odds_salary_players.to_csv('data/2025_team_data.csv')

### Append Franchise History/Championship stats

In [606]:
def clean_champions(df):

    nba_champions = df[[('Unnamed: 0_level_0', 'Year'), ('Finals', 'Champion')]]
    nba_champions.columns = ['Year', 'Champion']

    nba_champions = nba_champions[nba_champions.Year.notna()]

    nba_champions.Year = nba_champions.Year.astype(int)

    return nba_champions

In [607]:
# https://www.basketball-reference.com/playoffs/

def scrape_champions():
    df = pd.DataFrame()

    url = f"https://www.basketball-reference.com/playoffs/"

    response = requests.get(url)

    # print(response)
    # print(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'champions_index'})

        if table:
            df = pd.read_html(io.StringIO(str(table)))[0]
            df = clean_champions(df)
        else:
        
            print(f"No table found ")

    else:
        print(f"Failed to retrieve data ")

    return df

In [1023]:
# Count number of championships for a given team and season

def nb_championships(team_names, season):

    nba_champions = scrape_champions()

    rev_team_names = {v: k for k, v in team_names.items()}

    nba_champions['team'] = nba_champions['Champion'].map(rev_team_names)
    
    team_championships = []
    
    for team in team_names:
        nb_total = len(nba_champions[
            (nba_champions['team'] == team) & 
            (nba_champions['Year'] <= season)
            ]
        )
        
        nb_interval = len(nba_champions[
            (nba_champions['team'] == team) & 
            (nba_champions['Year'] <= season) & 
            (nba_champions['Year'] > season-4)
            ]
        )
        
        team_championships.append({
            'team': team,
            'Season': season,
            'nb_championships': nb_total,
            'nb_champ_past_4y': nb_interval,
            'winner': len(nba_champions[
                (nba_champions['team'] == team) & 
                (nba_champions['Year'] == season)
                ]
            ),

        })
    
    df = pd.DataFrame(team_championships)
    
    return df


In [1027]:
nb_championships(team_names,2020).sample(5)

Unnamed: 0,team,Season,nb_championships,nb_champ_past_4y,winner
27,TOR,2020,1,1,0
9,GSW,2020,4,2,0
24,POR,2020,1,0,0
0,ATL,2020,0,0,0
7,DEN,2020,0,0,0


In [716]:
team_championships = nb_championships(team_names,2025)

In [717]:

avg_odds_salary_players_champ = avg_odds_salary_players.merge(team_championships, on=['team', 'Season'], how='left')

In [718]:
avg_odds_salary_players_champ.tail(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players,nb_championships,winner
25,2025,SAC,Sacramento Kings,26.0,4.611111,212.071429,78.388889,6600,40500000,8000000.0,167640684,2,11,0,0
26,2025,SAS,San Antonio Spurs,24.3,3.8,207.0,78.55,10000,29347826,9104643.5,142918889,1,13,5,0
27,2025,TOR,Toronto Raptors,25.368421,3.631579,210.615385,78.210526,50000,32500000,10130980.0,155718137,1,12,1,0
28,2025,UTA,Utah Jazz,24.722222,3.777778,220.5,78.833333,50000,42176400,5469120.0,128678582,2,9,0,0
29,2025,WAS,Washington Wizards,24.941176,3.941176,219.5,79.470588,100000,29651786,6692959.0,158877689,0,16,0,0


### Append ranking


In [1037]:
def clean_ranking(df):
    df = df.sort_values(by='W/L%', ascending=False)
    
    df.rename(columns={df.columns[0]: 'team_full_name'}, inplace=True)

    df = df[~df['team_full_name'].str.contains('Division|Conference', na=False)]

    df[df.columns[0]] = df[df.columns[0]].replace({r'\*': ''}, regex=True)
    
    df = df.reset_index(drop=True)

    df['ranking'] = df.index + 1

    df['ranking'] = df['ranking'].astype(int)

    df = df[['Season',df.columns[0], 'conference','ranking']]
    return df


In [996]:
# https://www.basketball-reference.com/leagues/NBA_2024_standings.html

def scrape_ranking(season):
    all_data = pd.DataFrame()

    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html"

    response = requests.get(url)

    # print(response)
    # print(url)

    for conf in ['W','E']:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': f"confs_standings_{conf}"})

            if table:
                df = pd.read_html(io.StringIO(str(table)))[0]
                
                df['Season'] = season
                if conf == 'W':
                    df["conference"] = "WEST" 
                else:
                    df['conference'] = "EAST"
                    
                df = clean_ranking(df)

                all_data = pd.concat([all_data, df], ignore_index=True)

            else:
                print(f"No table found for {season}. Looking in Division Standings...")

                soup = BeautifulSoup(response.content, 'html.parser')
                table = soup.find('table', {'id': f"divs_standings_{conf}"})

                if table:
                    df = pd.read_html(io.StringIO(str(table)))[0]
                    
                    df['Season'] = season
                    if conf == 'W':
                        df["conference"] = "WEST" 
                    else:
                        df['conference'] = "EAST"
                    
                    df = clean_ranking(df)

                    all_data = pd.concat([all_data, df], ignore_index=True)
                else:
                    print(f"No table found for {season} in Division Standings.")

        else:
            print(f"Failed to retrieve data for {season}")

    return all_data

In [947]:
ranking = scrape_ranking(2024)

In [948]:
avg_odds_salary_players_champ_rk = avg_odds_salary_players_champ.merge(ranking, on=['team_full_name', 'Season'], how='left')

In [949]:
avg_odds_salary_players_champ_rk.head(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players,nb_championships,winner,conference,ranking
0,2025,ATL,Atlanta Hawks,25.333333,3.52381,215.947368,79.0,15000,43031940,6059520.0,169529070,1,14,0,0,,
1,2025,BOS,Boston Celtics,26.318182,4.727273,220.315789,79.318182,300,49700000,3256341.5,201620656,1,15,18,0,,
2,2025,BRK,Brooklyn Nets,24.947368,3.947368,214.111111,79.052632,100000,40338144,4041249.0,165717974,1,14,0,0,,
3,2025,CHI,Chicago Bulls,24.952381,3.619048,213.5,78.47619,50000,43031940,7000000.0,166079908,1,14,6,0,,
4,2025,CHO,Charlotte Hornets,25.5,3.4,210.722222,78.7,100000,35147000,5250000.0,153709214,1,16,0,0,,


### Loop for multiple seasons :o

In [1062]:
def scrape_all_rosters():
    all_rosters = pd.DataFrame()

    for season in range(start, end):  
        print(f"Scraping data for the {season-1}-{season} season...")

        roster = scrape_roster(season,team_names)

        if roster is not None:
            all_rosters = pd.concat([all_rosters, roster], ignore_index=True)

    all_avg_roster = team_avg_roster(all_rosters)
    
    return all_avg_roster

all_avg_roster = scrape_all_rosters()

Scraping data for the 2009-2010 season...
Failed to retrieve data for 2010
Failed to retrieve data for 2010


In [961]:
all_avg_roster.to_csv(f"data/temp/{start}_{end-1}_avg_roster.csv")

In [964]:
def scrape_all_preseason_odds():
    all_preseason_odds = pd.DataFrame()

    for season in range(start, end):  
        print(f"Scraping data for the {season-1}-{season} season...")

        preseason_odds = scrape_preseason_odds(season)

        if preseason_odds is not None:
            all_preseason_odds = pd.concat([all_preseason_odds, preseason_odds], ignore_index=True)

    return all_preseason_odds

all_preseason_odds = scrape_all_preseason_odds()

all_avg_n_odds = pd.merge(all_avg_roster, preseason_odds, left_on='team_full_name', right_on='Team', how='left')

all_avg_n_odds.drop(columns='Team', inplace=True)


Scraping data for the 2014-2015 season...
Scraping data for the 2015-2016 season...
Scraping data for the 2016-2017 season...
Scraping data for the 2017-2018 season...
Scraping data for the 2018-2019 season...


In [965]:
all_avg_n_odds.to_csv(f"data/temp/{start}_{end-1}_avg_n_odds.csv")

In [966]:
def scrape_all_salaries():
    all_salaries = pd.DataFrame()

    driver = webdriver.Chrome() 

    for season in range(start, end):  
        print(f"Scraping data for the {season-1}-{season} season...")

        salaries = scrape_salaries(
            season,
            team_names,
            driver,
        )

        if salaries is not None:
            all_salaries = pd.concat([all_salaries, salaries], ignore_index=True)

    driver.quit()
    
    return all_salaries

In [None]:
all_salaries = scrape_all_salaries()

In [971]:
all_salaries.to_csv(f"data/temp/{start}_{end-1}_salaries_stats.csv")

In [972]:
all_top_players = find_top_players(all_salaries)

team_salary_stats = all_salaries.groupby(['team', 'Season']).agg(
    highest_salary=('Salary', 'max'),
    median_salary=('Salary', 'median'),
    total_salary=('Salary', 'sum'),
    
).reset_index()

all_team_salary_stats = pd.merge(team_salary_stats, all_top_players, on=['team', 'Season'], how="left")

  top_players = salaries.groupby(['team', 'Season']).apply(count_top_players).reset_index()


In [973]:
all_team_salary_stats.sample(5)

Unnamed: 0,team,Season,highest_salary,median_salary,total_salary,top_players,not_top_players
117,PHO,2017,14000000,2223600.0,85115778,0,21
61,LAC,2016,21468696,1215322.0,95585714,2,16
95,NYK,2015,23410988,1058622.0,83022165,2,18
98,NYK,2018,20566802,2274670.0,104405157,0,22
45,GSW,2015,15012000,1145685.0,73623654,1,18


In [974]:
all_avg_odds_salary_players = pd.merge(all_avg_n_odds, all_team_salary_stats,on=['team', 'Season'], how='left')

In [975]:
all_avg_odds_salary_players.sample(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players
32,2017,DAL,Dallas Mavericks,26.75,3.833333,213.041667,78.041667,1000,25000000,680937.0,103974075,2,23
128,2018,SAC,Sacramento Kings,25.666667,4.0,222.166667,79.166667,6600,12307692,2365560.0,95682824,0,23
107,2017,ORL,Orlando Magic,25.473684,3.631579,219.368421,78.684211,8000,17000000,4130580.0,104110336,0,18
38,2018,DEN,Denver Nuggets,25.777778,4.666667,220.722222,78.666667,750,31269231,2952687.5,107015203,1,17
90,2015,NOP,New Orleans Pelicans,25.47619,3.666667,213.714286,78.142857,5000,14898938,981084.0,69390028,1,18


In [976]:
def all_nba_championships(team_names,seasons_list):
    
    all_team_championships = pd.DataFrame()

    for season in seasons_list:
        team_championships = nb_championships(team_names,season)
        
        all_team_championships = pd.concat([all_team_championships, team_championships], ignore_index=True)

    return all_team_championships

In [977]:
all_team_championships = all_nba_championships(team_names,seasons_list)

In [978]:
all_avg_odds_salary_players_champ = all_avg_odds_salary_players.merge(all_team_championships, on=['team', 'Season'], how='left')

In [979]:
all_avg_odds_salary_players_champ[all_avg_odds_salary_players_champ.team=='BOS']

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players,nb_championships,winner
5,2015,BOS,Boston Celtics,26.136364,4.545455,218.454545,78.227273,300,10105855,2075760.0,61092622,0,21,17,0
6,2016,BOS,Boston Celtics,24.588235,3.058824,221.823529,78.294118,300,15493680,2107778.5,77202316,1,21,17,0
7,2017,BOS,Boston Celtics,25.133333,4.2,219.533333,77.866667,300,26540100,2500227.0,93465328,1,19,17,0
8,2018,BOS,Boston Celtics,25.3,2.7,219.85,77.6,300,29727900,1988520.0,115075693,2,19,17,0
9,2019,BOS,Boston Celtics,25.588235,3.764706,225.176471,78.470588,300,31214295,2667600.0,125541941,2,17,17,0


In [980]:
all_avg_odds_salary_players_champ.sample(5)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players,nb_championships,winner
58,2018,IND,Indiana Pacers,25.684211,4.0,223.684211,78.578947,5000,21000000,1524305.0,93669385,1,22,3,0
80,2015,MIL,Milwaukee Bucks,24.85,3.55,220.4,78.8,1100,11000000,1873200.0,61443341,0,21,1,0
108,2018,ORL,Orlando Magic,25.210526,3.736842,217.736842,78.526316,8000,17000000,4078320.0,95538311,0,19,0,0
117,2017,PHO,Phoenix Suns,25.555556,4.333333,218.388889,77.888889,3500,14000000,2223600.0,85115778,0,21,0,0
26,2016,CLE,Cleveland Cavaliers,28.833333,6.777778,228.944444,79.0,5000,22971000,4950000.0,105962520,1,14,1,1


In [981]:
all_avg_odds_salary_players_champ.to_csv(f"data/temp/{start}_{end-1}_avg_odds_salary_players_champ.csv")

In [982]:
def all_scrape_ranking(seasons_list):
    
    all_ranking = pd.DataFrame()

    for season in seasons_list:
        ranking = scrape_ranking(season)
        all_ranking = pd.concat([all_ranking, ranking], ignore_index=True)

    return all_ranking

In [1000]:
all_ranking = all_scrape_ranking(seasons_list)

No table found for 2015. Looking in Division Standings...
No table found for 2015. Looking in Division Standings...


In [1002]:
all_avg_odds_salary_players_champ_rk = all_avg_odds_salary_players_champ.merge(all_ranking, on=['team_full_name', 'Season'], how='left')

In [1005]:
all_avg_odds_salary_players_champ_rk.head(20)

Unnamed: 0,Season,team,team_full_name,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players,nb_championships,winner,conference,ranking
0,2015,ATL,Atlanta Hawks,27.0,4.75,224.25,78.75,15000,12000000,2000000.0,58337671,1,16,0,0,EAST,1
1,2016,ATL,Atlanta Hawks,27.352941,5.294118,218.0,78.588235,15000,19000000,2435000.0,71453126,1,15,0,0,EAST,4
2,2017,ATL,Atlanta Hawks,28.2,5.9,222.05,78.8,15000,23180275,2299942.5,96007250,2,18,0,0,EAST,5
3,2018,ATL,Atlanta Hawks,25.5,2.681818,208.181818,78.363636,15000,16910113,1662500.0,99574321,0,25,0,0,EAST,15
4,2019,ATL,Atlanta Hawks,25.0,3.181818,219.136364,79.136364,15000,25534253,2275020.0,107076435,1,23,0,0,EAST,12
5,2015,BOS,Boston Celtics,26.136364,4.545455,218.454545,78.227273,300,10105855,2075760.0,61092622,0,21,17,0,EAST,7
6,2016,BOS,Boston Celtics,24.588235,3.058824,221.823529,78.294118,300,15493680,2107778.5,77202316,1,21,17,0,EAST,5
7,2017,BOS,Boston Celtics,25.133333,4.2,219.533333,77.866667,300,26540100,2500227.0,93465328,1,19,17,0,EAST,1
8,2018,BOS,Boston Celtics,25.3,2.7,219.85,77.6,300,29727900,1988520.0,115075693,2,19,17,0,EAST,2
9,2019,BOS,Boston Celtics,25.588235,3.764706,225.176471,78.470588,300,31214295,2667600.0,125541941,2,17,17,0,EAST,4


In [1006]:
all_avg_odds_salary_players_champ_rk.to_csv(f"data/temp/{start}_{end-1}_avg_odds_salary_players_champ_rk.csv")

#### Concatenate datasets

In [1008]:
df_2020_2025 = pd.read_csv('data/2020_2025_avg_odds_salary_players_champ_rk.csv', index_col=False)
df_2015_2019 = pd.read_csv('data/2015_2019_avg_odds_salary_players_champ_rk.csv',  index_col=False)

df_2020_2025 = df_2020_2025.iloc[:, 1:]
df_2015_2019 = df_2015_2019.iloc[:, 1:]

In [1015]:
df_2020_2025.Season.unique()


array([2023, 2024, 2025, 2020, 2021, 2022])

In [1016]:
df_2015_2019.Season.unique()

array([2015, 2016, 2017, 2018, 2019])

In [1017]:
final_df = pd.concat([df_2020_2025, df_2015_2019], ignore_index=True)

In [1046]:
# final_df['ranking'] = final_df['ranking'].astype("Int64")

In [1048]:
final_df.ranking.unique()

<IntegerArray>
[8, 10, 2, 1, 6, 11, 9, 14, 13, 4, 5, 15, 7, 3, 12, <NA>]
Length: 16, dtype: Int64

In [1059]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Season            330 non-null    int64  
 1   team              330 non-null    object 
 2   team_full_name    330 non-null    object 
 3   conference        330 non-null    object 
 4   avg_age           330 non-null    float64
 5   avg_exp           330 non-null    float64
 6   avg_weight        330 non-null    float64
 7   avg_height        330 non-null    float64
 8   Odds              330 non-null    int64  
 9   highest_salary    330 non-null    int64  
 10  median_salary     330 non-null    float64
 11  total_salary      330 non-null    int64  
 12  top_players       330 non-null    int64  
 13  not_top_players   330 non-null    int64  
 14  nb_championships  330 non-null    int64  
 15  nb_champ_past_4y  330 non-null    int64  
 16  winner            330 non-null    int64  
 1

In [1058]:
final_df

Unnamed: 0,Season,team,team_full_name,conference,avg_age,avg_exp,avg_weight,avg_height,Odds,highest_salary,median_salary,total_salary,top_players,not_top_players,nb_championships,nb_champ_past_4y,winner,ranking
0,2023,ATL,Atlanta Hawks,EAST,24.700000,3.350000,211.050000,78.500000,15000,37096500,2755015.5,148633740,1,19,0,0,0,8
1,2024,ATL,Atlanta Hawks,EAST,25.789474,4.368421,210.631579,78.263158,15000,40064220,2753441.0,157259361,1,21,0,0,0,10
2,2023,BOS,Boston Celtics,EAST,26.722222,5.333333,225.388889,78.944444,300,30351780,3903140.5,175733265,0,18,17,0,0,2
3,2024,BOS,Boston Celtics,EAST,25.894737,4.684211,216.631579,79.263158,300,36861707,2019706.0,183679244,1,20,18,1,1,1
4,2023,BRK,Brooklyn Nets,EAST,27.000000,5.120000,218.560000,78.800000,100000,35448672,2138160.0,157516641,1,22,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,2015,WAS,Washington Wizards,EAST,28.500000,7.111111,222.000000,78.388889,100000,14746000,4099346.0,83108421,0,18,0,0,0,5
326,2016,WAS,Washington Wizards,EAST,27.947368,6.263158,224.473684,79.105263,100000,15851950,3650000.0,84237317,0,18,0,0,0,10
327,2017,WAS,Washington Wizards,EAST,25.666667,3.777778,217.555556,79.000000,100000,22116750,3128705.5,102334382,1,17,0,0,0,4
328,2018,WAS,Washington Wizards,EAST,27.312500,5.562500,213.562500,78.500000,100000,24773250,2046520.0,124030914,0,20,0,0,0,8


In [1020]:
final_df.to_csv(f"data/2015_2025_avg_odds_salary_players_champ_rk.csv")

### Append Conference field to Final Df

In [957]:
# final_df['conference'] = final_df['team'].map(team_conferences)

# final_df.insert(3, 'conference', final_df.pop('conference'))

# final_df.to_csv(f"data/2020_2025_avg_odds_salary_players_champ_rk.csv")

### Append nb_champ_past_4y field to Final Df


In [1033]:
# team_championships = []
# nba_champions = scrape_champions()

# for i, row in final_df.iterrows():
#     team = row['team']  
#     season = row['Season']     

#     nb_interval = len(nba_champions[
#         (nba_champions['team'] == team) & 
#         (nba_champions['Year'] <= season) & 
#         (nba_champions['Year'] > season-4)
#         ]
#     )
    
#     nb_past_championships = len(nba_champions[(nba_champions['team'] == team) & 
#                                                 (nba_champions['Year'] <= season) & 
#                                                 (nba_champions['Year'] > season-4)])
    
#     team_championships.append(nb_past_championships)


# final_df['nb_champ_past_4y'] = team_championships

# column_to_move = final_df.pop("nb_champ_past_4y")

# final_df.insert(15, "nb_champ_past_4y", column_to_move)

In [1057]:
# final_df

In [1056]:
# final_df.to_csv(f"data/2015_2025_avg_odds_salary_players_champ_rk.csv")