In [None]:
# !pip install nba_api
import pandas as pd
from nba_api.stats.static import teams as nba_teams
from nba_api.stats.endpoints import commonteamroster, leaguegamefinder, leaguestandingsv3
import requests
from bs4 import BeautifulSoup

### Team Stats

In [2]:
teams = pd.DataFrame.from_records(nba_teams.get_teams())
teams = teams.merge(
    leaguestandingsv3.LeagueStandingsV3("00", "2024").standings.get_data_frame()[["TeamID", "Conference", "Division"]],
    left_on="id", right_on="TeamID"
).drop(columns="TeamID")
teams

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded,Conference,Division
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949,East,Southeast
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946,East,Atlantic
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970,East,Central
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002,West,Southwest
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966,East,Central
5,1610612742,Dallas Mavericks,DAL,Mavericks,Dallas,Texas,1980,West,Southwest
6,1610612743,Denver Nuggets,DEN,Nuggets,Denver,Colorado,1976,West,Northwest
7,1610612744,Golden State Warriors,GSW,Warriors,Golden State,California,1946,West,Pacific
8,1610612745,Houston Rockets,HOU,Rockets,Houston,Texas,1967,West,Southwest
9,1610612746,Los Angeles Clippers,LAC,Clippers,Los Angeles,California,1970,West,Pacific


In [3]:
teams.to_csv("./Data/teams.csv", index=False)

### Players' Sizes and Ages

In [4]:
players = pd.concat(
    [commonteamroster.CommonTeamRoster(tid, year).common_team_roster.get_data_frame()[["TeamID", "SEASON", "PLAYER", "POSITION", "HEIGHT", "WEIGHT", "BIRTH_DATE", "PLAYER_ID"]]
     for tid in teams.id for year in [2020, 2021, 2022, 2023, 2024]],
     ignore_index=True
)
players.head()

Unnamed: 0,TeamID,SEASON,PLAYER,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_ID
0,1610612737,2020,Brandon Goodwin,G,6-0,180,"OCT 02, 1995",1629164
1,1610612737,2020,Nathan Knight,F-C,6-10,253,"SEP 20, 1997",1630233
2,1610612737,2020,Kevin Huerter,G-F,6-7,190,"AUG 27, 1998",1628989
3,1610612737,2020,Skylar Mays,G,6-4,205,"SEP 05, 1997",1630219
4,1610612737,2020,Lou Williams,G,6-1,175,"OCT 27, 1986",101150


In [11]:
players["BIRTH_DATE"] = pd.to_datetime(players["BIRTH_DATE"])
players.to_csv("./Data/players.csv", index=False)

  players["BIRTH_DATE"] = pd.to_datetime(players["BIRTH_DATE"])


### Game Stats

In [6]:
reg_season = leaguegamefinder.LeagueGameFinder(league_id_nullable="00", season_type_nullable="Regular Season").league_game_finder_results.get_data_frame()
reg_season = reg_season.loc[(reg_season.SEASON_ID.str[-4:].isin(["2020", "2021", "2022", "2023", "2024"]))&(reg_season.MATCHUP.str.contains("@"))]
reg_season = reg_season[["SEASON_ID", "GAME_ID", "GAME_DATE", "MATCHUP", "PTS", "PLUS_MINUS"]]
reg_season["HOME_TEAM_ABBR"] = reg_season.MATCHUP.str[-3:]
reg_season["AWAY_TEAM_ABBR"] = reg_season.MATCHUP.str[:3]
reg_season["HOME_POINTS"] = reg_season.PTS + reg_season.PLUS_MINUS
reg_season.rename(columns={"PTS": "AWAY_POINTS"}, inplace=True)
reg_season["IS_PLAYOFFS"] = False
reg_season["SEASON_ID"] = reg_season.SEASON_ID.str[1:]
reg_season = reg_season[["SEASON_ID", "GAME_ID", "GAME_DATE", "HOME_TEAM_ABBR", "AWAY_TEAM_ABBR", "HOME_POINTS", "AWAY_POINTS", "IS_PLAYOFFS"]]

playoffs = leaguegamefinder.LeagueGameFinder(league_id_nullable="00", season_type_nullable="Playoffs").get_data_frames()[0]
playoffs = playoffs.loc[(playoffs.SEASON_ID.str[-4:].isin(["2020", "2021", "2022", "2023", "2024"]))&(playoffs.MATCHUP.str.contains("@"))]
playoffs = playoffs[["SEASON_ID", "GAME_ID", "GAME_DATE", "MATCHUP", "PTS", "PLUS_MINUS"]]
playoffs["HOME_TEAM_ABBR"] = playoffs.MATCHUP.str[-3:]
playoffs["AWAY_TEAM_ABBR"] = playoffs.MATCHUP.str[:3]
playoffs["HOME_POINTS"] = playoffs.PTS + playoffs.PLUS_MINUS
playoffs.rename(columns={"PTS": "AWAY_POINTS"}, inplace=True)
playoffs["IS_PLAYOFFS"] = True
playoffs["SEASON_ID"] = playoffs.SEASON_ID.str[1:]
playoffs = playoffs[["SEASON_ID", "GAME_ID", "GAME_DATE", "HOME_TEAM_ABBR", "AWAY_TEAM_ABBR", "HOME_POINTS", "AWAY_POINTS", "IS_PLAYOFFS"]]

games = pd.concat([reg_season, playoffs], ignore_index=True)
games.head()

Unnamed: 0,SEASON_ID,GAME_ID,GAME_DATE,HOME_TEAM_ABBR,AWAY_TEAM_ABBR,HOME_POINTS,AWAY_POINTS,IS_PLAYOFFS
0,2024,22401191,2025-04-13,PHI,CHI,142.0,122,False
1,2024,22401199,2025-04-13,POR,LAL,53.0,81,False
2,2024,22401197,2025-04-13,SAS,TOR,111.0,118,False
3,2024,22401187,2025-04-13,BOS,CHA,79.0,86,False
4,2024,22401190,2025-04-13,MIA,WAS,120.0,119,False


In [7]:
games.to_csv("./Data/games.csv", index=False)

### Player Game Stats

In [43]:
players_game_stats = pd.concat([
    pd.concat([
        leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation="P", league_id_nullable="00", season_type_nullable="Regular Season", season_nullable=season)\
        .league_game_finder_results.get_data_frame()[[
            "PLAYER_ID", "GAME_ID", 
            "MIN", "PTS", "FGM", "FG_PCT", "FG3M", "FG3_PCT",
            "FTM", "FT_PCT", "REB", "AST", "STL", "BLK", "TOV"
        ]]
        for season in ["2020-21", "2021-22", "2022-23", "2023-24", "2024-25"]],
    ignore_index=True),
    pd.concat([
        leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation="P", league_id_nullable="00", season_type_nullable="Playoffs", season_nullable=season)\
        .league_game_finder_results.get_data_frame()[[
            "PLAYER_ID", "GAME_ID", 
            "MIN", "PTS", "FGM", "FG_PCT", "FG3M", "FG3_PCT",
            "FTM", "FT_PCT", "REB", "AST", "STL", "BLK", "TOV"
        ]]
        for season in ["2020-21", "2021-22", "2022-23", "2023-24", "2024-25"]],
    ignore_index=True),
], ignore_index=True)
# players_game_stats["SEASON_ID"] = players_game_stats.SEASON_ID.str[1:]
players_game_stats["PLAYER_ID"] = players_game_stats.PLAYER_ID.astype("str").str.rstrip(".0")
players_game_stats.head()

Unnamed: 0,PLAYER_ID,GAME_ID,MIN,PTS,FGM,FG_PCT,FG3M,FG3_PCT,FTM,FT_PCT,REB,AST,STL,BLK,TOV
0,162899,22001080,10,0,0,0.0,0,0.0,0,,0,0,0,0,0
1,1629023,22001080,30,11,4,0.4,3,0.333,0,,9,4,2,1,2
2,1630173,22001069,42,23,10,0.625,0,,3,0.429,10,0,2,1,2
3,203932,22001076,12,2,1,0.25,0,0.0,0,,2,0,0,0,0
4,1626181,22001076,33,19,5,0.556,3,0.6,6,1.0,3,1,0,0,2


In [44]:
players_game_stats.shape

(136668, 15)

In [45]:
players_game_stats.to_csv("./Data/players_game_stats.csv", index=False)

### Team Season Stats

In [118]:
dfs = []
for season in ["2020-21", "2021-22", "2022-23", "2023-24", "2024-25"]:
    temp = leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation="T", league_id_nullable="00", season_type_nullable="Regular Season", season_nullable=season)\
        .league_game_finder_results.get_data_frame()[[
            "TEAM_ID", "SEASON_ID", "WL", "PTS", "PLUS_MINUS"
        ]]
    temp["WINS"] = temp["WL"] == "W"
    temp["LOSSES"] = temp["WL"] == "L"
    temp["PLUS_MINUS"] = temp["PTS"] + temp["PLUS_MINUS"].astype("int")
    temp = temp.groupby(["TEAM_ID", "SEASON_ID"])[["WINS", "LOSSES"]].sum().reset_index()
    dfs.append(temp)
teams_game_stats = pd.concat(dfs, ignore_index=True)
teams_game_stats["SEASON_ID"] = teams_game_stats.SEASON_ID.str[1:]
teams_game_stats.head()

Unnamed: 0,TEAM_ID,SEASON_ID,WINS,LOSSES
0,1610612737,2020,41,31
1,1610612738,2020,36,36
2,1610612739,2020,22,50
3,1610612740,2020,31,41
4,1610612741,2020,31,41


In [119]:
dfs = []
for season in ["2020-21", "2021-22", "2022-23", "2023-24", "2024-25"]:
    temp = leaguegamefinder.LeagueGameFinder(player_or_team_abbreviation="T", league_id_nullable="00", season_type_nullable="Playoffs", season_nullable=season)\
        .league_game_finder_results.get_data_frame()[[
            "TEAM_ID", "SEASON_ID"
        ]]
    dfs.append(temp)
playoff_teams = pd.concat(dfs, ignore_index=True)
playoff_teams["SEASON_ID"] = playoff_teams.SEASON_ID.str[1:]
teams_game_stats = teams_game_stats.merge(playoff_teams.groupby("SEASON_ID").TEAM_ID.unique(),
                       on="SEASON_ID")
teams_game_stats.head()

Unnamed: 0,TEAM_ID_x,SEASON_ID,WINS,LOSSES,TEAM_ID_y
0,1610612737,2020,41,31,"[1610612756, 1610612749, 1610612737, 161061274..."
1,1610612738,2020,36,36,"[1610612756, 1610612749, 1610612737, 161061274..."
2,1610612739,2020,22,50,"[1610612756, 1610612749, 1610612737, 161061274..."
3,1610612740,2020,31,41,"[1610612756, 1610612749, 1610612737, 161061274..."
4,1610612741,2020,31,41,"[1610612756, 1610612749, 1610612737, 161061274..."


In [126]:
teams_game_stats["made_playoffs"] = [x in y for x, y in zip(teams_game_stats.TEAM_ID_x, teams_game_stats.TEAM_ID_y)]
teams_game_stats = teams_game_stats.rename(columns={
    "TEAM_ID_x": "team_id",
    "SEASON_ID": "season_id", 
    "WINS": "wins",
    "LOSSES": "losses"
}).drop(columns=["TEAM_ID_y"])
teams_game_stats.head()

Unnamed: 0,team_id,season_id,wins,losses,made_playoffs
0,1610612737,2020,41,31,True
1,1610612738,2020,36,36,True
2,1610612739,2020,22,50,False
3,1610612740,2020,31,41,False
4,1610612741,2020,31,41,False


In [127]:
teams_game_stats.to_csv("./Data/team_season_stats.csv", index=False)

### Salaries

In [8]:
base_url = "https://www.espn.com/nba/salaries/"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
dfs = []
for season in [2021, 2022, 2023, 2024, 2025]: # season numbering is different than nba_api's 
    counter = 1
    while counter > 0:
        if counter == 1:
            url = f"{base_url}_/year/{season}/seasontype/1"
            counter += 1
        else:
            url = f"{base_url}_/year/{season}/page/{counter}/seasontype/1"
            counter += 1
        
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text)
        df = pd.DataFrame.from_records(
            [dict(zip(["Rank", "Name", "Team", "Salary"], [y.text for y in x.find_all("td")])) 
            for x in soup.find_all("table")[0].find_all("tr") if x.text != "RKNAMETEAMSALARY"])
        if len(df) > 0:
            df["Season"] = season - 1 # to match with nba_api season_id
            dfs.append(df)
        else:
            counter = 0
salaries = pd.concat(dfs, ignore_index=True)
salaries["Position"] = salaries["Name"].map(lambda x: x.split(", ")[-1])
salaries["Name"] = salaries["Name"].map(lambda x: x.split(", ")[0])
salaries["Salary"] = salaries["Salary"].map(lambda x: int(x.replace("$", "").replace(",", "")))
salaries.head()        

Unnamed: 0,Rank,Name,Team,Salary,Season,Position
0,1,Stephen Curry,Golden State Warriors,43006362,2020,PG
1,2,Russell Westbrook,Washington Wizards,41358814,2020,PG
2,3,Chris Paul,Phoenix Suns,41358814,2020,PG
3,4,James Harden,Brooklyn Nets,41254920,2020,SG
4,5,John Wall,Houston Rockets,41254920,2020,PG


In [9]:
salaries.to_csv("./Data/salaries.csv", index=False)

In [18]:
salaries["Season"] = salaries.Season.astype("str")
player_salaries = salaries.merge(players, left_on=["Name", "Season"], right_on=["PLAYER", "SEASON"])[["SEASON", "PLAYER_ID", "Salary"]]
player_salaries.head()

Unnamed: 0,SEASON,PLAYER_ID,Salary
0,2020,201939,43006362
1,2020,201566,41358814
2,2020,101108,41358814
3,2020,201935,41254920
4,2020,202322,41254920


In [17]:
player_salaries.to_csv("./Data/player_salaries.csv", index=False)