In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

**Per-game stats for players from the 2024 season.**

In [2]:
# Download the 2024 per-game stats page and keep a raw copy on disk.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_2024_per_game.html"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data = requests.get(player_stats_url, timeout=30)
# Fail loudly on 4xx/5xx so an error page is never saved as if it were data.
data.raise_for_status()
# Wait before continuing (presumably to space out requests to the site).
time.sleep(15)
with open("../player_stats/2024.html", "w", encoding="utf-8") as f:
    f.write(data.text)

In [3]:
# Start a Chrome session with Selenium's default driver setup. If a specific
# chromedriver binary is needed, pass
# service=Service(executable_path=<path to chromedriver>) instead.
driver = webdriver.Chrome()

In [4]:
url = player_stats_url

# Open the page in the browser session and scroll down; the short pause
# presumably gives late-loading content time to render.
driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(2)

# Save the browser's page source to the same path as the earlier raw download.
rendered_html = driver.page_source
with open("../player_stats/2024.html", "w+", encoding='utf-8') as out_file:
    out_file.write(rendered_html)

In [5]:
# Parse the saved per-game page into a DataFrame tagged with the season year.
dfs = []
with open("../player_stats/2024.html", encoding='utf-8') as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser')
player_table = soup.find(id="per_game_stats")
if player_table is None:
    raise ValueError("Table 'per_game_stats' not found in ../player_stats/2024.html")

# Rows marked class="thead" are header rows, not player data. Remove every one
# of them (not only the first) so no header text ends up as a data row; looping
# over find_all also avoids an AttributeError when the table has none.
for header_row in player_table.find_all('tr', class_="thead"):
    header_row.decompose()

# read_html takes a file-like object; passing a raw HTML string is deprecated.
player_df = pd.read_html(StringIO(str(player_table)))[0]
player_df["Year"] = 2024
dfs.append(player_df)

In [6]:
# Combine the collected frames and write them to CSV (the index is written too).
players_2024 = pd.concat(objs=dfs)
players_2024.to_csv(path_or_buf="../player_stats/2024_players.csv")

**Data for advanced stats for players from the 2024 season.**

In [7]:
# Download the 2024 advanced stats page and keep a raw copy on disk.
advanced_stats_url = "https://www.basketball-reference.com/leagues/NBA_2024_advanced.html"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data = requests.get(advanced_stats_url, timeout=30)
# Fail loudly on 4xx/5xx so an error page is never saved as if it were data.
data.raise_for_status()
# Wait before continuing (presumably to space out requests to the site).
time.sleep(15)

with open("../advanced_stats/2024.html", "w", encoding='utf-8') as f:
    f.write(data.text)

In [8]:
url = advanced_stats_url

# Open the page in the browser session and scroll down; the short pause
# presumably gives late-loading content time to render.
driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(2)

# Save the browser's page source to the same path as the earlier raw download.
with open("../advanced_stats/2024.html", "w+", encoding='utf-8') as f:
    f.write(driver.page_source)

# No later cell in this notebook uses the browser, so close it here instead of
# leaving a Chrome process running. Re-create `driver` before re-running any
# Selenium cell.
driver.quit()

In [9]:
# Parse the saved advanced-stats page into a DataFrame tagged with the season year.
dfs = []
with open("../advanced_stats/2024.html", encoding='utf-8') as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser')
player_table = soup.find(id="advanced_stats")
if player_table is None:
    raise ValueError("Table 'advanced_stats' not found in ../advanced_stats/2024.html")

# Rows marked class="thead" are header rows, not player data. Remove every one
# of them (not only the first) so no header text ends up as a data row; looping
# over find_all also avoids an AttributeError when the table has none.
for header_row in player_table.find_all('tr', class_="thead"):
    header_row.decompose()

# read_html takes a file-like object; passing a raw HTML string is deprecated.
player_df = pd.read_html(StringIO(str(player_table)))[0]
player_df["Year"] = 2024
dfs.append(player_df)

In [10]:
# Combine the collected frames and preview the first five rows.
players_advanced = pd.concat(objs=dfs)
players_advanced.head(5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,Year
0,1,Precious Achiuwa,PF-C,24,TOT,73,1605,14.7,0.544,0.208,...,1.3,2.2,3.4,0.103,,-1.7,0.3,-1.4,0.2,2024
1,1,Precious Achiuwa,C,24,TOR,25,437,15.0,0.512,0.276,...,0.0,0.4,0.4,0.048,,-1.4,-0.2,-1.7,0.0,2024
2,1,Precious Achiuwa,PF,24,NYK,48,1168,14.6,0.563,0.169,...,1.2,1.8,3.0,0.123,,-1.8,0.5,-1.3,0.2,2024
3,2,Bam Adebayo,C,26,MIA,70,2397,19.7,0.574,0.041,...,2.8,4.3,7.1,0.142,,0.7,1.6,2.4,2.7,2024
4,3,Ochai Agbaji,SG,23,TOT,77,1609,7.7,0.5,0.493,...,-0.4,0.6,0.2,0.005,,-3.4,-0.9,-4.4,-1.0,2024


In [11]:
# Drop the two unnamed filler columns. drop(..., errors="ignore") keeps this
# cell re-runnable: `del` raised KeyError once the columns were already gone.
players_advanced = players_advanced.drop(
    columns=["Unnamed: 19", "Unnamed: 24"], errors="ignore"
)

In [12]:
# Write the advanced stats to CSV (the index is written too).
players_advanced.to_csv(path_or_buf="../advanced_stats/2024_advanced_stats.csv")

**Cleaning and combining advanced stats and regular stats.**

In [13]:
# Reload the per-game CSV without the rank column and the saved index column.
players = pd.read_csv("../player_stats/2024_players.csv").drop(
    columns=["Rk", "Unnamed: 0"]
)

In [14]:
# Reload the advanced-stats CSV without the saved index column.
advanced_stats = pd.read_csv("../advanced_stats/2024_advanced_stats.csv").drop(
    columns=["Unnamed: 0"]
)

In [15]:
# Remove the literal "*" character from player names in both tables.
for frame in (players, advanced_stats):
    frame["Player"] = frame["Player"].str.replace("*", "", regex=False)

In [16]:
def single_row(df):
    """Collapse one player's rows for a season into a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows belonging to one (Player, Year) group; must have a "Tm" column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has exactly one row. Otherwise the row(s) whose
        "Tm" is "TOT", with "Tm" replaced by the "Tm" of the last row in
        ``df``. If no "TOT" row exists the result is empty.
    """
    if df.shape[0] == 1:
        return df

    last_team = df["Tm"].iloc[-1]
    # assign() builds a new frame, so the team label is written to an
    # independent copy instead of a filtered slice of ``df`` (the in-place
    # write on the slice triggered SettingWithCopyWarning).
    return df[df["Tm"] == "TOT"].assign(Tm=last_team)
# Reduce each (Player, Year) group to a single row in both tables.
player_season = ["Player", "Year"]
advanced_stats = advanced_stats.groupby(by=player_season).apply(single_row)
players = players.groupby(by=player_season).apply(single_row)

In [17]:
# Strip the two outermost index levels in one call (presumably the group keys
# added by groupby().apply()).
advanced_stats.index = advanced_stats.index.droplevel([0, 1])

In [18]:
# Strip the two outermost index levels in one call (presumably the group keys
# added by groupby().apply()).
players.index = players.index.droplevel([0, 1])

In [19]:
# Keep only the join keys (Player, Year) and the advanced metrics.
advanced_columns = [
    'Player', 'Year',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]
advanced_stats = advanced_stats[advanced_columns]

In [20]:
# Outer-join per-game and advanced stats on player and season.
stats = pd.merge(players, advanced_stats, how="outer", on=["Player", "Year"])

**Team standings**

In [21]:
# URL of the 2024 season standings page.
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html"

In [22]:
# Download the 2024 standings page and keep a raw copy on disk.
url = team_stats_url

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx: this file is the only source the standings parse
# reads, so an error page must not be saved in its place.
data.raise_for_status()
# Wait before continuing (presumably to space out requests to the site).
time.sleep(15)

with open("../team_standings/2024.html", "w", encoding = 'utf-8') as f:
    f.write(data.text)

In [23]:
# Parse the Eastern and Western division standings tables from the saved page.
dfs = []
with open("../team_standings/2024.html", encoding = 'utf-8') as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser')
# Only the first class="thead" row is removed, exactly as before: the rows that
# remain are filtered out later by the "Division" check on the W column, which
# relies on W still being a string column. The None guard just avoids an
# AttributeError when the page has no such row.
first_header = soup.find('tr', class_="thead")
if first_header is not None:
    first_header.decompose()

e_table = soup.find_all(id="divs_standings_E")[0]
# read_html takes a file-like object; passing a raw HTML string is deprecated.
e_df = pd.read_html(StringIO(str(e_table)))[0]
e_df["Year"] = 2024
# pop + assign moves the conference-named column to the end under the name "Team".
e_df["Team"] = e_df.pop("Eastern Conference")
dfs.append(e_df)

w_table = soup.find_all(id="divs_standings_W")[0]
w_df = pd.read_html(StringIO(str(w_table)))[0]
w_df["Year"] = 2024
w_df["Team"] = w_df.pop("Western Conference")
dfs.append(w_df)

In [24]:
# Stack the conference tables collected in `dfs` into one frame.
team_standings = pd.concat(objs=dfs)

In [25]:
# Clean team names: remove a parenthesised number such as "(1)", trim
# whitespace, then remove the literal "*" character.
# regex=True must be explicit: since pandas 2.0 str.replace defaults to
# regex=False, which would treat the pattern as literal text and remove nothing.
team_standings["Team"] = (
    team_standings["Team"]
    .str.replace(r"\(\d+\)", "", regex=True)
    .str.strip()
    .str.replace("*", "", regex=False)
)

  team_standings["Team"] = team_standings["Team"].str.replace(r"\(\d+\)", "").str.strip()


In [26]:
# Write the cleaned standings to CSV (the index is written too).
team_standings.to_csv(path_or_buf="../team_standings/2024_team_standings.csv")

**Adding team standings to player dataset**

In [27]:
# Reload the standings and drop rows whose W value contains "Division".
teams = pd.read_csv("../team_standings/2024_team_standings.csv")
# astype(str) keeps the filter working when W was parsed as numeric (no such
# rows in the file) or contains missing values; the .str accessor would fail
# on a numeric column and `~` would fail on a mask containing NaN.
is_division_row = teams["W"].astype(str).str.contains("Division", regex=False)
teams = teams[~is_division_row]

In [28]:
# Build the abbreviation -> team name lookup from the nicknames file.
nicknames = {}

with open("../team_standings/nicknames.txt", encoding="utf-8") as f:
    next(f, None)  # the first line is skipped, as before (presumably a header)
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. at end of file) used to raise ValueError on
            # unpacking; skip it instead.
            continue
        abbrev, name = line.split(",")
        nicknames[abbrev] = name

In [29]:
# Look up each row's "Tm" value in `nicknames` and store the result in "Team";
# values with no entry in the dict become NaN.
stats["Team"] = stats["Tm"].map(nicknames)

In [30]:
# Outer-join the standings onto the player rows by team name and season.
stats = pd.merge(stats, teams, how="outer", on=["Team", "Year"])

In [31]:
# Remove the "Unnamed: 0" column that came in with the standings CSV.
stats = stats.drop(columns=["Unnamed: 0"])

In [32]:
def _numeric_if_possible(column):
    """Return ``column`` converted by pd.to_numeric, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Same result as to_numeric(errors="ignore"), without the deprecated option:
# columns that cannot be parsed as numbers are left as they are.
stats = stats.apply(_numeric_if_possible)

In [33]:
# Replace the em dash in GB with "0" so the column can be converted to numbers.
# regex=False makes the literal match explicit, in line with the other literal
# replacements in this notebook and independent of the pandas default.
stats["GB"] = stats["GB"].str.replace("—", "0", regex=False)

In [34]:
# Convert GB to a numeric dtype; pd.to_numeric raises if a value cannot be parsed.
stats["GB"] = pd.to_numeric(stats["GB"])

In [35]:
# Write the combined player and team dataset to CSV (the index is written too).
stats.to_csv(path_or_buf="../2024_stats.csv")