In [1]:
# Seasons to scrape: 2004 through 2024 inclusive (the range end is exclusive).
years = [*range(2004, 2025)]

In [2]:
# URL template for a season's awards page; `{}` is filled in with the year.
url_start = (
    "https://www.basketball-reference.com/awards/awards_{}.html"
)

In [3]:
import time

import requests

# Fetch the awards page for every season in `years` and cache the raw HTML
# as mvp/<year>.html so later cells can parse it from disk.
for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    # Fail loudly on an HTTP error (e.g. a rate-limit response) instead of
    # saving an error page that the parsing cells would later choke on.
    data.raise_for_status()

    with open("mvp/{}.html".format(year), "w+") as f:
        f.write(data.text)

    # Pause between requests so the loop does not hammer the server.
    # NOTE(review): the 4 s delay assumes a limit of roughly 20 requests per
    # minute — confirm against the site's current crawling policy.
    time.sleep(4)

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Read the saved 2004 awards page from disk into a string.
with open("mvp/2004.html", "r") as html_file:
    page = html_file.read()

In [6]:
# Parse the page text with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page, features="html.parser")

In [7]:
# Drop the first <tr class="over_header"> row from the parsed document.
# `find` returns None when nothing matches (for instance when this cell is run
# a second time on the same `soup`), so guard the call to keep the cell
# safely re-runnable.
over_header = soup.find("tr", class_="over_header")
if over_header is not None:
    over_header.decompose()

In [8]:
# Collect every element whose id attribute is "mvp" (returned as a result set).
mvp_table = soup.find_all(attrs={"id": "mvp"})

In [9]:
import pandas as pd
from io import StringIO

In [10]:
# Serialise the matched markup back to HTML text, let pandas parse the tables
# it contains, and keep the first one.
mvp_html = StringIO(str(mvp_table))
mvp_frames = pd.read_html(mvp_html)
mvp_2004 = mvp_frames[0]

In [11]:
# Show only the first row as a quick sanity check of the parsed columns.
mvp_2004.iloc[:1]

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Kevin Garnett,27,MIN,120,1219,1230,0.991,82,39.4,24.2,13.9,5.0,1.5,2.2,0.499,0.256,0.791,18.3,0.272


In [12]:
# Tag every row with the season it belongs to.
mvp_2004.loc[:, "Year"] = 2004

In [13]:
# Parse the MVP voting table out of each cached awards page and collect one
# DataFrame per season in `dfs`.
dfs = []
for year in years:
    with open("mvp/{}.html".format(year)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Remove the first <tr class="over_header"> row before handing the table
    # to pandas. `find` returns None when no such row exists, so guard it.
    over_header = soup.find("tr", class_="over_header")
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        # Name the offending file instead of failing with an opaque error
        # further down (e.g. when an error page was saved during download).
        raise ValueError(
            "No element with id 'mvp' found in mvp/{}.html".format(year)
        )

    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["Year"] = year

    dfs.append(mvp)

In [14]:
# Stack the per-season frames row-wise; each frame keeps its own row labels.
mvps = pd.concat(dfs, axis=0)

In [15]:
# Peek at the last five rows of the combined frame.
mvps.iloc[-5:]

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
4,5,Jalen Brunson,27,NYK,0,142,990,0.143,77,35.4,...,3.6,6.7,0.9,0.2,0.479,0.401,0.847,11.2,0.198,2024
5,6,Jayson Tatum,25,BOS,0,86,990,0.087,74,35.7,...,8.1,4.9,1.0,0.6,0.471,0.376,0.833,10.4,0.189,2024
6,7,Anthony Edwards,22,MIN,0,18,990,0.018,79,35.1,...,5.4,5.1,1.3,0.5,0.461,0.357,0.836,7.5,0.13,2024
7,8,Domantas Sabonis,27,SAC,0,3,990,0.003,82,35.7,...,13.7,8.2,0.9,0.6,0.594,0.379,0.704,12.6,0.206,2024
8,9,Kevin Durant,35,PHO,0,1,990,0.001,75,37.2,...,6.6,5.0,0.9,1.2,0.523,0.413,0.856,8.3,0.142,2024


In [16]:
# Persist the combined MVP voting data (row labels are written as well).
mvps.to_csv(path_or_buf="mvps.csv")

In [17]:
# URL template for a season's per-game player stats page; `{}` is the year.
player_stats_url = (
    "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
)

# Download a single season (2004) with plain `requests` and save it to disk.
url = player_stats_url.format(2004)
data = requests.get(url)
# Surface HTTP errors here rather than writing an error page to disk.
data.raise_for_status()
with open("player/2004.html", "w+") as f:
    f.write(data.text)

In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

In [19]:
# Configure and launch a Chrome session driven by Selenium.
options = Options()
options.add_argument("start-maximized")

# webdriver_manager's install() supplies the value passed to Service.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(
    service=Service(chromedriver_path),
    options=options,
)

In [None]:
# Load each season's per-game page in the browser and save the resulting HTML
# to player/<year>.html.
try:
    for year in years:
        url = player_stats_url.format(year)

        driver.get(url)
        # Scroll down the page, then wait briefly before grabbing the source.
        driver.execute_script("window.scrollTo(1,10000)")
        time.sleep(2)

        html = driver.page_source
        with open("player/{}.html".format(year), "w+") as f:
            f.write(html)
finally:
    # Always shut the browser down, even if a page load fails, so the Chrome
    # session is not leaked. No later cell visible here uses `driver`; re-run
    # the driver setup cell before re-running this one.
    driver.quit()

In [21]:
# Parse the per-game stats table out of each cached player page and collect
# one DataFrame per season in `dfs`.
dfs = []
for year in years:
    with open("player/{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Remove the first <tr class="thead"> row in the document. `find` returns
    # None when no such row exists, so guard the call instead of crashing.
    # NOTE(review): only the first matching row is removed; any further
    # `thead` rows stay in the parsed frame — confirm downstream cleaning
    # handles them.
    header_row = soup.find("tr", class_="thead")
    if header_row is not None:
        header_row.decompose()

    player_table = soup.find(id="per_game_stats")
    if player_table is None:
        # Name the offending file rather than failing with an opaque error.
        raise ValueError(
            "No element with id 'per_game_stats' found in player/{}.html".format(year)
        )

    player = pd.read_html(StringIO(str(player_table)))[0]
    player["Year"] = year
    dfs.append(player)

In [22]:
# Stack the per-season player frames row-wise into a single frame.
players = pd.concat(dfs, axis=0)

In [23]:
# Persist the combined player stats (row labels are written as well).
players.to_csv(path_or_buf="players.csv")

In [24]:
# URL template for a season's standings page; `{}` is filled in with the year.
team_stats_url = (
    "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"
)

In [25]:
# Fetch the standings page for every season in `years` and cache the raw HTML
# as team/<year>.html so the parsing cell can read it from disk.
for year in years:
    url = team_stats_url.format(year)

    data = requests.get(url)
    # Fail loudly on an HTTP error (e.g. a rate-limit response) instead of
    # saving an error page that the parsing cell would later choke on.
    data.raise_for_status()

    with open("team/{}.html".format(year), "w+") as f:
        f.write(data.text)

    # Pause between requests so the loop does not hammer the server.
    # NOTE(review): the 4 s delay assumes a limit of roughly 20 requests per
    # minute — confirm against the site's current crawling policy.
    time.sleep(4)

In [26]:
# Parse the Eastern and Western conference standings tables out of each cached
# page and collect one DataFrame per (season, conference) in `dfs`,
# East first and then West for every year.
dfs = []
for year in years:
    with open("team/{}.html".format(year)) as f:
        page = f.read()

    # Parse once per season. Removing the first <tr class="thead"> row of the
    # document a single time leaves both tables in the same state as
    # re-parsing and removing it again for the second conference would.
    # NOTE(review): only the first matching row is removed; any further
    # `thead` rows stay in the parsed frames — confirm downstream cleaning
    # handles them.
    soup = BeautifulSoup(page, "html.parser")
    header_row = soup.find("tr", class_="thead")
    if header_row is not None:
        header_row.decompose()

    for table_id, conference_col in (
        ("divs_standings_E", "Eastern Conference"),
        ("divs_standings_W", "Western Conference"),
    ):
        team_table = soup.find(id=table_id)
        if team_table is None:
            # Name the offending file rather than failing with an opaque error.
            raise ValueError(
                "No element with id '{}' found in team/{}.html".format(table_id, year)
            )

        team = pd.read_html(StringIO(str(team_table)))[0]
        team["Year"] = year
        # Copy the conference-named column to a common "Team" column (added
        # last), then drop the original so both conferences share one schema.
        team["Team"] = team[conference_col]
        del team[conference_col]
        dfs.append(team)

In [27]:
# Stack the per-season, per-conference standings frames row-wise.
teams = pd.concat(dfs, axis=0)

In [28]:
# Persist the combined standings data (row labels are written as well).
teams.to_csv(path_or_buf="teams.csv")