### Importing necessary packages and libraries

In [5]:
from selenium import webdriver 
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from bs4 import Comment
from pprint import pprint
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import requests


### Using Requests and BeautifulSoup to scrape NBA data for players

In [2]:
# Seasons to collect: 2000 through 2023 inclusive.
years = [*range(2000, 2024)]

# URL templates; the "{}" placeholder is filled with a season year via str.format.
standings_url = "https://www.basketball-reference.com/leagues/NBA_{}.html"
advanced_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"
ppg_url = "https://www.basketball-reference.com/leagues/NBA_{}_ppg.html"


In [47]:
# Download the awards page of every season in `years` and cache the raw HTML
# on disk so the parsing cells below can work offline.
for year in years:
    base_url = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    data = requests.get(base_url, timeout=30)
    # Fail loudly instead of caching an error page (e.g. a rate-limit response)
    # that the parsing cells would later choke on.
    data.raise_for_status()

    # Use the same 'MVP_data' spelling the parsing cells read from; the previous
    # 'MVP_Data' only worked on case-insensitive filesystems.
    with open(f'MVP_data/{year}.html', 'w+', encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests (same delay stats_scraping uses) to avoid hammering the site.
    time.sleep(5)

### Extracting MVP candidates from 2000-2023

In [44]:
# Open each cached awards page, parse out the table with id 'mvp', tag it with
# its season, and combine all seasons into a single CSV.
mvp_list = []
for year in years:
    # The pages were saved as UTF-8, so read them back as UTF-8; relying on the
    # platform default encoding can garble non-ASCII player names.
    with open(f"MVP_data/{year}.html", encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Drop the first 'over_header' row in the page before handing the table to
    # read_html. decompose() returns None, so there is nothing worth keeping.
    soup.find("tr", class_='over_header').decompose()
    mvp_table = soup.find(id='mvp')
    mvp_df = pd.read_html(str(mvp_table))[0]
    mvp_df['Year'] = year
    mvp_list.append(mvp_df)

# reset_index returns a new frame; the result must be assigned, otherwise the
# per-season row numbers (0..n repeated for every year) end up in the CSV.
mvp_data = pd.concat(mvp_list).reset_index(drop=True)
mvp_data.to_csv('MVP_data/mvp_awards.csv')

### Extracting ROY candidates from 2000-2023

In [26]:
# Open each cached awards page, parse out the table with id 'roy', tag it with
# its season, and combine all seasons into a single CSV.
roy_list = []
for year in years:
    # The pages were saved as UTF-8, so read them back as UTF-8.
    with open(f"MVP_data/{year}.html", encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # NOTE(review): this removes the first 'over_header' row found anywhere in
    # the page, which may belong to a different table; the ROY table is still
    # read with header=1 below. Kept as in the original; confirm it is needed.
    soup.find("tr", class_='over_header').decompose()
    roy_table = soup.find(id='roy')
    # header=1: use the second header row as the column names.
    roy_df = pd.read_html(str(roy_table), header=1)[0]
    roy_df['Year'] = year
    roy_list.append(roy_df)

# reset_index returns a new frame; assign it so the CSV gets a clean 0..N index.
roy_data = pd.concat(roy_list).reset_index(drop=True)
roy_data.to_csv('MVP_data/roy_awards.csv')

### Extracting DPOY candidates from 2000-2023

In [10]:
# Open each cached awards page, recover the tables that the page ships inside
# HTML comments, parse out the table with id 'dpoy', and combine all seasons
# into a single CSV.
dpoy_list = []
for year in years:
    # The pages were saved as UTF-8, so read them back as UTF-8.
    with open(f'MVP_data/{year}.html', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Collect every HTML comment that contains a <table> and re-parse the
    # concatenated comment text as markup.
    commented_tables = soup.find_all(
        string=lambda text: isinstance(text, Comment) and '<table' in text
    )
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soupTables = BeautifulSoup(''.join(commented_tables), "html.parser")
    # Drop the first 'over_header' row among the commented tables before read_html.
    soupTables.find("tr", class_="over_header").decompose()
    dpoy_table = soupTables.find('table', id="dpoy")
    dpoy_df = pd.read_html(str(dpoy_table))[0]
    dpoy_df['Year'] = year
    dpoy_list.append(dpoy_df)

# reset_index returns a new frame; assign it so the CSV gets a clean 0..N index.
dpoy_data = pd.concat(dpoy_list).reset_index(drop=True)
dpoy_data.to_csv('MVP_data/dpoy_awards.csv')

### Extracting SMOY candidates from 2000-2023

In [43]:
# Open each cached awards page, recover the tables that the page ships inside
# HTML comments, parse out the table with id 'smoy', and combine all seasons
# into a single CSV.
smoy_list = []
for year in years:
    # The pages were saved as UTF-8, so read them back as UTF-8.
    with open(f'MVP_data/{year}.html', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Collect every HTML comment that contains a <table> and re-parse the
    # concatenated comment text as markup.
    commented_tables = soup.find_all(
        string=lambda text: isinstance(text, Comment) and '<table' in text
    )
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soupTables = BeautifulSoup(''.join(commented_tables), "html.parser")
    smoy_table = soupTables.find('table', id="smoy")
    # header=1: use the second header row as the column names.
    smoy_df = pd.read_html(str(smoy_table), header=1)[0]
    smoy_df['Year'] = year
    smoy_list.append(smoy_df)

# reset_index returns a new frame; assign it so the CSV gets a clean 0..N index.
smoy_data = pd.concat(smoy_list).reset_index(drop=True)
smoy_data.to_csv('MVP_data/smoy_awards.csv')

### Using Selenium to extract player PPG, Advanced, and Team Record stats

In [3]:
def stats_scraping(years, url, metric=""):
    '''
    Save the browser-rendered HTML of one page per season to disk.

    A Chrome session is opened via Selenium, each season's page is loaded, and
    the resulting page source is written to 'NBA_Stats/{metric}_{year}.html'.

    Parameters
    ----------
    years : iterable
        Season identifiers (the notebook passes ints such as 2000); each one
        is substituted into `url` and into the output file name.
    url : str
        URL template with a single '{}' placeholder for the season.
    metric : str, optional
        Prefix of the output file name (e.g. "ppg").

    Returns
    -------
    None
    '''

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        for year in years:
            driver.get(url.format(year))
            # Fixed wait to let the page finish rendering (and to space out requests).
            time.sleep(5)
            pagesource = driver.page_source

            with open(f'NBA_Stats/{metric}_{year}.html', 'w', encoding="utf-8") as f:
                f.write(pagesource)
            print(f"NBA Season {year} successfully saved.")
    finally:
        # Always close the browser, even if a page load or file write fails;
        # otherwise every call leaves a Chrome process running.
        driver.quit()
    

### Scraping Per Game stats

In [114]:
# Save the per-game stats page of every season as NBA_Stats/ppg_<year>.html
stats_scraping(years, ppg_url, metric="ppg")

NBA Season 2000 successfully saved.
NBA Season 2001 successfully saved.
NBA Season 2002 successfully saved.
NBA Season 2003 successfully saved.
NBA Season 2004 successfully saved.
NBA Season 2005 successfully saved.
NBA Season 2006 successfully saved.
NBA Season 2007 successfully saved.
NBA Season 2008 successfully saved.
NBA Season 2009 successfully saved.
NBA Season 2010 successfully saved.
NBA Season 2011 successfully saved.
NBA Season 2012 successfully saved.
NBA Season 2013 successfully saved.
NBA Season 2014 successfully saved.
NBA Season 2015 successfully saved.
NBA Season 2016 successfully saved.
NBA Season 2017 successfully saved.
NBA Season 2018 successfully saved.
NBA Season 2019 successfully saved.
NBA Season 2020 successfully saved.
NBA Season 2021 successfully saved.
NBA Season 2022 successfully saved.
NBA Season 2023 successfully saved.


### Scraping Advanced stats

In [6]:
# Save the advanced stats page of every season as NBA_Stats/advanced_stats_<year>.html
stats_scraping(years, advanced_stats_url, metric="advanced_stats")

NBA Season 2000 successfully saved.
NBA Season 2001 successfully saved.
NBA Season 2002 successfully saved.
NBA Season 2003 successfully saved.
NBA Season 2004 successfully saved.
NBA Season 2005 successfully saved.
NBA Season 2006 successfully saved.
NBA Season 2007 successfully saved.
NBA Season 2008 successfully saved.
NBA Season 2009 successfully saved.
NBA Season 2010 successfully saved.
NBA Season 2011 successfully saved.
NBA Season 2012 successfully saved.
NBA Season 2013 successfully saved.
NBA Season 2014 successfully saved.
NBA Season 2015 successfully saved.
NBA Season 2016 successfully saved.
NBA Season 2017 successfully saved.
NBA Season 2018 successfully saved.
NBA Season 2019 successfully saved.
NBA Season 2020 successfully saved.
NBA Season 2021 successfully saved.
NBA Season 2022 successfully saved.
NBA Season 2023 successfully saved.


### Scraping Team Standings stats

In [4]:
# Save the season summary page of every season as NBA_Stats/team_standings_<year>.html
stats_scraping(years, standings_url, metric="team_standings")

NBA Season 2000 successfully saved.
NBA Season 2001 successfully saved.
NBA Season 2002 successfully saved.
NBA Season 2003 successfully saved.
NBA Season 2004 successfully saved.
NBA Season 2005 successfully saved.
NBA Season 2006 successfully saved.
NBA Season 2007 successfully saved.
NBA Season 2008 successfully saved.
NBA Season 2009 successfully saved.
NBA Season 2010 successfully saved.
NBA Season 2011 successfully saved.
NBA Season 2012 successfully saved.
NBA Season 2013 successfully saved.
NBA Season 2014 successfully saved.
NBA Season 2015 successfully saved.
NBA Season 2016 successfully saved.
NBA Season 2017 successfully saved.
NBA Season 2018 successfully saved.
NBA Season 2019 successfully saved.
NBA Season 2020 successfully saved.
NBA Season 2021 successfully saved.
NBA Season 2022 successfully saved.
NBA Season 2023 successfully saved.


In [None]:
# Parse the saved per-game page of each season, pull out the table with id
# 'per_game_stats', tag it with its season, and combine all seasons into one CSV.
ppg_list = []
for year in years:
    # stats_scraping wrote these files as UTF-8, so read them back as UTF-8.
    with open(f'NBA_Stats/ppg_{year}.html', 'r', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove every row with class 'thead' so read_html does not turn them
    # into data rows.
    for item in soup.find_all("tr", class_="thead"):
        item.decompose()
    ppg_table = soup.find(id="per_game_stats")
    ppg_df = pd.read_html(str(ppg_table))[0]
    ppg_df['Year'] = year
    ppg_list.append(ppg_df)

# reset_index returns a new frame; assign it so the CSV gets a clean 0..N index.
ppg_data = pd.concat(ppg_list).reset_index(drop=True)
ppg_data.to_csv('NBA_Stats/ppg_data.csv')

In [14]:
# Parse the saved advanced-stats page of each season, pull out the table with
# id 'advanced_stats', tag it with its season, and combine all seasons into one CSV.
advanced_stats_list = []
for year in years:
    # stats_scraping wrote these files as UTF-8, so read them back as UTF-8.
    with open(f'NBA_Stats/advanced_stats_{year}.html', 'r', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove every row with class 'thead' so read_html does not turn them
    # into data rows.
    for item in soup.find_all("tr", class_="thead"):
        item.decompose()
    advanced_stats_table = soup.find(id="advanced_stats")
    advanced_stats_df = pd.read_html(str(advanced_stats_table))[0]
    advanced_stats_df['Year'] = year
    advanced_stats_list.append(advanced_stats_df)

# reset_index returns a new frame; assign it so the CSV gets a clean 0..N index.
advanced_stats_data = pd.concat(advanced_stats_list).reset_index(drop=True)
advanced_stats_data.to_csv('NBA_Stats/advanced_stats_data.csv')

In [6]:
# Exploration: load a single season (2000) to work out how to parse the
# standings page before looping over every year.
# The file was written as UTF-8 by stats_scraping, so read it back as UTF-8.
with open('NBA_Stats/team_standings_2000.html', 'r', encoding="utf-8", errors="ignore") as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
    
    

In [9]:
# Remove the first 'over_header' row from the parsed page (modifies `soup` in place).
over_header_row = soup.find("tr", class_='over_header')
over_header_row.decompose()


In [10]:
# Isolate the element whose id is 'advanced-team'.
team_standings_table = soup.find(attrs={"id": "advanced-team"})

In [15]:
# read_html returns a list of frames; display the first (and only expected) one.
standings_frames = pd.read_html(str(team_standings_table))
standings_frames[0]

Unnamed: 0,Rk,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,FT/FGA,Unnamed: 22,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Unnamed: 27,Arena,Attend.,Attend./G
0,1.0,Los Angeles Lakers*,29.2,67.0,15.0,64,18,8.55,-0.14,8.41,...,0.241,,0.443,13.4,73.1,0.222,,STAPLES Center,771420,18815
1,2.0,Portland Trail Blazers*,29.6,59.0,23.0,59,23,6.4,-0.04,6.36,...,0.24,,0.461,13.8,72.4,0.217,,Rose Garden Arena,835078,20368
2,3.0,San Antonio Spurs*,30.9,53.0,29.0,58,24,5.94,-0.02,5.92,...,0.258,,0.451,13.5,73.0,0.188,,Alamodome,884450,21694
3,4.0,Phoenix Suns*,28.6,53.0,29.0,56,26,5.22,0.02,5.24,...,0.217,,0.454,15.7,70.5,0.245,,America West Arena,773115,18856
4,5.0,Utah Jazz*,31.5,55.0,27.0,54,28,4.46,0.05,4.52,...,0.26,,0.477,15.0,73.2,0.256,,Delta Center,801268,19543
5,6.0,Indiana Pacers*,30.4,56.0,26.0,54,28,4.6,-0.45,4.15,...,0.245,,0.469,12.6,71.5,0.197,,Conseco Fieldhouse,752145,18345
6,7.0,Miami Heat*,29.2,52.0,30.0,50,32,3.11,-0.36,2.75,...,0.208,,0.453,13.3,72.9,0.229,,AmericanAirlines Arena,706725,18374
7,8.0,Sacramento Kings*,27.5,44.0,38.0,49,33,2.91,0.12,3.04,...,0.209,,0.479,15.1,69.7,0.198,,ARCO Arena (II),720033,17562
8,9.0,Charlotte Hornets*,28.7,49.0,33.0,49,33,2.67,-0.34,2.33,...,0.285,,0.478,14.6,73.2,0.198,,Charlotte Coliseum,732827,17874
9,10.0,Minnesota Timberwolves*,26.6,50.0,32.0,48,34,2.52,0.14,2.67,...,0.2,,0.474,14.0,73.0,0.25,,Target Center,690012,16830


In [38]:
# Parse the saved season summary page of each season, pull out the table with
# id 'advanced-team', tag it with its season, then clean and export the
# combined frame.
team_stats_list = []
for year in years:
    # stats_scraping wrote these files as UTF-8, so read them back as UTF-8.
    with open(f'NBA_Stats/team_standings_{year}.html', 'r', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Drop the first 'over_header' row in the page before read_html.
    soup.find("tr", class_="over_header").decompose()
    team_standings_table = soup.find(id="advanced-team")
    team_standings_df = pd.read_html(str(team_standings_table))[0]
    team_standings_df['Year'] = year
    team_stats_list.append(team_standings_df)

team_standings_data = pd.concat(team_stats_list)
# Discard the per-season "League Average" summary row.
team_standings_data = team_standings_data[team_standings_data['Team'] != "League Average"]
# Keep only the named columns (this drops the 'Unnamed: N' spacer columns).
# .copy() makes the result an independent frame so the column assignments
# below do not operate on a slice of the concatenated frame.
team_standings_data = team_standings_data[['Rk', 'Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORtg',
       'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'eFG%',
       'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1', 'DRB%',
       'FT/FGA.1', 'Arena', 'Attend.', 'Attend./G', 'Year']].copy()

# Wins and losses are parsed as floats; store them as integers.
team_standings_data['W'] = team_standings_data['W'].astype('int64')
# Convert L from its own column. The previous code assigned W here, which
# overwrote every team's losses with its wins.
team_standings_data['L'] = team_standings_data['L'].astype('int64')
team_standings_data = team_standings_data.reset_index(drop=True)

team_standings_data.to_csv('NBA_Stats/team_standings.csv')