### Importing necessary packages and libraries

In [57]:
import time
from io import StringIO
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from bs4 import Comment
from selenium import webdriver
from selenium.webdriver.common.by import By


### Using Requests and BeautifulSoup to scrape NBA award data for players

In [119]:
# Season years to scrape.
# NOTE(review): headings and saved outputs in this notebook refer to 2000-2023,
# but this range only covers 2000-2002 — confirm which span is intended.
years = list(range(2000, 2003))

# URL templates; `{}` is filled with a season year via str.format().
# The per-game template now uses the `_per_game.html` suffix, matching the page
# the Selenium per-game cell actually downloads (it previously ended in `_ppg.html`).
ppg_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
advanced_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"


In [47]:
# Download each season's awards page and save the raw HTML so the parsing
# cells below can work offline from the saved files.
# The directory is spelled `MVP_data/` to match every cell that reads these
# files; the previous `MVP_Data/` spelling only worked on case-insensitive
# filesystems.
Path("MVP_data").mkdir(parents=True, exist_ok=True)

for year in years:
    base_url = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    data = requests.get(base_url, timeout=30)
    # Fail loudly rather than saving an HTTP error page as if it were data.
    data.raise_for_status()

    with open(f'MVP_data/{year}.html', 'w', encoding="utf-8") as f:
        f.write(data.text)

### Extracting MVP candidates from 2000-2023

In [44]:
# Parse the table with id "mvp" out of each saved awards page, tag it with the
# season year, and combine all seasons into a single DataFrame / CSV.
mvp_list = []
for year in years:
    # Read with the same encoding the download cell used when writing the file.
    with open(f"MVP_data/{year}.html", encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove the first `over_header` row on the page before parsing
    # (presumably the grouping row above the MVP table's real column names).
    soup.find("tr", class_='over_header').decompose()
    mvp_table = soup.find(id='mvp')
    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_df['Year'] = year
    mvp_list.append(mvp_df)

# ignore_index yields a clean 0..n-1 index. The earlier reset_index(drop=True)
# call discarded its result, so repeated per-season indices ended up in the CSV.
mvp_data = pd.concat(mvp_list, ignore_index=True)
mvp_data.to_csv('MVP_data/mvp_awards.csv')

### Extracting ROY candidates from 2000-2023

In [26]:
# Parse the table with id "roy" out of each saved awards page, tag it with the
# season year, and combine all seasons into a single DataFrame / CSV.
roy_list = []
for year in years:
    # Read with the same encoding the download cell used when writing the file.
    with open(f"MVP_data/{year}.html", encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # NOTE(review): this removes the first `over_header` row on the whole page,
    # which may not belong to the ROY table; header=1 below takes the table's
    # second header row as column names regardless. Kept as in the original.
    soup.find("tr", class_='over_header').decompose()
    roy_table = soup.find(id='roy')
    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    roy_df = pd.read_html(StringIO(str(roy_table)), header=1)[0]
    roy_df['Year'] = year
    roy_list.append(roy_df)

# ignore_index yields a clean 0..n-1 index. The earlier reset_index(drop=True)
# call discarded its result, so repeated per-season indices ended up in the CSV.
roy_data = pd.concat(roy_list, ignore_index=True)
roy_data.to_csv('MVP_data/roy_awards.csv')

### Extracting DPOY candidates from 2000-2023

In [10]:
# Parse the table with id "dpoy" out of each saved awards page, tag it with the
# season year, and combine all seasons into a single DataFrame / CSV.
dpoy_list = []
for year in years:
    # Read with the same encoding the download cell used when writing the file.
    with open(f'MVP_data/{year}.html', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # This table is looked up inside HTML comments: collect every comment that
    # contains a <table> and re-parse the joined text as its own document.
    commented_tables = soup.find_all(
        string=lambda text: isinstance(text, Comment) and '<table' in text
    )
    # Name the parser explicitly (consistent with the call above) so the result
    # does not depend on which parsers happen to be installed.
    soupTables = BeautifulSoup(''.join(commented_tables), "html.parser")
    # Remove the first `over_header` row among the commented tables
    # (presumably the grouping row above the DPOY table's column names).
    soupTables.find("tr", class_="over_header").decompose()
    dpoy_table = soupTables.find('table', id="dpoy")
    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    dpoy_df = pd.read_html(StringIO(str(dpoy_table)))[0]
    dpoy_df['Year'] = year
    dpoy_list.append(dpoy_df)

# ignore_index yields a clean 0..n-1 index. The earlier reset_index(drop=True)
# call discarded its result, so repeated per-season indices ended up in the CSV.
dpoy_data = pd.concat(dpoy_list, ignore_index=True)
dpoy_data.to_csv('MVP_data/dpoy_awards.csv')

### Extracting SMOY candidates from 2000-2023

In [43]:
# Parse the table with id "smoy" out of each saved awards page, tag it with the
# season year, and combine all seasons into a single DataFrame / CSV.
smoy_list = []
for year in years:
    # Read with the same encoding the download cell used when writing the file.
    with open(f'MVP_data/{year}.html', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # This table is looked up inside HTML comments: collect every comment that
    # contains a <table> and re-parse the joined text as its own document.
    commented_tables = soup.find_all(
        string=lambda text: isinstance(text, Comment) and '<table' in text
    )
    # Name the parser explicitly (consistent with the call above) so the result
    # does not depend on which parsers happen to be installed.
    soupTables = BeautifulSoup(''.join(commented_tables), "html.parser")
    smoy_table = soupTables.find('table', id="smoy")
    # header=1 takes the table's second header row as the column names.
    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    smoy_df = pd.read_html(StringIO(str(smoy_table)), header=1)[0]
    smoy_df['Year'] = year
    smoy_list.append(smoy_df)

# ignore_index yields a clean 0..n-1 index. The earlier reset_index(drop=True)
# call discarded its result, so repeated per-season indices ended up in the CSV.
smoy_data = pd.concat(smoy_list, ignore_index=True)
smoy_data.to_csv('MVP_data/smoy_awards.csv')

### Using Selenium to extract player PPG, Advanced, and Team Record stats

In [114]:
# Load each season's per-game stats page in Chrome and save the page source
# to NBA_Stats/ppg_<year>.html for the parsing cell below.
Path("NBA_Stats").mkdir(parents=True, exist_ok=True)

driver = webdriver.Chrome()
try:
    driver.maximize_window()

    for year in years:
        driver.get(f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html")
        # Fixed pause before grabbing the source, giving the page time to load.
        time.sleep(5)
        pagesource = driver.page_source

        with open(f'NBA_Stats/ppg_{year}.html', 'w', encoding="utf-8") as f:
            f.write(pagesource)
        print(f"NBA Season {year} successfully saved.")
finally:
    # Always shut the browser down, even if a page load or file write fails;
    # previously the Chrome process was left running after the cell finished.
    driver.quit()
    

NBA Season 2000 successfully saved.
NBA Season 2001 successfully saved.
NBA Season 2002 successfully saved.
NBA Season 2003 successfully saved.
NBA Season 2004 successfully saved.
NBA Season 2005 successfully saved.
NBA Season 2006 successfully saved.
NBA Season 2007 successfully saved.
NBA Season 2008 successfully saved.
NBA Season 2009 successfully saved.
NBA Season 2010 successfully saved.
NBA Season 2011 successfully saved.
NBA Season 2012 successfully saved.
NBA Season 2013 successfully saved.
NBA Season 2014 successfully saved.
NBA Season 2015 successfully saved.
NBA Season 2016 successfully saved.
NBA Season 2017 successfully saved.
NBA Season 2018 successfully saved.
NBA Season 2019 successfully saved.
NBA Season 2020 successfully saved.
NBA Season 2021 successfully saved.
NBA Season 2022 successfully saved.
NBA Season 2023 successfully saved.


In [115]:
# Parse the table with id "per_game_stats" out of each saved per-game page,
# tag it with the season year, and combine all seasons into one DataFrame / CSV.
ppg_list = []
for year in years:
    # Read with the same encoding the scraping cell used when writing the file.
    with open(f'NBA_Stats/ppg_{year}.html', 'r', encoding="utf-8", errors="ignore") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove every `thead`-class row so it is not parsed as a data row
    # (presumably these are header rows repeated inside the table body).
    for item in soup.find_all("tr", class_="thead"):
        item.decompose()
    ppg_table = soup.find(id="per_game_stats")
    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    ppg_df = pd.read_html(StringIO(str(ppg_table)))[0]
    ppg_df['Year'] = year
    ppg_list.append(ppg_df)

# ignore_index yields a clean 0..n-1 index. The earlier reset_index(drop=True)
# call discarded its result, so repeated per-season indices ended up in the CSV.
ppg_data = pd.concat(ppg_list, ignore_index=True)
ppg_data.to_csv('NBA_Stats/ppg_data.csv')


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Tariq Abdul-Wahad,SG,25,TOT,61,56,25.9,4.5,10.6,...,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4,2000
1,1,Tariq Abdul-Wahad,SG,25,ORL,46,46,26.2,4.8,11.2,...,1.7,3.5,5.2,1.6,1.2,0.3,1.9,2.5,12.2,2000
2,1,Tariq Abdul-Wahad,SG,25,DEN,15,10,24.9,3.4,8.7,...,1.6,1.9,3.5,1.7,0.4,0.8,1.3,2.1,8.9,2000
3,2,Shareef Abdur-Rahim,SF,23,VAN,82,82,39.3,7.2,15.6,...,2.7,7.4,10.1,3.3,1.1,1.1,3.0,3.0,20.3,2000
4,3,Cory Alexander,PG,26,DEN,29,2,11.3,1.0,3.4,...,0.3,1.2,1.4,2.0,0.8,0.1,1.0,1.3,2.8,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14418,535,Thaddeus Young,PF,34,TOR,54,9,14.7,2.0,3.7,...,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4,2023
14419,536,Trae Young,PG,24,ATL,73,73,34.8,8.2,19.0,...,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2,2023
14420,537,Omer Yurtseven,C,24,MIA,9,0,9.2,1.8,3.0,...,0.9,1.7,2.6,0.2,0.2,0.2,0.4,1.8,4.4,2023
14421,538,Cody Zeller,C,30,MIA,15,2,14.5,2.5,3.9,...,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5,2023


In [123]:
def stats_scraping(years, url, metric="", wait_seconds=5):
    '''
    Save the rendered page source of a stats page for each season.

    For every year, `url.format(year)` is opened in a Chrome WebDriver session
    and the resulting page source is written to `NBA_Stats/{metric}_{year}.html`.

    Parameters
    ----------
    years : iterable
        Season years; each one is substituted into `url`.
    url : str
        URL template containing a `{}` placeholder for the year.
    metric : str, optional
        Prefix for the output file name. With the default empty string the
        files are named `_{year}.html`.
    wait_seconds : int or float, optional
        Pause after each page load before reading the page source (default 5).

    Returns
    -------
    None
    '''
    Path("NBA_Stats").mkdir(parents=True, exist_ok=True)

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        for year in years:
            driver.get(url.format(year))
            # Fixed pause before grabbing the source, giving the page time to load.
            time.sleep(wait_seconds)
            pagesource = driver.page_source

            with open(f'NBA_Stats/{metric}_{year}.html', 'w', encoding="utf-8") as f:
                f.write(pagesource)
            print(f"NBA Season {year} successfully saved.")
    finally:
        # Always shut the browser down, even on failure; previously every call
        # left a Chrome process running.
        driver.quit()
    

In [124]:
# Save the advanced-stats page for every season as NBA_Stats/advanced_stats_<year>.html.
stats_scraping(
    years=years,
    url=advanced_stats_url,
    metric="advanced_stats",
)

NBA Season 2000 successfully saved.
NBA Season 2001 successfully saved.
NBA Season 2002 successfully saved.
