### Importing necessary packages and libraries

In [1]:
from selenium import webdriver 
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from bs4 import Comment
from pprint import pprint
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import requests


### Using Requests and BeautifulSoup to scrape NBA data for players

In [43]:
years = list(range(1984, 2024))
ppg_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
advanced_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"
standings_url = "https://www.basketball-reference.com/leagues/NBA_{}.html"


In [15]:
# Scrape MVP awards table from 2000-2023 and save as an HTML file
for year in years:
    base_url = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    time.sleep(10)
    data = requests.get(base_url)

    with open(f'MVP_Data/{year}.html', 'w+', encoding="utf-8") as f:
        f.write(data.text)
    

### Extracting MVP candidates from 1984-2023

In [44]:
# Open each HTML file, parse out the table, create pandas dataframe
mvp_list = []
for year in years:
    with open(f"MVP_data/{year}.html", errors="ignore") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")

        remove_header = soup.find("tr", class_='over_header').decompose()
        mvp_table = soup.find(id='mvp')
        mvp_df = pd.read_html(str(mvp_table))[0]
        mvp_df['Year'] = year
        mvp_list.append(mvp_df)

mvp_data = pd.concat(mvp_list)
mvp_data.reset_index(drop=True)
mvp_data.to_csv(f'MVP_data/mvp_awards.csv')

### Extracting ROY candidates from 2000-2023

In [26]:
roy_list = []
for year in years:
    with open(f"MVP_data/{year}.html", errors="ignore") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        remove_header = soup.find("tr", class_='over_header').decompose()
        roy_table = soup.find(id='roy')
        roy_df = pd.read_html(str(roy_table),header=1)[0]
        roy_df['Year'] = year
        roy_list.append(roy_df)

roy_data = pd.concat(roy_list)
roy_data.reset_index(drop=True)
roy_data.to_csv(f'MVP_data/roy_awards.csv')

### Extracting DPOY candidates from 2000-2023

In [45]:
# Open each HTML file, parse out the table, create pandas dataframe
dpoy_list = []
for year in years:
    with open(f'MVP_data/{year}.html', errors="ignore") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        soupTables = BeautifulSoup(''.join(soup.find_all(string=lambda text: isinstance(text, Comment) and '<table' in text)))
        soupTables.find("tr", class_="over_header").decompose()
        dpoy_table = soupTables.find('table', id="dpoy")
        dpoy_df = pd.read_html(str(dpoy_table))[0]
        dpoy_df['Year'] = year
        dpoy_list.append(dpoy_df)

dpoy_data = pd.concat(dpoy_list)
dpoy_data.reset_index(drop=True)
dpoy_data.to_csv('MVP_data/dpoy_awards.csv')

### Extracting SMOY candidates from 2000-2023

In [47]:
smoy_list = []
for year in years:
    with open(f'MVP_data/{year}.html', errors="ignore") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        soupTables = BeautifulSoup(''.join(soup.find_all(string=lambda text: isinstance(text, Comment) and '<table' in text)))
        smoy_table = soupTables.find('table', id="smoy")
        smoy_df = pd.read_html(str(smoy_table), header=1)[0]
        smoy_df['Year'] = year
        smoy_list.append(smoy_df)

smoy_data = pd.concat(smoy_list)
smoy_data.reset_index(drop=True)
smoy_data.to_csv('MVP_data/smoy_awards.csv')

### Using Selenium to extract player PPG, Advanced, and Team Record stats

In [46]:
def stats_scraping(years, url, metric=""):
    '''
    
    '''

    driver = webdriver.Chrome()
    driver.maximize_window()

    for year in years:
        driver.get(url.format(year))
        time.sleep(5)
        pagesource = driver.page_source 

        with open(f'NBA_Stats/{metric}_{year}.html', 'w', encoding="utf-8") as f:
            f.write(pagesource)
        print(f"NBA Season {year} successfully saved.")
    

### Scraping Per Game stats

In [26]:
stats_scraping(years, ppg_url, "ppg")

NBA Season 1984 successfully saved.
NBA Season 1985 successfully saved.
NBA Season 1986 successfully saved.
NBA Season 1987 successfully saved.
NBA Season 1988 successfully saved.
NBA Season 1989 successfully saved.
NBA Season 1990 successfully saved.
NBA Season 1991 successfully saved.
NBA Season 1992 successfully saved.
NBA Season 1993 successfully saved.
NBA Season 1994 successfully saved.
NBA Season 1995 successfully saved.
NBA Season 1996 successfully saved.
NBA Season 1997 successfully saved.
NBA Season 1998 successfully saved.
NBA Season 1999 successfully saved.


### Scraping Advanced stats

In [29]:
stats_scraping(years, advanced_stats_url, "advanced_stats")

NBA Season 1984 successfully saved.
NBA Season 1985 successfully saved.
NBA Season 1986 successfully saved.
NBA Season 1987 successfully saved.
NBA Season 1988 successfully saved.
NBA Season 1989 successfully saved.
NBA Season 1990 successfully saved.
NBA Season 1991 successfully saved.
NBA Season 1992 successfully saved.
NBA Season 1993 successfully saved.
NBA Season 1994 successfully saved.
NBA Season 1995 successfully saved.
NBA Season 1996 successfully saved.
NBA Season 1997 successfully saved.
NBA Season 1998 successfully saved.
NBA Season 1999 successfully saved.


### Scraping Team Standing stats

In [37]:
stats_scraping(years, standings_url, "team_standings")

NBA Season 1984 successfully saved.
NBA Season 1985 successfully saved.
NBA Season 1986 successfully saved.
NBA Season 1987 successfully saved.
NBA Season 1988 successfully saved.
NBA Season 1989 successfully saved.
NBA Season 1990 successfully saved.
NBA Season 1991 successfully saved.
NBA Season 1992 successfully saved.
NBA Season 1993 successfully saved.
NBA Season 1994 successfully saved.
NBA Season 1995 successfully saved.
NBA Season 1996 successfully saved.
NBA Season 1997 successfully saved.
NBA Season 1998 successfully saved.
NBA Season 1999 successfully saved.


In [48]:
ppg_list = []
for year in years:
    with open(f'NBA_Stats/ppg_{year}.html', 'r', errors="ignore") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        for item in soup.find_all("tr", class_="thead"):
            item.decompose()
        ppg_table = soup.find(id="per_game_stats")
        ppg_df = pd.read_html(str(ppg_table))[0]
        ppg_df['Year'] = year
        ppg_list.append(ppg_df)

ppg_data = pd.concat(ppg_list)
ppg_data.reset_index(drop=True)
ppg_data.to_csv('NBA_Stats/ppg_data.csv')

In [49]:
advanced_stats_list = []
for year in years:
    with open(f'NBA_Stats/advanced_stats_{year}.html', 'r', errors="ignore") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        for item in soup.find_all("tr", class_="thead"):
            item.decompose()
        advanced_stats_table = soup.find(id="advanced_stats")
        advanced_stats_df = pd.read_html(str(advanced_stats_table))[0]
        advanced_stats_df['Year'] = year
        advanced_stats_list.append(advanced_stats_df)

advanced_stats_data = pd.concat(advanced_stats_list)
advanced_stats_data.reset_index(drop=True)
advanced_stats_data.drop(columns=['Unnamed: 19','Unnamed: 24'], inplace=True)
advanced_stats_data.to_csv('NBA_Stats/advanced_stats_data.csv')

In [53]:
def rename_teams_long(col):
    if col == "Charlotte Bobcats":
        return "Charlotte Hornets"
    if col == "New Orleans Hornets" or col == "New Orleans/Oklahoma City Hornets":
        return "New Orleans Pelicans"
    if col == "New Jersey Nets":
        return "Brooklyn Nets"
    if col == "Washington Bullets":
        return "Washington Wizards"
    if col == "Kansas City Kings":
        return "Sacramento Kings"
    if col == "San Diego Clippers":
        return "Los Angeles Clippers"
    else:
        return col

In [51]:
team_conference_dict = {
 'Atlanta Hawks':'East',
 'Boston Celtics':'East',
 'Brooklyn Nets':'East',
 'Charlotte Hornets':'East',
 'Chicago Bulls':'East',
 'Cleveland Cavaliers':'East',
 'Dallas Mavericks':'West',
 'Denver Nuggets':'West',
 'Detroit Pistons':'East',
 'Golden State Warriors':'West',
 'Houston Rockets':'West',
 'Indiana Pacers':'East',
 'Los Angeles Clippers':'West',
 'Los Angeles Lakers':'West',
 'Memphis Grizzlies':'West',
 'Miami Heat':'East',
 'Milwaukee Bucks':'East',
 'Minnesota Timberwolves':'West',
 'New Orleans Pelicans':'West',
 'New York Knicks':'East',
 'Oklahoma City Thunder':'West',
 'Orlando Magic':'East',
 'Philadelphia 76ers':'East',
 'Phoenix Suns':'West',
 'Portland Trail Blazers':'West',
 'Sacramento Kings':'West',
 'San Antonio Spurs':'West',
 'Seattle SuperSonics':'West',
 'Toronto Raptors':'East',
 'Utah Jazz':'West',
 'Vancouver Grizzlies':'West',
 'Washington Wizards':'East'}

In [93]:
team_stats_list = []
for year in years:
    with open(f'NBA_Stats/team_standings_{year}.html', 'r', errors="ignore") as f:
        page = f.read()

        # Create soup object
        soup = BeautifulSoup(page, "html.parser")
        soup.find("tr", class_="over_header").decompose()
        # Find team standings data in HTML
        team_standings_table = soup.find(id="advanced-team")
        team_standings_df = pd.read_html(str(team_standings_table))[0]
        team_standings_df['Year'] = year
        
        # Clean team names and separate into east/west teams and add team seed
        team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
        team_standings_df = team_standings_df[team_standings_df['Team'] != "League Average"]
        team_standings_df['Team'] = team_standings_df['Team'].apply(rename_teams_long)
        team_standings_df['Conference'] = team_standings_df['Team'].map(team_conference_dict)
        
        east_temp = team_standings_df[team_standings_df['Conference'] == 'East'].sort_values(by='W', ascending=False)
        west_temp = team_standings_df[team_standings_df['Conference'] == 'West'].sort_values(by='W', ascending=False)
        east_temp['Seed'] = list(range(1,len(east_temp)+1))
        west_temp['Seed'] = list(range(1,len(west_temp)+1))

        team_standings_df = pd.concat([east_temp, west_temp])

        team_stats_list.append(team_standings_df)

team_standings_data = pd.concat(team_stats_list)
team_standings_data = team_standings_data[['Conference', 'Team', 'Seed', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORtg',
    'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'eFG%',
    'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1', 'DRB%',
    'FT/FGA.1', 'Year']]

team_standings_data['W'] = team_standings_data['W'].astype('int64')
team_standings_data['L'] = team_standings_data['L'].astype('int64')


team_standings_data.reset_index(drop=True, inplace=True)
team_standings_data.to_csv('NBA_Stats/team_standings.csv')



  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_standings_df['Team'] = team_standings_df['Team'].str.replace('*','')
  team_stand