# 1. Introduction
The aim of this notebook is to scrape NBA MVP voting, player stats, and team standings data, which is later used to predict the MVP. <br>
Adapted from [Dataquest's](https://www.youtube.com/watch?v=JGQGd-oa0l4) YouTube video. <br>

This is a three-part project:
* Dataset: [1991-2021 NBA Stats](https://www.kaggle.com/datasets/vivovinco/19912021-nba-stats) <br>
* First part: [NBA Stats: Web Scraping](https://www.kaggle.com/code/vivovinco/nba-stats-web-scraping) <br>
* Second part: [NBA Stats: Data Cleaning](https://www.kaggle.com/code/vivovinco/nba-stats-data-cleaning) <br>
* Third part: [NBA Stats: MVP Prediction](https://www.kaggle.com/code/vivovinco/nba-stats-mvp-prediction) <br>

**If you're reading this, please upvote.**

In [None]:
!pip install selenium
!pip install chromedriver-py==94.0.4606.41
!pip install requests

# libraries
import os
import shutil
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from chromedriver_py import binary_path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import warnings; warnings.filterwarnings("ignore")

# DataFrame preview limits: render at most 50 rows and 50 columns before pandas
# truncates the output, and allow the text representation to be 1000 characters wide.
display_options = {
    'display.max_rows': 50,
    'display.max_columns': 50,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# 2. Scrape MVP List

In [None]:
# Seasons to scrape: 1991 through 2021 inclusive (the stop value of range is exclusive).
years = [*range(1991, 2022)]
# URL template of the yearly awards page; "{}" is filled with the season via str.format.
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [None]:
# Download the awards page of every season, keep a raw copy on disk, and collect the
# MVP voting table of each season (tagged with its year) in `dfs`.
dfs = []

for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    # Fail immediately on an HTTP error (e.g. 404 or 429) instead of saving an error
    # page and crashing later while parsing it.
    data.raise_for_status()

    # Keep the raw HTML next to the notebook. The encoding is explicit so that player
    # names with non-ASCII characters do not break on platforms whose default
    # encoding is not UTF-8.
    with open("./{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    soup = bs(data.text, "html.parser")
    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        raise ValueError("No table with id 'mvp' found at {}".format(url))

    # Drop the grouping header row so that read_html sees a single header row.
    over_header = mvp_table.find("tr", class_="over_header")
    if over_header is not None:
        over_header.decompose()

    # read_html expects a path, URL or file-like object; wrap the markup in StringIO
    # because passing literal HTML strings is deprecated in pandas.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["Year"] = year
    dfs.append(mvp)

    # Pause between requests to avoid hammering the site.
    # NOTE(review): 3 seconds is a conservative guess — confirm against the site's
    # current scraping policy.
    time.sleep(3)

# Concatenate once, after the loop, instead of rebuilding the frame on every iteration.
mvps = pd.concat(dfs)

In [None]:
# Preview the combined MVP voting table; how much of it is rendered is governed by
# the pandas display options (display.max_rows / display.max_columns).
mvps

In [None]:
# Save the MVP table as CSV in the working directory. index=True is the pandas default
# and is spelled out here because the row index is written as an unnamed first column.
mvps.to_csv("mvps.csv", index=True)

# 3. Scrape Player Stats

In [None]:
# URL template of the per-game player statistics page of one season; "{}" is replaced
# with the season year via str.format.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [None]:
# Render every season's per-game page in Chrome and save the resulting HTML under
# ./player. Create the folder first so open() cannot fail with FileNotFoundError.
os.makedirs("player", exist_ok=True)

# Use the chromedriver binary shipped with the installed chromedriver-py package instead
# of a machine-specific path, and hand it to Chrome through a Service object: the
# `executable_path` keyword of webdriver.Chrome is deprecated in Selenium 4 and no
# longer accepted by recent releases.
driver = webdriver.Chrome(service=Service(executable_path=binary_path))

try:
    for year in years:
        url = player_stats_url.format(year)

        driver.get(url)
        # Scroll down and wait briefly before grabbing the page source.
        # NOTE(review): presumably this gives lazily rendered content time to load — confirm.
        driver.execute_script("window.scrollTo(1,10000)")
        time.sleep(2)

        # Explicit UTF-8 so non-ASCII player names are written identically everywhere.
        with open("player/{}.html".format(year), "w", encoding="utf-8") as f:
            f.write(driver.page_source)
finally:
    # Always shut the browser down, even if a page load or file write fails.
    driver.quit()

In [None]:
# Parse the saved per-game pages and collect one DataFrame per season in `dfs`.
dfs = []

for year in years:
    with open("player/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # The parser class is imported as `bs` in this notebook; the bare name
    # `BeautifulSoup` is not defined and raised NameError here.
    soup = bs(page, 'html.parser')

    # find() returns only the first match, so a single `tr.thead` row is removed.
    # NOTE(review): further rows with that class, if any, stay in the table —
    # confirm that downstream cleaning handles them.
    first_thead = soup.find('tr', class_="thead")
    if first_thead is not None:
        first_thead.decompose()

    player_table = soup.find_all(id="per_game_stats")[0]
    # Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    player_df["Year"] = year
    dfs.append(player_df)

In [None]:
# Stack the per-season frames row-wise into one table. The defaults are spelled out:
# each frame keeps its own row index, so index values repeat across seasons.
players = pd.concat(dfs, axis=0, ignore_index=False)
players

In [None]:
# Save the player table as CSV in the working directory. index=True is the pandas default
# and is spelled out here because the row index is written as an unnamed first column.
players.to_csv("players.csv", index=True)

In [None]:
# URL template of the standings page of one season; "{}" is replaced with the season
# year via str.format.
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [None]:
# Download every season's standings page and save the raw HTML under ./team.
# Create the folder first so open() cannot fail with FileNotFoundError.
os.makedirs("team", exist_ok=True)

for year in years:
    url = team_stats_url.format(year)

    data = requests.get(url)
    # Fail immediately on an HTTP error (e.g. 404 or 429) instead of silently saving
    # an error page that breaks the parsing step later.
    data.raise_for_status()

    # Explicit UTF-8 so the file content does not depend on the platform's default encoding.
    with open("team/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to avoid hammering the site.
    # NOTE(review): 3 seconds is a conservative guess — confirm against the site's
    # current scraping policy.
    time.sleep(3)

In [None]:
# Parse the saved standings pages and collect one DataFrame per conference and season
# in `dfs` (Eastern first, then Western, for every year).
dfs = []

# (suffix of the table id, name of the column that holds the team names)
conferences = (("E", "Eastern Conference"), ("W", "Western Conference"))

for year in years:
    with open("team/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # The parser class is imported as `bs` in this notebook; the bare name
    # `BeautifulSoup` is not defined and raised NameError here.
    soup = bs(page, 'html.parser')

    # find() returns only the first match, so a single `tr.thead` row is removed from
    # the page; any other rows with that class remain in the tables, exactly as before.
    first_thead = soup.find('tr', class_="thead")
    if first_thead is not None:
        first_thead.decompose()

    for suffix, conference_column in conferences:
        table = soup.find_all(id="divs_standings_{}".format(suffix))[0]
        # Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
        conference_df = pd.read_html(StringIO(str(table)))[0]
        conference_df["Year"] = year
        # Move the team names into a conference-independent "Team" column so both
        # conferences share the same schema. Add-then-delete keeps "Team" as the last column.
        conference_df["Team"] = conference_df[conference_column]
        del conference_df[conference_column]
        dfs.append(conference_df)

In [None]:
# Stack the per-conference, per-season frames row-wise into one table. The defaults are
# spelled out: each frame keeps its own row index, so index values repeat.
teams = pd.concat(dfs, axis=0, ignore_index=False)
teams

In [None]:
# Save the team table as CSV in the working directory. index=True is the pandas default
# and is spelled out here because the row index is written as an unnamed first column.
teams.to_csv("teams.csv", index=True)