## Scraping NBA MVPs from Basketball-Reference

In [None]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

Creating lists of years covering 1991 - 2021
<br /> The years need to be separated into two ranges because of the request volume limit of 30

In [None]:
# The seasons are split into two batches of years.
years = [*range(1991, 2010)]   # 1991 through 2009
years2 = [*range(2010, 2022)]  # 2010 through 2021

# Every season handled by the notebook: 1991 through 2021.
year_all = [*range(1991, 2022)]

Getting the URL of the website to scrape, using {} as a placeholder so the year can be filled in inside a loop

In [None]:
# URL template for one season's awards page; the {} placeholder is filled in with the year.
url_start = (
    "https://www.basketball-reference.com"
    "/awards/awards_{}.html"
)

Loop over both year ranges once, fetching each page with requests and writing its HTML to a file in the `mvp` folder

In [None]:
# Download every season's awards page and cache the raw HTML under mvp/.
# Seasons whose file already exists are skipped, so the cell can be re-run
# (for example after being rate-limited) without requesting pages again.
os.makedirs("mvp", exist_ok=True)  # make sure the output folder exists
request_delay_seconds = 4  # pause between requests; increase if the site still rate-limits

for year in year_all:
  path = "mvp/{}.html".format(year)
  if os.path.exists(path):
    continue  # already downloaded by an earlier run

  url = url_start.format(year)  # replaces {} with the year in loop
  data = requests.get(url, timeout=30)  # getting html data from the page
  data.raise_for_status()  # stop here instead of saving an error page as if it were data

  # utf-8 is explicit so accented player names are written the same way on every platform
  with open(path, "w+", encoding="utf-8") as f:
    f.write(data.text)  # writes html into file

  time.sleep(request_delay_seconds)

Loop over every saved page, parse it, find the MVP table and store it in a dataframe. A Year column is added to keep track of which season each table belongs to

In [None]:
mvp_all = []  # list holding one dataframe per season
for year in year_all:
  path = "mvp/{}.html".format(year)
  with open(path, encoding="utf-8") as f:  # opened for reading only (the default mode)
    page = f.read()
  soup = BeautifulSoup(page, "html.parser")  # using BeautifulSoup to parse html

  # Remove the first "over_header" row when there is one
  over_header = soup.find("tr", class_="over_header")
  if over_header is not None:
    over_header.decompose()

  mvp_table = soup.find(id="mvp")  # the element with id "mvp", or None when the page has none
  if mvp_table is None:
    raise ValueError("No element with id 'mvp' found in {}".format(path))

  # read_html returns a list of dataframes, so take the first one.
  # The HTML is wrapped in StringIO because passing a literal HTML string is deprecated in pandas.
  mvp = pd.read_html(StringIO(str(mvp_table)))[0]
  mvp["Year"] = year  # remember which season this table came from

  mvp_all.append(mvp)

In [None]:
# Stack the per-season tables row-wise into a single dataframe (original row indexes are kept).
mvps = pd.concat(mvp_all, axis=0)

In [None]:
# Save the combined MVP table; the row index is written as the first column.
mvps.to_csv("mvps.csv", index=True)

Selenium does not work with Google Colab, so this code will only work locally or with Jupyter

Simple use of Selenium: open the web page first so that all of its JavaScript executes, then take the page's source

In [None]:
# NOTE: everything below is wrapped in a triple-quoted string literal, so none of it executes.
# The disabled code opens each season's per-game stats page in a Chrome driver, scrolls,
# waits two seconds and writes the rendered page source to player/<year>.html.
"""
driver = webdriver.Chrome() #Starts driver

player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

for year in year_all:
  url = player_stats_url.format(year)

  driver.get(url) #driver opens page
  driver.execute_script("window.scrollTo(1,1000)") #scrolls all the way down to execute all javascript

  time.sleep(2)
  html = driver.page_source #Gets all the tables instead of only a few
  with open("player/{}.html".format(year), "w+") as f:
    f.write(html)
"""

Remove the headers in-between and append into dataframe, same as before

In [None]:
# NOTE: everything below is wrapped in a triple-quoted string literal, so none of it executes.
# The disabled code reads each player/<year>.html file, removes the first "thead" row,
# parses the element with id "per_game_stats" into a dataframe, tags it with the year
# and appends it to the list `df`.
"""
df = [] #initialises dataframe
for year in year_all:
  with open("player/{}.html".format(year)) as f:
    page = f.read()
  soup = BeautifulSoup(page, "html.parser") #using BeautifulSoup to parse html
  soup.find("tr", class_="thead").decompose() #removing the header of the table
  player_table = soup.find(id="per_game_stats") #finds the mvp table and puts it in a list
  player = pd.read_html(str(player_table))[0] #Get the first table in the list of dataframes
  player["Year"] = year

  df.append(player)
"""

In [None]:
# NOTE: everything below is wrapped in a triple-quoted string literal, so none of it executes.
# The disabled code concatenates the dataframes in `df` and writes them to players.csv.
"""
players = pd.concat(df)
players.to_csv("players.csv")
"""

In [None]:
# URL template for one season's standings page; the {} placeholder is filled in with the year.
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

# Download every season's standings page and cache the raw HTML under team/.
# Seasons whose file already exists are skipped, so the cell can be re-run
# (for example after being rate-limited) without requesting pages again.
os.makedirs("team", exist_ok=True)  # make sure the output folder exists
request_delay_seconds = 4  # pause between requests; increase if the site still rate-limits

for year in year_all:
  path = "team/{}.html".format(year)
  if os.path.exists(path):
    continue  # already downloaded by an earlier run

  url = team_stats_url.format(year)

  data = requests.get(url, timeout=30)
  data.raise_for_status()  # stop here instead of saving an error page as if it were data

  # utf-8 is explicit so non-ASCII characters are written the same way on every platform
  with open(path, "w+", encoding="utf-8") as f:
      f.write(data.text)

  time.sleep(request_delay_seconds)

In [None]:
df = []  # list of standings dataframes, one per conference per season

# (table element id, name of the column holding the team names) for each conference
conference_tables = (
  ("divs_standings_E", "Eastern Conference"),
  ("divs_standings_W", "Western Conference"),
)

for year in year_all:
  path = "team/{}.html".format(year)
  with open(path, encoding="utf-8") as f:
    page = f.read()
  soup = BeautifulSoup(page, "html.parser")

  # Only the first "thead" row in the page is removed; any later ones stay in the parsed data
  header_row = soup.find("tr", class_="thead")
  if header_row is not None:
    header_row.decompose()

  for table_id, conference_column in conference_tables:
    team_table = soup.find(id=table_id)
    if team_table is None:
      raise ValueError("No element with id '{}' found in {}".format(table_id, path))

    # The HTML is wrapped in StringIO because passing a literal HTML string is deprecated in pandas
    team = pd.read_html(StringIO(str(team_table)))[0]
    team["Year"] = year
    # Move the team names into a shared "Team" column, added last (after "Year")
    team["Team"] = team.pop(conference_column)

    df.append(team)

In [None]:
# Stack the per-conference, per-season tables row-wise into a single dataframe.
teams = pd.concat(df, axis=0)

In [None]:
# Save the combined standings table; the row index is written as the first column.
teams.to_csv("teams.csv", index=True)