# Predicting NBA MVP - Webscraping

I want to use the machine learning to predict the MVP for the NBA.

First we'll scrape the mvp, player, and team stats from the website basketball-reference.com. Each year has its own webpage, so we'll need to automate the downloading of each year's information, parse the information from each page, then concatenate the pages into one csv file. That file gets saved for later use.

# Import all the things

In [1]:
import pandas as pd

import time
import requests
import os
#import shutil

from bs4 import BeautifulSoup
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service

# Get all the statistics

## Get the MVP data from 1991-2021

In [2]:
# list of years
years = list(range(1991, 2021))

In [3]:
# url for the MVP stats follows this form - the year goes between the brackets
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [4]:
# loop through all years and save the html page locally to parse later
# you may need to create an mvp folder first!!

for year in years:
    url = url_start.format(year)
    
    data = requests.get(url)
    
    with open("mvp/{}.html".format(year), "w+", errors = 'replace') as f:
        f.write(data.text)

In [5]:

# test first html file (1991) to see if it parsed correctly
with open("mvp/1991.html") as f:
    page = f.read()
    
soup = BeautifulSoup(page, 'html.parser')
soup.find('tr', class_="over_header").decompose()

In [7]:
mvp_table = soup.find_all(id="mvp")[0]

In [8]:
mvp_1991 = pd.read_html(str(mvp_table))[0]
mvp_1991.head(1)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321


In [9]:
mvp_1991["Year"] = 1991
mvp_1991.head(1)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991


In [10]:
# everything parses correctly, so create a loop to process every html file in the mvp folder

dfs = []
for year in years:
    with open("mvp/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find_all(id="mvp")[0]
    mvp_df = pd.read_html(str(mvp_table))[0]
    mvp_df["Year"] = year
    dfs.append(mvp_df)

In [11]:
# merge the years together

mvps = pd.concat(dfs)
mvps.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
7,8,Damian Lillard,29,POR,0.0,23.0,1010,0.023,66,37.5,...,4.3,8.0,1.1,0.3,0.463,0.401,0.888,11.6,0.225,2020
8,9,Nikola Joki?,24,DEN,0.0,18.0,1010,0.018,73,32.0,...,9.7,7.0,1.2,0.6,0.528,0.314,0.817,9.8,0.202,2020
9,10,Pascal Siakam,25,TOR,0.0,17.0,1010,0.017,60,35.2,...,7.3,3.5,1.0,0.9,0.453,0.359,0.792,5.4,0.123,2020
10,11,Jimmy Butler,30,MIA,0.0,9.0,1010,0.009,58,33.8,...,6.7,6.0,1.8,0.6,0.455,0.244,0.834,9.0,0.221,2020
11,12,Jayson Tatum,21,BOS,0.0,1.0,1010,0.001,66,34.3,...,7.0,3.0,1.4,0.9,0.45,0.403,0.812,6.9,0.146,2020


In [12]:
# save to csv file
mvps.to_csv("mvps.csv")

## Get the player stats for 1991-2021

In [13]:
# all the urls follow this form - the years go inside the brackets
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [14]:
# loop through all years and save the html page locally to parse later
# you may need to create a player folder first!!

for year in years:
    url = player_stats_url.format(year)
    
    data = requests.get(url)
    
    with open("player/{}.html".format(year), "w+", errors = 'replace') as f:
        f.write(data.text)

In [15]:
# set a variable for the Edge webdriver location
# the sites with the player information only gives part of the information
# so we need to automate the javascript to get the rest

service = Service(verbose = True)
driver = webdriver.Edge(service = service)

In [16]:
#loop through all the html files and parse them

for year in years:
    url = player_stats_url.format(year)
    
    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)
    
    with open("player/{}.html".format(year), "w+", errors = 'replace') as f:
        f.write(driver.page_source)

In [20]:
# combine all the tables into a list of dataframes
# dfs[0] would be the stats table for 1991

dfs = []
for year in years:
    with open("player/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    player_table = soup.find_all(id="per_game_stats")[0]
    player_df = pd.read_html(str(player_table))[0]
    player_df["Year"] = year
    dfs.append(player_df)

In [21]:
# creates a dataframe from the list of dataframes
# concatenating them all together at axis = 0

players = pd.concat(dfs)
players.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [22]:
# save file to csv
players.to_csv("players.csv")

## Get the team stats for 1991-2021

Conventional wisdom says that mvps will come from winning teams more than losing ones. I'll pull the team stats into the data to see if this is only a myth or if there's really something to it.

In [23]:
# the team stats follows this form - the years go inside the bracket

team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [24]:
# loop through and create html files for each year

for year in years:
    url = team_stats_url.format(year)
    
    data = requests.get(url)
    
    with open("team/{}.html".format(year), "w+", errors = 'replace') as f:
        f.write(data.text)

In [25]:
# combine all the tables into a list of dataframes
# dfs[0] would be the stats table for 1991

dfs = []
for year in years:
    with open("team/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    e_table = soup.find_all(id="divs_standings_E")[0]
    e_df = pd.read_html(str(e_table))[0]
    e_df["Year"] = year
    e_df["Team"] = e_df["Eastern Conference"]
    del e_df["Eastern Conference"]
    dfs.append(e_df)
    
    w_table = soup.find_all(id="divs_standings_W")[0]
    w_df = pd.read_html(str(w_table))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)

In [26]:
# creates a dataframe from the list of dataframes
# concatenating them all together at axis = 0

teams = pd.concat(dfs)
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
13,44,28,0.611,—,117.8,114.8,3.13,2020,Houston Rockets*
14,43,32,0.573,2.5,117.0,112.1,4.87,2020,Dallas Mavericks*
15,34,39,0.466,10.5,112.6,113.7,-0.91,2020,Memphis Grizzlies
16,32,39,0.451,11.5,114.1,115.2,-0.65,2020,San Antonio Spurs
17,30,42,0.417,14.0,115.8,117.1,-0.55,2020,New Orleans Pelicans


In [27]:
# save the file to csv
teams.to_csv("teams.csv")