In [7]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time

### Defining Year range

In [2]:
years = list(range(1991,2023))

### Using Requests to get webpages and BeautifulSoup to parse the HTML pages

#### This is for MVP data

In [None]:
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

for year in years: 
    url = url_start.format(year)
    data = requests.get(url)
    with open("mvp/{}.html".format(year), "w+") as f: #w+ to overwrite the files if they exist
        f.write(data.text)

#### Creating a pandas df for all the data of the MVP's from 1991 

In [4]:
dfs = []
for year in years:
    with open("safe/{}.html".format(year)) as f: # alternate folder to keep webscraped html files
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser') # html parser
    soup.find('tr', class_="over_header").decompose() # removing the table row with class as over_header
    mvp_table = soup.find_all(id="mvp")[0] 
    mvp_df = pd.read_html(str(mvp_table))[0] # converting the html mvp_table to string to put it in pandas readable format
    mvp_df["Year"] = year # know the year after concatenation i.e. from which webpage we got that piece of data
    dfs.append(mvp_df)

In [14]:
mvps =  pd.concat(dfs) # putting all the MVP data into a single csv
mvps.tail()
mvps.to_csv("mvps.csv")

### Webscraping using Selenium

#### This is the data for all the players in the NBA and we will use this to see the difference b/w MVP and regular players

##### We need selenium for this as using requests to get the webpage will only load a few lines of the data as it uses JS to load the webpage as the user scrolls so we set a scroller using selenium on a created chrome browser to get all the data

In [9]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html" # the webpage for all per game stats of players


In [18]:
driver = webdriver.Chrome(executable_path = r"C:\Users\dixit\Documents\college stuff\2nd year\Project\selenium chrome driver") # creates a new chrome browser

  driver = webdriver.Chrome(


In [51]:
for year in years:
    url = player_stats_url.format(year) 

    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)") # telling the driver to scroll through the website
    time.sleep(2) # setting up a sleep timer to not get a timeout from the website 

    with open("player/{}.html".format(year), "w+") as f:
        f.write(driver.page_source)

In [52]:
dfs = []
for year in years:
    with open("player/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose() # removing the table rows with the class = "thead" as they are unnecessary for our purposes
    player_table = soup.find_all(id="per_game_stats")[0]
    player_df = pd.read_html(str(player_table))[0]
    player_df["Year"] = year
    dfs.append(player_df)

In [53]:
players = pd.concat(dfs)

In [54]:
players.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [74]:
players.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
836,601,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022
837,602,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022
838,603,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022
839,604,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022
840,605,Ivica Zubac,C,24,LAC,76,76,24.4,4.1,6.5,...,2.9,5.6,8.5,1.6,0.5,1.0,1.5,2.7,10.3,2022


In [55]:
players.to_csv("players.csv")

#### Using BS4 and requests to get the team and their records

##### The record of the team is considered to be a huge factor while deciding the MVP. We will be using division standings to get the team record and other data for our analysis

In [57]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [68]:
year = 2022
url = team_stats_url.format(year)

data = requests.get(url)

with open("team/{}.html".format(year), "w+") as f:
    f.write(data.text)

#### Since there are 2 different table on each webpage for eastern and western confrence we will parse both the tables seprately and then append them into the final dataframe

In [76]:
dfs = []
for year in years:
    with open("team/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose() # decompose the table row where class = "thead"
    e_table = soup.find_all(id="divs_standings_E")[0] # finding the table with the given name -> note that this is the table for eastern conference only
    e_df = pd.read_html(str(e_table))[0]
    e_df["Year"] = year
    e_df["Team"] = e_df["Eastern Conference"] # note that the team name is stored under the heading eastern confrence so we change that to team
    del e_df["Eastern Conference"]
    dfs.append(e_df)
    
    w_table = soup.find_all(id="divs_standings_W")[0]
    w_df = pd.read_html(str(w_table))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)

#### Combining the team stats

In [70]:
teams = pd.concat(dfs)

In [71]:
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
13,56,26,0.683,—,115.6,109.9,5.37,2022,Memphis Grizzlies*
14,52,30,0.634,4.0,108.0,104.7,3.12,2022,Dallas Mavericks*
15,36,46,0.439,20.0,109.3,110.3,-0.84,2022,New Orleans Pelicans*
16,34,48,0.415,22.0,113.2,113.0,0.02,2022,San Antonio Spurs
17,20,62,0.244,36.0,109.7,118.2,-8.26,2022,Houston Rockets


#### * in the above table are teams which went to the playoffs

In [72]:
teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [73]:
teams.to_csv("teams.csv")