In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1. MVP Stats

In [2]:
# Testing on Single Page

# url_base_MVP = r"https://www.basketball-reference.com/awards/awards_{}.html"
# year = 1991

# url_MVP = url_base_MVP.format(year)
# data = requests.get(url_MVP)
# data.text

### (a)  Downloading the MVP Html Pages

In [3]:
years = list(range(1991,2023))
base_url_MVP = r"https://www.basketball-reference.com/awards/awards_{}.html"

In [4]:
def download_MVP_HtmlPage(Years_arr , base_url):
    
    """
    Downloads the MVPs Html Pages corresponding to each year mentioned in the Years_arr.
    
    """
     
    for year in Years_arr:
        
        url_MVP = base_url.format(year)
        data = requests.get(url_MVP) 

        with open("mvp/{}.html".format(year) ,"w+",encoding='utf-8') as f:
            f.write(data.text)

In [5]:
download_MVP_HtmlPage(Years_arr = years, base_url = base_url_MVP)

### (b) Scrapping MVP Html Pages

In [6]:
# Testing on single page

# year = 1991

# with open("mvp/{}.html".format(year) , "r",encoding='utf-8') as f:
    
#     page = f.read()


# soup = BeautifulSoup(page , "html.parser")

# # tr element : table row

# soup.find("tr" ,class_ = "over_header").decompose()

# mvp_table = soup.find(id = "mvp")

# #type(mvp_table)

# # Returned a list containing the DF
# mvp_1991 = pd.read_html(str(mvp_table))  

# mvp_1991

# #type(mvp_1991)
# # Extracting the first element of the list which is the Required DF.
# mvp_1991[0]     

# #type(mvp_1991[0])

In [7]:
def Scrap_MVP_HtmlPage(years_arr ):
    
    """
    Returns the combined MVPs DataFrame corresponding to the all the years 
    mentioned in the years_arr.
    
    """
    dfs = []

    for year in years_arr :

        with open("mvp/{}.html".format(year) , "r",encoding='utf-8') as f:

            page = f.read()

        # Allternate way : soup = BeautifulSoup(page , "lxml")
        soup = BeautifulSoup(page , "html.parser")

        # tr element : table row
        soup.find("tr" ,class_ = "over_header").decompose()
        # table element
        mvp_table = soup.find(id = "mvp")  

        mvp = pd.read_html(str(mvp_table))[0]
        mvp['Year'] = year

        dfs.append(mvp)

    mvps = pd.concat(dfs)
    mvps.sort_values(by = ["Year"] , ascending = True , inplace = True)
    mvps.reset_index(drop = True , inplace = True)
    return mvps

In [8]:
MVPs = Scrap_MVP_HtmlPage(years_arr = years)

In [9]:
MVPs.shape

(486, 21)

In [10]:
MVPs.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,19T,Kevin McHale,33,BOS,0.0,1.0,960,0.001,68,30.4,...,7.1,1.9,0.4,2.1,0.553,0.405,0.829,7.9,0.182,1991
2,19T,Tim Hardaway,24,GSW,0.0,1.0,960,0.001,82,39.2,...,4.0,9.7,2.6,0.1,0.476,0.385,0.803,9.9,0.148,1991
3,18,Hakeem Olajuwon,28,HOU,0.0,4.0,960,0.004,56,36.8,...,13.8,2.3,2.2,3.9,0.508,0.0,0.769,8.6,0.201,1991
4,17,Kenny Smith,25,HOU,0.0,5.0,960,0.005,78,34.6,...,2.1,7.1,1.4,0.1,0.52,0.363,0.844,9.0,0.161,1991


In [11]:
MVPs.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
481,2,Joel Embiid,27,PHI,26.0,706.0,1000,0.706,68,33.8,...,11.7,4.2,1.1,1.5,0.499,0.371,0.814,12.0,0.252,2022
482,1,Nikola Jokić,26,DEN,65.0,875.0,1000,0.875,74,33.5,...,13.8,7.9,1.5,0.9,0.583,0.337,0.81,15.2,0.296,2022
483,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,37.2,...,7.4,6.4,0.9,0.9,0.518,0.383,0.91,8.4,0.198,2022
484,5,Luka Dončić,22,DAL,0.0,146.0,1000,0.146,65,35.4,...,9.1,8.7,1.2,0.6,0.457,0.353,0.744,7.6,0.159,2022
485,10T,LeBron James,37,LAL,0.0,1.0,1000,0.001,56,37.2,...,8.2,6.2,1.3,1.1,0.524,0.359,0.756,7.5,0.172,2022


In [12]:
MVPs['Year'].unique()

array([1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
      dtype=int64)

## 2. Players Stats

In [13]:
from selenium import webdriver

In [14]:
import time

### (a) Downloading the Player Html Pages

In [15]:
base_url_Player = r"https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
driver = webdriver.Chrome()

In [16]:
def download_Player_HtmlPage(Years_arr , base_url):
    
    """
    Downloads the Player Html Pages corresponding to each year mentioned in the Years_arr.
    
    """
    
    for year in Years_arr:
    
        url_player = base_url.format(year)

        driver.get(url_player)
        driver.execute_script("window.scrollTo(1,10000)")

        time.sleep(4)

        html = driver.page_source

        with open("player/{}.html".format(year) ,"w+",encoding='utf-8') as f:
            f.write(html)


In [17]:
download_Player_HtmlPage(Years_arr = years, base_url = base_url_Player)

### (b) Scrapping the Player html Pages

In [18]:
def Scrap_Player_HtmlPage(years_arr):
    
    """
    Returns the combined Players DataFrame corresponding to the all the years 
    mentioned in the years_arr.
    
    """
    dfs = []

    for year in years_arr :

        with open("player/{}.html".format(year) , "r" ,encoding='utf-8' ) as f :

            page = f.read()

        # Allternate way : soup = BeautifulSoup(page , "lxml")
        soup = BeautifulSoup(page , "html.parser")

        # tr element : table row
        soup.find("tr" ,class_ = "thead").decompose()

        # table element
        player_table = soup.find(id = "per_game_stats")  

        player = pd.read_html(str(player_table))[0]  

        player['Year'] = year

        dfs.append(player)

    Players_DF = pd.concat(dfs)
    Players_DF.sort_values(by = "Year" , ascending = True , inplace = True)
    Players_DF.reset_index(drop = True ,inplace = True)
    
    return Players_DF


In [19]:
Players = Scrap_Player_HtmlPage(years_arr = years)

In [20]:
Players.shape

(18885, 31)

In [21]:
Players.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,264,Dražen Petrović*,SG,26,POR,18,0,7.4,1.8,3.9,...,0.6,0.4,1.0,1.1,0.3,0.0,0.7,1.2,4.4,1991
2,264,Dražen Petrović*,SG,26,TOT,61,0,16.6,4.0,8.1,...,0.8,1.0,1.8,1.4,0.7,0.0,1.3,2.2,10.2,1991
3,263,Jim Petersen,C,28,GSW,62,21,13.5,1.8,3.8,...,1.1,2.1,3.2,0.4,0.2,0.7,0.8,2.5,4.5,1991
4,262,Chuck Person,SF,26,IND,80,79,32.1,7.8,15.4,...,1.5,3.7,5.2,3.0,0.7,0.2,2.3,2.8,18.4,1991


In [22]:
Players.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
18880,213,Rui Hachimura,PF,23,WAS,42,13,22.5,4.5,9.1,...,0.6,3.2,3.8,1.1,0.5,0.2,0.8,1.3,11.3,2022
18881,214,Tyrese Haliburton,SG-PG,21,TOT,77,77,35.0,5.6,11.8,...,0.8,3.2,4.0,8.2,1.7,0.6,2.6,1.6,15.3,2022
18882,214,Tyrese Haliburton,SG,21,SAC,51,51,34.5,5.3,11.5,...,0.8,3.1,3.9,7.4,1.7,0.7,2.3,1.4,14.3,2022
18883,192,Shai Gilgeous-Alexander,PG,23,OKC,56,56,34.7,8.5,18.8,...,0.7,4.3,5.0,5.9,1.3,0.8,2.8,2.5,24.5,2022
18884,605,Ivica Zubac,C,24,LAC,76,76,24.4,4.1,6.5,...,2.9,5.6,8.5,1.6,0.5,1.0,1.5,2.7,10.3,2022


In [23]:
Players['Year'].unique()

array([1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
      dtype=int64)

## 3. Teams Stats

### (a) Downloading Team Html Pages

In [24]:
base_url_team =  r"https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [25]:
def download_Team_HtmlPage(Years_arr , base_url):
    
    for year in Years_arr :
    
        url_teams = base_url.format(year)
        data = requests.get(url_teams) 

        with open("team/{}.html".format(year) ,"w+",encoding='utf-8') as f:
            f.write(data.text)

In [26]:
download_Team_HtmlPage(Years_arr = years , base_url = base_url_team)

### (b) Scrapping Team Html Pages

In [27]:
def Scrap_Team_HtmlPage(years_arr):
    
    """
    Returns the combined Teams DataFrame corresponding to the all the years 
    mentioned in the years_arr.
    
    """
    dfs = []

    for year in years :

        with open("team/{}.html".format(year) , "r" ,encoding='utf-8' ) as f :

            page = f.read()

        # Allternate way : soup = BeautifulSoup(page , "lxml")
        soup = BeautifulSoup(page , "html.parser")

        # tr element : table row
        soup.find("tr" ,class_ = "thead").decompose()

        # table element - for Eastern Conference 
        team_table = soup.find(id = "divs_standings_E")  

        team = pd.read_html(str(team_table))[0]  

        team['Year'] = year 
        team['Team'] = team['Eastern Conference']
        
        del team['Eastern Conference']

        dfs.append(team)

        team_table = soup.find(id = "divs_standings_W")  
        
        team = pd.read_html(str(team_table))[0]  

        team['Year'] = year 
        team['Team'] = team['Western Conference']

        del team['Western Conference']
        
        dfs.append(team)
    
    teams_df = pd.concat(dfs)
    
    teams_df.sort_values(by = "Year" , ascending = True , inplace =True)
    teams_df.reset_index(drop =True , inplace = True)
    
    return teams_df


In [28]:
Teams = Scrap_Team_HtmlPage(years_arr = years)

In [29]:
Teams.shape

(1068, 9)

In [30]:
Teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,25,57,0.305,38.0,96.7,103.5,-6.27,1991,Sacramento Kings
2,31,51,0.378,32.0,103.5,107.0,-3.16,1991,Los Angeles Clippers
3,41,41,0.5,22.0,106.6,105.4,1.31,1991,Seattle SuperSonics*
4,44,38,0.537,19.0,116.6,115.0,1.72,1991,Golden State Warriors*


In [32]:
Teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
1063,53,29,0.646,—,110.0,105.6,4.23,2022,Miami Heat*
1064,43,39,0.524,10.0,113.9,112.4,1.55,2022,Atlanta Hawks*
1065,43,39,0.524,10.0,115.3,114.9,0.53,2022,Charlotte Hornets
1066,22,60,0.268,31.0,104.2,112.2,-7.67,2022,Orlando Magic
1067,20,62,0.244,36.0,109.7,118.2,-8.26,2022,Houston Rockets


## 4. Export DF as csv Files

In [33]:
MVPs.to_csv("mvps.csv")
Players.to_csv("players.csv")
Teams.to_csv("team.csv")