# Part 1: Web-Scraping

### Scraping MVP Data

In [18]:
# Seasons to scrape: 2000 through 2024 inclusive (range's upper bound is exclusive).
years = [*range(2000, 2025)]

In [19]:
# URL template for a season's awards page; "{}" is filled with the year via str.format.
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [20]:
import os
import time

import requests

# Create the output folder up front so open() below cannot fail on a fresh checkout.
os.makedirs("MVP", exist_ok=True)

for year in years:
    url = url_start.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx (e.g. 429 rate limiting) instead of saving an
    # error page to disk as if it were the awards page.
    data.raise_for_status()

    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (pages may contain non-ASCII characters such as
    # accented player names).
    with open("MVP/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to stay under the site's limit on automated traffic.
    time.sleep(3.5)

In [35]:
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd

# One MVP-voting DataFrame per season; a later cell combines them with pd.concat.
df = []

for year in years:
    with open("MVP/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        # Usually means the saved file is not a real awards page (e.g. an error page).
        raise ValueError("No table with id 'mvp' found in MVP/{}.html".format(year))

    # Drop the extra grouping header row (class "over_header") that sits above
    # the real column names; left in place, read_html would build two-level
    # column labels.
    # NOTE(review): the original looked this row up as a <div>, which raises
    # AttributeError when no such div exists; on these pages the element
    # appears to be a <tr> -- confirm against a saved page.
    over_header = mvp_table.find("tr", class_="over_header")
    if over_header is not None:
        over_header.decompose()

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["Year"] = year

    df.append(mvp)

In [22]:
# Display the list of per-season MVP tables (one DataFrame per year) for a quick visual check.
df

[   Rank             Player  Age   Tm  First  Pts Won  Pts Max  Share   G  \
 0     1   Shaquille O'Neal   27  LAL    120     1207     1210  0.998  79   
 1     2      Kevin Garnett   23  MIN      0      408     1210  0.337  81   
 2     3    Alonzo Mourning   29  MIA      0      367     1210  0.303  79   
 3     4        Karl Malone   36  UTA      0      312     1210  0.258  82   
 4     5         Tim Duncan   23  SAS      0      248     1210  0.205  74   
 5     6        Gary Payton   31  SEA      0      180     1210  0.149  82   
 6     7      Allen Iverson   24  PHI      1      132     1210  0.109  70   
 7     8         Grant Hill   27  DET      0      113     1210  0.093  74   
 8     9       Chris Webber   26  SAC      0       96     1210  0.079  75   
 9    10       Vince Carter   23  TOR      0       51     1210  0.042  82   
 10   11         Jason Kidd   26  PHO      0       25     1210  0.021  67   
 11   12        Kobe Bryant   21  LAL      0        3     1210  0.002  66   

In [24]:
# Stack the per-season tables row-wise into one frame. Each season keeps its
# own row labels, so index values repeat across years.
mvp_df = pd.concat(df, axis=0)
mvp_df

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Shaquille O'Neal,27,LAL,120,1207,1210,0.998,79,40.0,...,13.6,3.8,0.5,3.0,0.574,0.000,0.524,18.6,0.283,2000
1,2,Kevin Garnett,23,MIN,0,408,1210,0.337,81,40.0,...,11.8,5.0,1.5,1.6,0.497,0.370,0.765,11.6,0.172,2000
2,3,Alonzo Mourning,29,MIA,0,367,1210,0.303,79,34.8,...,9.5,1.6,0.5,3.7,0.551,0.000,0.711,12.9,0.226,2000
3,4,Karl Malone,36,UTA,0,312,1210,0.258,82,35.9,...,9.5,3.7,1.0,0.9,0.509,0.250,0.797,15.3,0.249,2000
4,5,Tim Duncan,23,SAS,0,248,1210,0.205,74,38.9,...,12.4,3.2,0.9,2.2,0.490,0.091,0.761,13.0,0.218,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,5,Jalen Brunson,27,NYK,0,142,990,0.143,77,35.4,...,3.6,6.7,0.9,0.2,0.479,0.401,0.847,11.2,0.198,2024
5,6,Jayson Tatum,25,BOS,0,86,990,0.087,74,35.7,...,8.1,4.9,1.0,0.6,0.471,0.376,0.833,10.4,0.189,2024
6,7,Anthony Edwards,22,MIN,0,18,990,0.018,79,35.1,...,5.4,5.1,1.3,0.5,0.461,0.357,0.836,7.5,0.130,2024
7,8,Domantas Sabonis,27,SAC,0,3,990,0.003,82,35.7,...,13.7,8.2,0.9,0.6,0.594,0.379,0.704,12.6,0.206,2024


In [27]:
# Persist the combined MVP table; the row index is written as an unnamed first column.
mvp_df.to_csv("MVP.csv", index=True)

### Scraping Player Statistics

In [32]:
# URL template for a season's per-game player statistics page; "{}" is filled with the year.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [33]:
import os
import time

# Create the output folder up front so open() below cannot fail on a fresh checkout.
os.makedirs("Player Stats", exist_ok=True)

for year in years:
    url = player_stats_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx (e.g. 429 rate limiting) instead of saving an
    # error page to disk as if it were the statistics page.
    data.raise_for_status()

    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (pages may contain non-ASCII characters such as
    # accented player names).
    with open("Player Stats/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to stay under the site's limit on automated traffic.
    time.sleep(3.5)

In [46]:
from io import StringIO

# One per-game statistics DataFrame per season; a later cell combines them with pd.concat.
df = []

for year in years:
    with open("Player Stats/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Remove the first in-table header row (class "thead") so it is not parsed
    # as a data row. Tolerate its absence instead of raising AttributeError.
    # NOTE(review): only the FIRST such row is removed; if the page repeats the
    # header further down, those rows remain in the data and must be filtered
    # out later. Left as-is so the saved CSV contents do not change.
    header_row = soup.find("tr", class_="thead")
    if header_row is not None:
        header_row.decompose()

    player_stats_table = soup.find(id="div_per_game_stats")
    if player_stats_table is None:
        # Usually means the saved file is not a real statistics page (e.g. an error page).
        raise ValueError(
            "No element with id 'div_per_game_stats' found in Player Stats/{}.html".format(year)
        )

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    player_stats = pd.read_html(StringIO(str(player_stats_table)))[0]
    player_stats["Year"] = year

    df.append(player_stats)

In [47]:
# Display the list of per-season player statistics tables for a quick visual check.
df

[      Rk               Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  ORB  \
 0      1    Tariq Abdul-Wahad  SG  25  TOT  61  56  25.9  4.5  10.6  ...  1.7   
 1      1    Tariq Abdul-Wahad  SG  25  ORL  46  46  26.2  4.8  11.2  ...  1.7   
 2      1    Tariq Abdul-Wahad  SG  25  DEN  15  10  24.9  3.4   8.7  ...  1.6   
 3      2  Shareef Abdur-Rahim  SF  23  VAN  82  82  39.3  7.2  15.6  ...  2.7   
 4      3       Cory Alexander  PG  26  DEN  29   2  11.3  1.0   3.4  ...  0.3   
 ..   ...                  ...  ..  ..  ...  ..  ..   ...  ...   ...  ...  ...   
 511  436     Haywoode Workman  PG  34  MIL  23   1  10.8  1.0   2.7  ...  0.0   
 512  436     Haywoode Workman  PG  34  TOR  13   1   7.8  0.6   2.2  ...  0.0   
 513  437    Metta World Peace  SF  20  CHI  72  63  31.1  4.3  10.5  ...  0.9   
 514  438      Lorenzen Wright   C  24  ATL  75   0  16.1  2.4   4.8  ...  1.6   
 515  439            Tim Young   C  23  GSW  25   0   5.5  0.5   1.6  ...  0.5   
 
      DRB   TR

In [48]:
# Stack the per-season player tables row-wise into one frame. Each season
# keeps its own row labels, so index values repeat across years.
player_stat_df = pd.concat(df, axis=0)
player_stat_df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Tariq Abdul-Wahad,SG,25,TOT,61,56,25.9,4.5,10.6,...,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4,2000
1,1,Tariq Abdul-Wahad,SG,25,ORL,46,46,26.2,4.8,11.2,...,1.7,3.5,5.2,1.6,1.2,0.3,1.9,2.5,12.2,2000
2,1,Tariq Abdul-Wahad,SG,25,DEN,15,10,24.9,3.4,8.7,...,1.6,1.9,3.5,1.7,0.4,0.8,1.3,2.1,8.9,2000
3,2,Shareef Abdur-Rahim,SF,23,VAN,82,82,39.3,7.2,15.6,...,2.7,7.4,10.1,3.3,1.1,1.1,3.0,3.0,20.3,2000
4,3,Cory Alexander,PG,26,DEN,29,2,11.3,1.0,3.4,...,0.3,1.2,1.4,2.0,0.8,0.1,1.0,1.3,2.8,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,568,Thaddeus Young,PF,35,PHO,10,0,8.9,1.1,2.1,...,1.7,1.1,2.8,0.7,0.5,0.2,0.4,1.1,2.3,2024
758,569,Trae Young,PG,25,ATL,54,54,36.0,8.0,18.7,...,0.4,2.3,2.8,10.8,1.3,0.2,4.4,2.0,25.7,2024
759,570,Omer Yurtseven,C,25,UTA,48,12,11.4,2.1,3.8,...,1.5,2.8,4.3,0.6,0.2,0.4,0.8,1.1,4.6,2024
760,571,Cody Zeller,C,31,NOP,43,0,7.4,0.6,1.4,...,1.1,1.5,2.6,0.9,0.2,0.1,0.4,1.0,1.8,2024


In [50]:
# Persist the combined player statistics; the row index is written as an unnamed first column.
player_stat_df.to_csv("Player_Stat.csv", index=True)

### Scraping Team Data

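Download each season's standings page, then extract the Eastern and Western Conference division standings tables.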

In [51]:
# URL template for a season's standings page; "{}" is filled with the year.
teams_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [52]:
import os
import time

# Create the output folder up front so open() below cannot fail on a fresh checkout.
os.makedirs("Teams", exist_ok=True)

for year in years:
    url = teams_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx (e.g. 429 rate limiting) instead of saving an
    # error page to disk as if it were the standings page.
    data.raise_for_status()

    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open("Teams/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to stay under the site's limit on automated traffic.
    time.sleep(3.5)

In [54]:
from io import StringIO

# Two standings DataFrames per season (Eastern, then Western); a later cell
# combines them with pd.concat.
df = []

# (element id of the standings table, name of its team column) per conference.
# The team column is named after the conference, so it is normalised to "Team".
conference_tables = (
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
)

for year in years:
    with open("Teams/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Remove the first separator row (class "thead") in the document, as the
    # original did before reading each table. Tolerate its absence instead of
    # raising AttributeError.
    # NOTE(review): only the first such row is removed, so later division-name
    # rows (e.g. "Central Division") remain in the data and must be filtered
    # out before analysis. Left as-is so the saved CSV contents do not change.
    first_separator = soup.find("tr", class_="thead")
    if first_separator is not None:
        first_separator.decompose()

    for table_id, conference_column in conference_tables:
        team_standings_table = soup.find(id=table_id)
        if team_standings_table is None:
            # Usually means the saved file is not a real standings page (e.g. an error page).
            raise ValueError(
                "No element with id '{}' found in Teams/{}.html".format(table_id, year)
            )

        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        team_standings = pd.read_html(StringIO(str(team_standings_table)))[0]
        team_standings["Year"] = year
        # pop() removes the conference-named column and re-adds it as the last
        # column "Team", keeping the column order W ... SRS, Year, Team.
        team_standings["Team"] = team_standings.pop(conference_column)

        df.append(team_standings)

In [55]:
# Display the list of per-season, per-conference standings tables for a quick visual check.
df

[                   W                 L              W/L%                GB  \
 0                 52                30              .634                 —   
 1                 50                32              .610               2.0   
 2                 49                33              .598               3.0   
 3                 41                41              .500              11.0   
 4                 35                47              .427              17.0   
 5                 31                51              .378              21.0   
 6                 29                53              .354              23.0   
 7   Central Division  Central Division  Central Division  Central Division   
 8                 56                26              .683                 —   
 9                 49                33              .598               7.0   
 10                45                37              .549              11.0   
 11                42                40             

In [56]:
# Stack every season/conference standings table row-wise into one frame. Each
# table keeps its own row labels, so index values repeat.
teams_df = pd.concat(df, axis=0)
teams_df

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,52,30,.634,—,94.4,91.3,2.75,2000,Miami Heat*
1,50,32,.610,2.0,92.1,90.7,1.30,2000,New York Knicks*
2,49,33,.598,3.0,94.8,93.4,1.02,2000,Philadelphia 76ers*
3,41,41,.500,11.0,100.1,99.4,0.43,2000,Orlando Magic
4,35,47,.427,17.0,99.3,100.1,-1.00,2000,Boston Celtics
...,...,...,...,...,...,...,...,...,...
13,50,32,.610,—,117.9,115.6,2.30,2024,Dallas Mavericks* (5)
14,49,33,.598,1.0,115.1,110.7,4.46,2024,New Orleans Pelicans* (7)
15,41,41,.500,9.0,114.3,113.2,1.24,2024,Houston Rockets (11)
16,27,55,.329,23.0,105.8,112.8,-6.57,2024,Memphis Grizzlies (13)


In [57]:
# Persist the combined standings; the row index is written as an unnamed first column.
teams_df.to_csv("Teams.csv", index=True)