In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pandas as pd

In [7]:
# First and last season of interest (same values as the scrapers' defaults).
first_year, last_year = 1981, 2024

In [8]:
def scrap_mvps(first_year: int = 1981, last_year: int = 2024) -> None:
    """Scrape the first table of each season's awards page into one CSV.

    For every year from ``first_year`` to ``last_year`` (inclusive) the page
    ``awards_{year}.html`` is downloaded and the first ``<table>`` on it is
    parsed. Each data row is prefixed with the season year and all rows are
    written to ``../data/raw/{first_year}_{last_year}_mvps.csv``.

    Args:
        first_year: First season to download.
        last_year: Last season to download (inclusive).

    Notes:
        The function sleeps 5 seconds after each request. The column names
        of the last season processed are used for the whole file.
    """
    all_rows = []
    # Stays None when the year range is empty, so an empty CSV is written
    # instead of failing on an unbound name.
    headers = None
    for year in range(first_year, last_year + 1):
        url = f"https://www.basketball-reference.com/awards/awards_{year}.html"
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, "lxml")
        table_rows = soup.find("table").find_all("tr")
        # Column names are read from the second row; the first row is
        # presumably a grouping header -- confirm against the page.
        headers = [th.get_text() for th in table_rows[1].find_all("th")]
        # The leading <th> column is not part of the <td> cells collected
        # below, so its slot is reused for the season year.
        headers[0] = "Year"
        for row in table_rows[2:]:
            cells = [td.get_text() for td in row.find_all("td")]
            if not cells:
                # Rows without data cells would become junk "[year]" records.
                continue
            all_rows.append([year, *cells])
        print("Season", year, "done")
        time.sleep(5)
    mvps = pd.DataFrame(all_rows, columns=headers)
    mvps.to_csv(f"../data/raw/{first_year}_{last_year}_mvps.csv", index=False)

def scrap_standings(first_year: int = 1981, last_year: int = 2024) -> None:
    """Scrape league standings for each season and save them as one CSV.

    For every year from ``first_year`` to ``last_year`` (inclusive) the
    standings page is downloaded and its first two tables are parsed with
    ``scrap_conference``. The two conferences are merged, ordered by
    ``W/L%`` (best first) and numbered 1..n in a ``seed`` column. The result
    is written to ``../data/raw/{first_year}_{last_year}_standings.csv``.

    Args:
        first_year: First season to download.
        last_year: Last season to download (inclusive).

    Notes:
        The function sleeps 5 seconds after each request.
    """
    seasons = []
    for year in range(first_year, last_year + 1):
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html#expanded_standings"
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, "lxml")
        tables = soup.find_all("table")
        # The first table is treated as the East, the second as the West.
        east_data, headers = scrap_conference(tables[0], year)
        west_data, _ = scrap_conference(tables[1], year)
        league = pd.concat([
            pd.DataFrame(east_data, columns=headers),
            pd.DataFrame(west_data, columns=headers),
        ])
        # Rows that carry no win percentage (e.g. sub-header rows without
        # data cells) are not teams and must not consume a seed number.
        league = league.dropna(subset=["W/L%"])
        # NOTE: the values are scraped text, so this is a string sort. A
        # stable algorithm keeps tied teams in a deterministic order.
        league = league.sort_values(
            by=["W/L%"], ascending=False, kind="mergesort"
        )
        league["seed"] = range(1, len(league) + 1)
        seasons.append(league)
        print("Season", year, "done")
        time.sleep(5)
    # Concatenate once at the end instead of growing a frame inside the loop;
    # an empty year range still produces an (empty) CSV.
    all_rows = pd.concat(seasons) if seasons else pd.DataFrame()
    all_rows.to_csv(f"../data/raw/{first_year}_{last_year}_standings.csv", index=False)
    
def scrap_conference(table, year):
    """Turn one standings ``<table>`` into row lists plus column names.

    Args:
        table: Parsed table element exposing ``find_all`` (a BeautifulSoup
            tag in this file).
        year: Value stored in the leading "Year" column of every row.

    Returns:
        A ``(rows_data, headers)`` tuple. ``headers`` is the text of the
        first row's ``<th>`` cells with the first entry replaced by "Year"
        and "Team" inserted after it. Each entry of ``rows_data`` is
        ``[year, team, *cell_texts]`` where ``team`` is the row's first
        ``<th>`` text with ``*`` removed.
    """
    table_rows = table.find_all("tr")
    headers = [th.get_text() for th in table_rows[0].find_all("th")]
    headers[0] = "Year"
    headers.insert(1, "Team")

    rows_data = []
    for row in table_rows[1:]:
        stats = [td.get_text() for td in row.find_all("td")]
        name_cells = row.find_all("th")
        # Rows without data cells (e.g. division sub-headers) or without a
        # name cell are not team records; emitting them would create short,
        # mostly empty rows downstream.
        if not stats or not name_cells:
            continue
        team = name_cells[0].get_text().replace("*", "")
        rows_data.append([year, team, *stats])
    return rows_data, headers

def scrap_advanced(first_year: int = 1981, last_year: int = 2024) -> None:
    """Scrape the first table of each season's "advanced" page into one CSV.

    For every year from ``first_year`` to ``last_year`` (inclusive) the page
    ``NBA_{year}_advanced.html`` is downloaded and the first ``<table>`` on
    it is parsed. Each data row is prefixed with the season year and all rows
    are written to ``../data/raw/{first_year}_{last_year}_advanced.csv``.

    Args:
        first_year: First season to download.
        last_year: Last season to download (inclusive).

    Notes:
        The function sleeps 5 seconds after each request. The column names
        of the last season processed are used for the whole file.
    """
    all_rows = []
    # Stays None when the year range is empty, so an empty CSV is written
    # instead of failing on an unbound name.
    headers = None
    for year in range(first_year, last_year + 1):
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, "lxml")
        table_rows = soup.find("table").find_all("tr")
        headers = [th.get_text() for th in table_rows[0].find_all("th")]
        # The leading <th> column is not part of the <td> cells collected
        # below, so its slot is reused for the season year.
        headers[0] = "Year"
        for row in table_rows[1:]:
            cells = [td.get_text() for td in row.find_all("td")]
            if not cells:
                # Rows made only of <th> cells (e.g. a repeated header row)
                # would otherwise become junk "[year]" records.
                continue
            all_rows.append([year, *cells])
        print("Season", year, "done")
        time.sleep(5)
    advanced = pd.DataFrame(all_rows, columns=headers)
    advanced.to_csv(f"../data/raw/{first_year}_{last_year}_advanced.csv", index=False)
    
def scrap_per_game(first_year: int = 1981, last_year: int = 2024) -> None:
    """Scrape the first table of each season's "per game" page into one CSV.

    For every year from ``first_year`` to ``last_year`` (inclusive) the page
    ``NBA_{year}_per_game.html`` is downloaded and the first ``<table>`` on
    it is parsed. Each data row is prefixed with the season year and all rows
    are written to ``../data/raw/{first_year}_{last_year}_per_game.csv``.

    Args:
        first_year: First season to download.
        last_year: Last season to download (inclusive).

    Notes:
        The function sleeps 5 seconds after each request. The column names
        of the last season processed are used for the whole file.
    """
    all_rows = []
    # Stays None when the year range is empty, so an empty CSV is written
    # instead of failing on an unbound name.
    headers = None
    for year in range(first_year, last_year + 1):
        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, "lxml")
        table_rows = soup.find("table").find_all("tr")
        headers = [th.get_text() for th in table_rows[0].find_all("th")]
        # The leading <th> column is not part of the <td> cells collected
        # below, so its slot is reused for the season year.
        headers[0] = "Year"
        for row in table_rows[1:]:
            cells = [td.get_text() for td in row.find_all("td")]
            if not cells:
                # Rows made only of <th> cells (e.g. a repeated header row)
                # would otherwise become junk "[year]" records.
                continue
            all_rows.append([year, *cells])
        print("Season", year, "done")
        time.sleep(5)
    per_game = pd.DataFrame(all_rows, columns=headers)
    per_game.to_csv(f"../data/raw/{first_year}_{last_year}_per_game.csv", index=False)

In [9]:
# Download the 2006-2024 seasons with each scraper, one after another.
scrap_advanced(first_year=2006, last_year=2024)
scrap_mvps(first_year=2006, last_year=2024)
scrap_standings(first_year=2006, last_year=2024)
scrap_per_game(first_year=2006, last_year=2024)

Season 2006 done
Season 2007 done
Season 2008 done
Season 2009 done
Season 2010 done
Season 2011 done
Season 2012 done
Season 2013 done
Season 2014 done
Season 2015 done
Season 2016 done
Season 2017 done
Season 2018 done
Season 2019 done
Season 2020 done
Season 2021 done
Season 2022 done
Season 2023 done
Season 2024 done
Season 2006 done
Season 2007 done
Season 2008 done
Season 2009 done
Season 2010 done
Season 2011 done
Season 2012 done
Season 2013 done
Season 2014 done
Season 2015 done
Season 2016 done
Season 2017 done
Season 2018 done
Season 2019 done
Season 2020 done
Season 2021 done
Season 2022 done
Season 2023 done
Season 2024 done
Season 2006 done
Season 2007 done
Season 2008 done
Season 2009 done
Season 2010 done
Season 2011 done
Season 2012 done
Season 2013 done
Season 2014 done
Season 2015 done
Season 2016 done
Season 2017 done
Season 2018 done
Season 2019 done
Season 2020 done
Season 2021 done
Season 2022 done
Season 2023 done
Season 2024 done
Season 2006 done
Season 2007 do