In [1]:
import os
import asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

import time
import chardet

In [2]:
# NBA seasons to scrape. range() excludes its stop value, so this covers 2023
# and 2024; widen the bounds to scrape any other seasons.

SEASONS = [*range(2023, 2025)]

In [3]:
SEASONS

[2023, 2024]

In [4]:
# Local output folders for scraped pages:
#   STANDINGS_DIR          - initial training data used to refine the model
#   STANDINGS_UPDATED_DIR  - up-to-date data for future predictions of the 2024 season
#   SCORES_DIR             - individual box-score pages

DATA_DIR = "Data"
STANDINGS_DIR = os.path.join(DATA_DIR, "Standings")
STANDINGS_UPDATED_DIR = os.path.join(DATA_DIR, "Updated_Standings")
SCORES_DIR = os.path.join(DATA_DIR, "Scores")

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for _output_dir in (STANDINGS_DIR, STANDINGS_UPDATED_DIR, SCORES_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [5]:
# Saved schedule pages to mine for box-score links. Keep only .html entries so
# stray directory contents (e.g. .DS_Store or an .ipynb_checkpoints folder) are
# never handed to scrape_game, and sort so the download order is reproducible
# (os.listdir returns entries in arbitrary order).
standings_files = sorted(
    name for name in os.listdir(STANDINGS_UPDATED_DIR) if name.endswith(".html")
)


In [6]:
# get HTML function: passing in URL, selector, sleep interval and a number of retries

async def get_html(url, selector, sleep=5, retries=6):
    """Load ``url`` in headless Firefox and return the inner HTML of ``selector``.

    Each attempt navigates to the page, prints its title, and reads the inner
    HTML of the first element matching ``selector``. A Playwright timeout on
    either step triggers another attempt after a linearly growing pause of
    ``sleep * attempt`` seconds.

    Args:
        url: Address of the page to load.
        selector: Playwright selector of the element to extract.
        sleep: Base back-off, in seconds, between failed attempts.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        timed out.

    Raises:
        Any non-timeout error raised by Playwright propagates to the caller;
        the browser is closed first.
    """
    html = None
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            page = await browser.new_page()
            for attempt in range(1, retries + 1):
                try:
                    await page.goto(url, timeout=30000)
                    print(await page.title())
                    html = await page.inner_html(selector, timeout=30000)
                except PlaywrightTimeout as e:
                    print(f"Timeout error on {url}: {str(e)}")
                    # Back off only when another attempt will follow; waiting
                    # after the final failure would just delay the caller.
                    if attempt < retries:
                        await asyncio.sleep(sleep * attempt)
                else:
                    break
        finally:
            # Runs on success and on unexpected errors alike, so a failing
            # page never leaves a browser process behind. Closing the browser
            # also closes the pages it opened.
            await browser.close()
    return html

In [7]:
# Scratch values for trying get_html by hand on one season's schedule page.
# scrape_season builds its own URL from its argument and does not read these.
season = 2023

url = (
    "https://www.basketball-reference.com"
    f"/leagues/NBA_{season}_games.html"
)


In [8]:
# scrape season by season function (will then break this down into box_scores for each game per season)

async def scrape_season(season):
    """Save every schedule page linked from a season's games page.

    Loads the season's games page, collects the links inside its
    ``#content .filter`` element, and stores the ``#all_schedule`` HTML of each
    linked page in ``STANDINGS_UPDATED_DIR`` under the link's final path
    segment. Pages that already exist on disk are not downloaded again.

    Args:
        season: Season number interpolated into the ``NBA_{season}_games.html``
            URL.

    Returns:
        None. Results are written to disk.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await get_html(url, "#content .filter")
    if not html:
        # get_html returns None when every attempt timed out; there is nothing
        # to parse, so report it and stop instead of failing inside the parser.
        print(f"No schedule links retrieved for season {season}: {url}")
        return

    soup = BeautifulSoup(html, "html.parser")
    # .get() yields None for anchors without an href instead of raising KeyError.
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    # Use the same www. host as the other URLs in this notebook.
    standings_pages = [
        f"https://www.basketball-reference.com{href}" for href in hrefs if href
    ]

    for url in standings_pages:
        save_path = os.path.join(STANDINGS_UPDATED_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#all_schedule")
        if not html:
            # Skip before opening the file: creating an empty file here would
            # make the exists() check above skip this page on every later run.
            continue
        # Explicit UTF-8 so the saved bytes do not depend on the platform locale.
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)
    

In [9]:
# parsing each individual box score per season 
async def scrape_game(standings_file):
    """Download every box-score page linked from one saved schedule file.

    Reads the saved HTML, keeps the links whose href contains both
    ``boxscore`` and ``.html``, and stores the ``#content`` HTML of each linked
    page in ``SCORES_DIR`` under the link's final path segment. Box scores that
    already exist on disk, or that could not be fetched, are skipped.

    Args:
        standings_file: Path to a saved schedule HTML file.

    Returns:
        None. Results are written to disk.
    """
    # ISO-8859-1 maps every byte to a character, so this read cannot raise a
    # decode error whatever encoding the file was saved with. Non-ASCII text
    # may come out garbled, but only the href values are used below
    # (assumed to be plain ASCII paths - TODO confirm).
    with open(standings_file, "r", encoding="ISO-8859-1") as f:
        html = f.read()

    # Name the parser explicitly, as scrape_season does, so parsing does not
    # depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    box_scores = [
        f"https://www.basketball-reference.com{href}"
        for href in hrefs
        if href and "boxscore" in href and ".html" in href
    ]

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#content")
        if not html:
            # get_html gave up on this page; leave no file so a later run retries it.
            continue
        # Explicit UTF-8: with the platform default, a non-UTF-8 locale can
        # raise UnicodeEncodeError on any character it cannot represent.
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)
    

In [10]:
# downloading the box scores to drive

for f in standings_files:
    filepath = os.path.join(STANDINGS_UPDATED_DIR, f)
        
    await scrape_game(filepath)

Jazz vs Raptors, February 10, 2023 | Basketball-Reference.com
Timberwolves vs Grizzlies, February 10, 2023 | Basketball-Reference.com
Rockets vs Heat, February 10, 2023 | Basketball-Reference.com
Cavaliers vs Pelicans, February 10, 2023 | Basketball-Reference.com
Thunder vs Trail Blazers, February 10, 2023 | Basketball-Reference.com
Mavericks vs Kings, February 10, 2023 | Basketball-Reference.com
Bucks vs Clippers, February 10, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202302110BRK.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202302110BRK.html", waiting until "load"
76ers vs Nets, February 11, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202302110CHO.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202302110CHO.html", waiting until "load"
Nuggets vs Hornets, February 11, 2023 | Basketball-R

Heat vs Bucks, February 24, 2023 | Basketball-Reference.com
Nets vs Bulls, February 24, 2023 | Basketball-Reference.com
Hornets vs Timberwolves, February 24, 2023 | Basketball-Reference.com
Rockets vs Warriors, February 24, 2023 | Basketball-Reference.com
Thunder vs Suns, February 24, 2023 | Basketball-Reference.com
Kings vs Clippers, February 24, 2023 | Basketball-Reference.com
Raptors vs Pistons, February 25, 2023 | Basketball-Reference.com
Heat vs Hornets, February 25, 2023 | Basketball-Reference.com
Pacers vs Magic, February 25, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202302250NYK.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202302250NYK.html", waiting until "load"
Pelicans vs Knicks, February 25, 2023 | Basketball-Reference.com
Nuggets vs Grizzlies, February 25, 2023 | Basketball-Reference.com
Celtics vs 76ers, February 25, 2023 | Basketball-Reference.com
Timeout error on https

Bucks vs Pacers, January 3, 2024 | Basketball-Reference.com
Thunder vs Hawks, January 3, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401030HOU.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401030HOU.html", waiting until "load"
Nets vs Rockets, January 3, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401030MEM.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401030MEM.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202401030MEM.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401030MEM.html", waiting until "load"
Raptors vs Grizzlies, January 3, 2024 | Basketball-Reference.com
Pelicans vs Timberwolves, January 3, 2024 | Basketball-Reference.com
Trail Blazers vs Mavericks, January 3, 2024 | Bas

Pistons vs Nuggets, January 7, 2024 | Basketball-Reference.com
Grizzlies vs Suns, January 7, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401070GSW.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401070GSW.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202401070GSW.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401070GSW.html", waiting until "load"
Raptors vs Warriors, January 7, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401070LAL.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401070LAL.html", waiting until "load"
Clippers vs Lakers, January 7, 2024 | Basketball-Reference.com
Bulls vs Hornets, January 8, 2024 | Basketball-Reference.com
Celtics vs Pacers, January 8, 2024 | Basketball-Ref

Timeout error on https://www.basketball-reference.com/boxscores/202401130BOS.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401130BOS.html", waiting until "load"
Rockets vs Celtics, January 13, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401130ATL.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401130ATL.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202401130ATL.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401130ATL.html", waiting until "load"
Wizards vs Hawks, January 13, 2024 | Basketball-Reference.com
Knicks vs Grizzlies, January 13, 2024 | Basketball-Reference.com
Warriors vs Bucks, January 13, 2024 | Basketball-Reference.com
Magic vs Thunder, January 13, 2024 | Basketball-Reference.com
Pelicans vs Mavericks, January 13, 2024 | Basket

Timeout error on https://www.basketball-reference.com/boxscores/202401200CHI.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401200CHI.html", waiting until "load"
Grizzlies vs Bulls, January 20, 2024 | Basketball-Reference.com
Jazz vs Rockets, January 20, 2024 | Basketball-Reference.com
Thunder vs Timberwolves, January 20, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401210LAC.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401210LAC.html", waiting until "load"
Nets vs Clippers, January 21, 2024 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202401210ORL.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202401210ORL.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202401210ORL.html: Timeout 30000ms exceeded.

Clippers vs Kings, March 3, 2023 | Basketball-Reference.com
Timberwolves vs Lakers, March 3, 2023 | Basketball-Reference.com
Raptors vs Wizards, March 4, 2023 | Basketball-Reference.com
Pistons vs Cavaliers, March 4, 2023 | Basketball-Reference.com
Hawks vs Heat, March 4, 2023 | Basketball-Reference.com
Rockets vs Spurs, March 4, 2023 | Basketball-Reference.com
76ers vs Bucks, March 4, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303040SAC.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303040SAC.html", waiting until "load"
Timberwolves vs Kings, March 4, 2023 | Basketball-Reference.com
Suns vs Mavericks, March 5, 2023 | Basketball-Reference.com
Pacers vs Bulls, March 5, 2023 | Basketball-Reference.com
Warriors vs Lakers, March 5, 2023 | Basketball-Reference.com
Hornets vs Nets, March 5, 2023 | Basketball-Reference.com
Trail Blazers vs Magic, March 5, 2023 | Basketball-Reference.com
S

Raptors vs Lakers, March 10, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303110LAC.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303110LAC.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202303110LAC.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303110LAC.html", waiting until "load"
Knicks vs Clippers, March 11, 2023 | Basketball-Reference.com
Jazz vs Hornets, March 11, 2023 | Basketball-Reference.com
Pacers vs Pistons, March 11, 2023 | Basketball-Reference.com
Heat vs Magic, March 11, 2023 | Basketball-Reference.com
Celtics vs Hawks, March 11, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303110HOU.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303110HOU.html", waiting until "load"
Bull

Pelicans vs Rockets, March 17, 2023 | Basketball-Reference.com
Grizzlies vs Spurs, March 17, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303170POR.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303170POR.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202303170POR.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303170POR.html", waiting until "load"
Celtics vs Trail Blazers, March 17, 2023 | Basketball-Reference.com
Mavericks vs Lakers, March 17, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303180NYK.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303180NYK.html", waiting until "load"
Nuggets vs Knicks, March 18, 2023 | Basketball-Reference.com
Magic vs Clippers, March 18, 2023 | Basketball-

Thunder vs Clippers, March 23, 2023 | Basketball-Reference.com
Pacers vs Celtics, March 24, 2023 | Basketball-Reference.com
Spurs vs Wizards, March 24, 2023 | Basketball-Reference.com
Pistons vs Raptors, March 24, 2023 | Basketball-Reference.com
Rockets vs Grizzlies, March 24, 2023 | Basketball-Reference.com
Hornets vs Mavericks, March 24, 2023 | Basketball-Reference.com
Bucks vs Jazz, March 24, 2023 | Basketball-Reference.com
76ers vs Warriors, March 24, 2023 | Basketball-Reference.com
Bulls vs Trail Blazers, March 24, 2023 | Basketball-Reference.com
Suns vs Kings, March 24, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303240LAL.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303240LAL.html", waiting until "load"
Thunder vs Lakers, March 24, 2023 | Basketball-Reference.com
Pacers vs Hawks, March 25, 2023 | Basketball-Reference.com
Nets vs Heat, March 25, 2023 | Basketball-Reference.c

Jazz vs Spurs, March 29, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303290PHO.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303290PHO.html", waiting until "load"
Timberwolves vs Suns, March 29, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303290POR.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303290POR.html", waiting until "load"
Kings vs Trail Blazers, March 29, 2023 | Basketball-Reference.com
Celtics vs Bucks, March 30, 2023 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202303300DEN.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202303300DEN.html", waiting until "load"
Pelicans vs Nuggets, March 30, 2023 | Basketball-Reference.com
Bulls vs Hornets, March 31, 2023 | Basketball-Referenc

Grizzlies vs Mavericks, October 22, 2022 | Basketball-Reference.com
Thunder vs Nuggets, October 22, 2022 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202210220SAC.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202210220SAC.html", waiting until "load"
Clippers vs Kings, October 22, 2022 | Basketball-Reference.com
Trail Blazers vs Lakers, October 23, 2022 | Basketball-Reference.com
Hornets vs Hawks, October 23, 2022 | Basketball-Reference.com
Wizards vs Cavaliers, October 23, 2022 | Basketball-Reference.com
Jazz vs Pelicans, October 23, 2022 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202210230OKC.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202210230OKC.html", waiting until "load"
Timberwolves vs Thunder, October 23, 2022 | Basketball-Reference.com
Kings vs Warriors, October 23, 2022 | Basketball-Refe

76ers vs Bulls, October 29, 2022 | Basketball-Reference.com
Hawks vs Bucks, October 29, 2022 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202210290DAL.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202210290DAL.html", waiting until "load"
Thunder vs Mavericks, October 29, 2022 | Basketball-Reference.com
Grizzlies vs Jazz, October 29, 2022 | Basketball-Reference.com
Pelicans vs Clippers, October 30, 2022 | Basketball-Reference.com
Wizards vs Celtics, October 30, 2022 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202210300CLE.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202210300CLE.html", waiting until "load"
Timeout error on https://www.basketball-reference.com/boxscores/202210300CLE.html: Timeout 30000ms exceeded.
navigating to "https://www.basketball-reference.com/boxscores/202210300CLE.html", waitin

In [132]:
# NOTE(review): in the code visible in this notebook, `box_scores` is only a
# local variable inside scrape_game. This cell therefore appears to rely on a
# global left over from an earlier interactive session and would raise
# NameError after Restart Kernel -> Run All - confirm, then remove or rebuild it.
box_scores

['https://www.basketball-reference.com/boxscores/202305010BOS.html',
 'https://www.basketball-reference.com/boxscores/202305010DEN.html',
 'https://www.basketball-reference.com/boxscores/202305020NYK.html',
 'https://www.basketball-reference.com/boxscores/202305020GSW.html',
 'https://www.basketball-reference.com/boxscores/202305030BOS.html',
 'https://www.basketball-reference.com/boxscores/202305040GSW.html',
 'https://www.basketball-reference.com/boxscores/202305050PHI.html',
 'https://www.basketball-reference.com/boxscores/202305050PHO.html',
 'https://www.basketball-reference.com/boxscores/202305060MIA.html',
 'https://www.basketball-reference.com/boxscores/202305060LAL.html',
 'https://www.basketball-reference.com/boxscores/202305070PHI.html',
 'https://www.basketball-reference.com/boxscores/202305070PHO.html',
 'https://www.basketball-reference.com/boxscores/202305080MIA.html',
 'https://www.basketball-reference.com/boxscores/202305080LAL.html',
 'https://www.basketball-reference

In [124]:
#def detect_encoding(file_path):
    #with open(file_path, 'rb') as f:
        #rawdata = f.read()
    #return chardet.detect(rawdata)['encoding']

# Use the function on your file
#encoding = detect_encoding('/Users/spencergreen/NBA-Prediction-Model/Data/Updated_Standings/NBA_2023_games-april.html')
#print(f"Detected encoding: {encoding}")


Detected encoding: ascii


In [131]:
#def print_file_up_to_byte(file_path, byte_position):
    #with open(file_path, 'rb') as file:
        #content = file.read(byte_position)
    #print(content)

#print_file_up_to_byte('/Users/spencergreen/NBA-Prediction-Model/Data/Updated_Standings/NBA_2023_games-june.html', 3131)

b'\n\n<div class="section_heading assoc_schedule has_controls" id="schedule_sh">\n  <span class="section_anchor" id="schedule_link" data-label="June Schedule"></span><h2>June Schedule</h2>    <div class="section_heading_text">\n      <ul>\n      <li class="hasmore"><span>Share &amp; Export</span><div><ul><li><button class="tooltip" tip="Use a customizable report creator that can<br>output HTML, CSV, or a shareable link." type="button" id="share_on_schedule">Modify, Export &amp; Share Table</button></li><li><button class="tooltip" tip="Convert the table below to comma-separated values<br>suitable for use with Excel" type="button">Get as Excel Workbook</button><a id="dlink" style="display: none;"></a></li><li><button class="tooltip" tip="Get a link directly to this table on this page" type="button">Get table as CSV (for Excel)</button></li><li><button class="tooltip" tip="" type="button" id="a_schedule" name="schedule" href="#schedule">Get Link to Table</button></li><li><button class="to