In [1]:
import os
import asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

import time
import chardet

In [2]:
# Seasons to scrape. Adjust the bounds to cover any seasons in recorded NBA history;
# the upper bound of range() is exclusive.

SEASONS = [*range(2023, 2025)]

In [3]:
# Sanity check: display the seasons that are configured for scraping
SEASONS

[2023, 2024]

In [4]:
# Local output directories for the scraped pages.
#   STANDINGS_DIR         - initial training data used to refine the model
#   STANDINGS_UPDATED_DIR - up-to-date data for future predictions of the 2024 season
#   SCORES_DIR            - individual box score pages

DATA_DIR = "Data"
STANDINGS_DIR = os.path.join(DATA_DIR, "Standings")
STANDINGS_UPDATED_DIR = os.path.join(DATA_DIR, "Updated_Standings")
SCORES_DIR = os.path.join(DATA_DIR, "Scores")

# exist_ok=True keeps this cell safe to re-run
for _output_dir in (STANDINGS_DIR, STANDINGS_UPDATED_DIR, SCORES_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [5]:
# Names of the saved monthly schedule pages, in a stable (sorted) order.
# Only regular ".html" files are kept: os.listdir also returns hidden entries such as
# ".DS_Store" or an ".ipynb_checkpoints" directory, which must not be opened as pages.
standings_files = sorted(
    name
    for name in os.listdir(STANDINGS_UPDATED_DIR)
    if name.endswith(".html")
    and os.path.isfile(os.path.join(STANDINGS_UPDATED_DIR, name))
)


In [6]:
# get HTML function: passing in URL, selector, sleep interval and a number of retries

async def get_html(url, selector, sleep=5, retries=6):
    """Load ``url`` in headless Firefox and return the inner HTML of ``selector``.

    Each attempt navigates to the page and reads the element. When Playwright
    reports a timeout, the function waits ``sleep * attempt`` seconds and tries
    again, up to ``retries`` attempts in total.

    Args:
        url: Address of the page to load.
        selector: Playwright selector of the element whose inner HTML is wanted.
        sleep: Base back-off in seconds; multiplied by the attempt number.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        timed out (or ``retries`` is less than 1).
    """
    html = None
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            page = await browser.new_page()
            for attempt in range(1, retries + 1):
                try:
                    await page.goto(url, timeout=30000)
                    print(await page.title())
                    html = await page.inner_html(selector, timeout=30000)
                except PlaywrightTimeout as e:
                    print(f"Timeout error on {url}: {str(e)}")
                    # Backing off only makes sense when another attempt follows
                    if attempt < retries:
                        await asyncio.sleep(sleep * attempt)
                else:
                    break
        finally:
            # Always release the browser, even if a non-timeout error escapes;
            # closing the browser also closes the pages it opened.
            await browser.close()
    return html

In [7]:
# Example season and the URL of its schedule page, kept for ad-hoc checks
season = 2023

url = "https://www.basketball-reference.com/leagues/NBA_" + str(season) + "_games.html"


In [8]:
# scrape season by season function (will then break this down into box_scores for each game per season)

async def scrape_season(season):
    """Download the monthly schedule pages of one NBA season.

    Reads the month filter on the season's schedule page, then saves the
    ``#all_schedule`` section of every linked month page into
    ``STANDINGS_UPDATED_DIR``. Pages that already exist on disk are skipped,
    so the function can be re-run to resume an interrupted download.

    Args:
        season: Season year used in the schedule URL (e.g. ``2023``).

    Returns:
        None. Pages that could not be fetched are skipped and no file is
        written for them, so a later run will retry them.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await get_html(url, "#content .filter")
    if not html:
        # get_html returns None when every attempt timed out
        print(f"Could not load the month filter for season {season}; skipping")
        return

    soup = BeautifulSoup(html, 'html.parser')
    # .get() tolerates anchors without an href; those are dropped below
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    standings_pages = [
        f"https://www.basketball-reference.com{href}" for href in hrefs if href
    ]

    for page_url in standings_pages:
        save_path = os.path.join(STANDINGS_UPDATED_DIR, page_url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        page_html = await get_html(page_url, "#all_schedule")
        if not page_html:
            # Do not create an empty file: it would be mistaken for a finished download
            continue
        # Explicit encoding so the result does not depend on the platform default
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(page_html)
    

In [9]:
# parsing each individual box score per season 
async def scrape_game(standings_file):
    """Download every box score linked from one saved schedule page.

    Parses ``standings_file`` for links that contain both ``"boxscore"`` and
    ``".html"``, then saves the ``#content`` section of each linked page into
    ``SCORES_DIR``. Box scores that already exist on disk are skipped, so the
    function can be re-run to resume an interrupted download.

    Args:
        standings_file: Path of a saved monthly schedule page.

    Returns:
        None. Box scores that could not be fetched are skipped without writing a file.
    """
    # ISO-8859-1 maps every byte to a character, so decoding cannot fail here
    with open(standings_file, 'r', encoding='ISO-8859-1') as f:
        html = f.read()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
    # is installed, which warns and may differ between machines
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    box_scores = [
        f"https://www.basketball-reference.com{href}"
        for href in hrefs
        if href and "boxscore" in href and ".html" in href
    ]

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#content")
        if not html:
            continue
        # Explicit UTF-8: a platform default such as cp1252 cannot encode every
        # character that may appear in a box score (e.g. some player names)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)
    

In [11]:
# downloading the box scores to drive

for f in standings_files:
    filepath = os.path.join(STANDINGS_UPDATED_DIR, f)
        
    await scrape_game(filepath)

In [132]:
box_scores

['https://www.basketball-reference.com/boxscores/202305010BOS.html',
 'https://www.basketball-reference.com/boxscores/202305010DEN.html',
 'https://www.basketball-reference.com/boxscores/202305020NYK.html',
 'https://www.basketball-reference.com/boxscores/202305020GSW.html',
 'https://www.basketball-reference.com/boxscores/202305030BOS.html',
 'https://www.basketball-reference.com/boxscores/202305040GSW.html',
 'https://www.basketball-reference.com/boxscores/202305050PHI.html',
 'https://www.basketball-reference.com/boxscores/202305050PHO.html',
 'https://www.basketball-reference.com/boxscores/202305060MIA.html',
 'https://www.basketball-reference.com/boxscores/202305060LAL.html',
 'https://www.basketball-reference.com/boxscores/202305070PHI.html',
 'https://www.basketball-reference.com/boxscores/202305070PHO.html',
 'https://www.basketball-reference.com/boxscores/202305080MIA.html',
 'https://www.basketball-reference.com/boxscores/202305080LAL.html',
 'https://www.basketball-reference

In [124]:
#def detect_encoding(file_path):
    #with open(file_path, 'rb') as f:
        #rawdata = f.read()
    #return chardet.detect(rawdata)['encoding']

# Use the function on your file
#encoding = detect_encoding('/Users/spencergreen/NBA-Prediction-Model/Data/Updated_Standings/NBA_2023_games-april.html')
#print(f"Detected encoding: {encoding}")


Detected encoding: ascii


In [131]:
#def print_file_up_to_byte(file_path, byte_position):
    #with open(file_path, 'rb') as file:
        #content = file.read(byte_position)
    #print(content)

#print_file_up_to_byte('/Users/spencergreen/NBA-Prediction-Model/Data/Updated_Standings/NBA_2023_games-june.html', 3131)

b'\n\n<div class="section_heading assoc_schedule has_controls" id="schedule_sh">\n  <span class="section_anchor" id="schedule_link" data-label="June Schedule"></span><h2>June Schedule</h2>    <div class="section_heading_text">\n      <ul>\n      <li class="hasmore"><span>Share &amp; Export</span><div><ul><li><button class="tooltip" tip="Use a customizable report creator that can<br>output HTML, CSV, or a shareable link." type="button" id="share_on_schedule">Modify, Export &amp; Share Table</button></li><li><button class="tooltip" tip="Convert the table below to comma-separated values<br>suitable for use with Excel" type="button">Get as Excel Workbook</button><a id="dlink" style="display: none;"></a></li><li><button class="tooltip" tip="Get a link directly to this table on this page" type="button">Get table as CSV (for Excel)</button></li><li><button class="tooltip" tip="" type="button" id="a_schedule" name="schedule" href="#schedule">Get Link to Table</button></li><li><button class="to