In [1]:
import os
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
import requests
from bs4 import BeautifulSoup

In [None]:
!playwright install 

In [2]:
SEASONS = list(range(2016,2023))

In [3]:
SEASONS

[2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [4]:
DATA_DIR="data"
STANDINGS_DIR =os.path.join(DATA_DIR,"standings")
SCORES_DIR=os.path.join(DATA_DIR,"scores")

In [None]:
# Define an asynchronous function to fetch HTML content from a web page
async def get_html(url, selector, sleep=5, retries=3):
    # Initialize the HTML content to None
    html = None
    # Loop through the specified number of retries
    for i in range(1, retries+1):
        # Wait for a specified number of seconds before retrying (the wait time increases with each retry)
        time.sleep(sleep * i)
        try:
            # Create a new instance of Playwright
            async with async_playwright() as p:
                # Launch a new Chromium browser
                browser = await p.chromium.launch()
                # Create a new page in the browser
                page = await browser.new_page()
                # Navigate to the specified URL
                await page.goto(url)
                # Print the title of the page (for debugging purposes)
                print(await page.title())
                # Extract the HTML content using the specified CSS selector
                html = await page.inner_html(selector)
        except PlaywrightTimeout:
            # If a timeout occurs, print an error message and continue to the next iteration of the loop
            print(f"Timeout error on {url}")
            continue
        else:
            # If the HTML content is successfully extracted, break out of the loop and return the HTML content
            break
    return html

In [None]:
season = 2016 
url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-november.html"

In [None]:
url

In [None]:
url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-november.html"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get("href") for a in soup.select("#content .filter a")]
    print(links)
else:
    print(f"Error: {response.status_code}")

In [None]:
standing_pages=[f"https://www.basketball-reference.com{l}" for l in links]

In [None]:
standing_pages

In [None]:
for url in standing_pages :
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.select_one("#all_schedule")
        
    else:
        print(f"Error: {response.status_code}")
    save_path = os.path.join(STANDINGS_DIR,url.split('/')[-1])
    if os.path.exists(save_path):
        continue

    with open(save_path, "w") as f:
        f.write(str(table)) 
            


In [5]:
def fetch_standings_pages(season):
    """
    Fetches and saves the standings pages for the given NBA season.

    Args:
        season (str): the NBA season (in the format "YYYY-ZZ", e.g. "2021-22")

    Returns:
        None
    """

    # Fetch the links to the standings pages for the given NBA season
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-november.html"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        links = [a.get("href") for a in soup.select("#content .filter a")]
    else:
        print(f"Error: {response.status_code}")
        return

    # Fetch and save the HTML content of each standings page
    for link in links:
        url = f"https://www.basketball-reference.com{link}"
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            table = soup.select_one("#all_schedule")
        else:
            print(f"Error: {response.status_code}")
            continue

        save_path = os.path.join(STANDINGS_DIR, link.split("/")[-1])
        if os.path.exists(save_path):
            continue

        with open(save_path, "w", encoding="utf-8") as f:
            f.write(str(table))

    print(f"All standings pages for {season} have been fetched and saved.")

In [None]:
fetch_standings_pages(2014)

In [6]:
for season in SEASONS :
    fetch_standings_pages(season)

All standings pages for 2016 have been fetched and saved.
All standings pages for 2017 have been fetched and saved.
All standings pages for 2018 have been fetched and saved.
Error: 429
Error: 429
Error: 429
All standings pages for 2019 have been fetched and saved.
Error: 429
Error: 429
Error: 429


In [7]:
standings_files=os.listdir(STANDINGS_DIR)

In [8]:
standings_files

['NBA_2016_games-april.html',
 'NBA_2016_games-december.html',
 'NBA_2016_games-february.html',
 'NBA_2016_games-january.html',
 'NBA_2016_games-june.html',
 'NBA_2016_games-march.html',
 'NBA_2016_games-may.html',
 'NBA_2016_games-november.html',
 'NBA_2016_games-october.html',
 'NBA_2017_games-april.html',
 'NBA_2017_games-december.html',
 'NBA_2017_games-february.html',
 'NBA_2017_games-january.html',
 'NBA_2017_games-june.html',
 'NBA_2017_games-march.html',
 'NBA_2017_games-may.html',
 'NBA_2017_games-november.html',
 'NBA_2017_games-october.html',
 'NBA_2018_games-april.html',
 'NBA_2018_games-december.html',
 'NBA_2018_games-february.html',
 'NBA_2018_games-january.html',
 'NBA_2018_games-june.html',
 'NBA_2018_games-march.html',
 'NBA_2018_games-may.html',
 'NBA_2018_games-november.html',
 'NBA_2018_games-october.html',
 'NBA_2019_games-december.html',
 'NBA_2019_games-february.html',
 'NBA_2019_games-january.html',
 'NBA_2019_games-march.html',
 'NBA_2019_games-november.html',

In [9]:
def scrape_game(standings_file):
    with open(standings_file,'r') as f :
        html=f.read()
    soup=BeautifulSoup(html)
    links=soup.find_all('a')
    hrefs=[l.get("href") for l in links]
    box_scores=[l for l in hrefs if l and "boxscore" in l and '.html' in l ]
    box_scores=[f"https://www.basketball-reference.com{l}" for l in box_scores]
    for url in box_scores:
        save_path = os.path.join(SCORES_DIR,url.split("/")[-1])
        if os.path.exists (save_path):
            continue
        html = soup.select_one("#content")
        if not html :
            continue
        with open(save_path,"w+")as f:
            f.write(str(html))

In [10]:
files = [s for s in standings_files if str(season) in s ]
for f in files :
    filepath = os.path.join(STANDINGS_DIR,f)
    scrape_game(filepath)