In [1]:
import pandas as pd
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
import requests
import asyncio

In [2]:
data_dir = "data"
standings_dir = os.path.join(data_dir, "standings")
score_dir = os.path.join(data_dir, "scores")
#directories where we are going store our data

In [3]:
async def get_html(url, selector, sleep=5):            #async allows to perform non-blocking In/Out operations
    try:
        response = requests.get(url)
        response.raise_for_status()                    #check if the request was successful
        html_content = response.text                   #if the request is successful, it retrieves the HTML content from the response
        await asyncio.sleep(sleep)                     #asynchronously waits for the specified duration using asyncio.sleep

        return html_content
    except requests.exceptions.RequestException as e:
    
        print(f"Error occurred while fetching HTML from {url}: {e}")
        return None                                     #return None if there's an error

In [4]:
seasons = list(range(1994,2024))                      #set the seasons range for our data

In [5]:
async def scrape_season(season):
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    page = requests.get(url)
    print(page.status_code)

    soup = BeautifulSoup(page.text, "html.parser")
    html_divs = soup.find_all('div', class_='filter')  

    for div in html_divs:
        soup2 = BeautifulSoup(str(div), "html.parser")  
        links = soup2.find_all("a")
        hrefs = [l.get('href') for l in links]


        standings_pages = [f"https://www.basketball-reference.com{l}" for l in hrefs]

        
        for url in standings_pages:
            save_path = os.path.join(standings_dir, url.split("/")[-1])
            if os.path.exists(save_path):
                continue

            html = await get_html(url, "#all_schedule")
            with open(save_path, "w+") as f:
                f.write(html)
                


In [6]:
for season in seasons:             #make the loop for each season
    await scrape_season(season)    #ensuring that each scrape_season call is completed before moving on to the next one
#200 shows that connection is good and season is scrapining

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [7]:
standings_files = os.listdir(standings_dir)

In [8]:
async def scrape_game(standings_files):    
    with open(standings_files, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html)
    links = soup.find_all('a')
    href = [l.get('href') for l in links]
    box_scores = [l for l in href if l and "boxscore" in l and ".html" in l]
    box_scores = [f"https://www.basketball-reference.com{l}" for l in box_scores]

    for url in box_scores:
        save_path = os.path.join(score_dir, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#content")
        if not html:
            continue        
        with open(save_path, "w+") as f:
            f.write(html)

In [9]:
standings_files = [s for s in standings_files if ".html" in s]

In [10]:
standings_files = [s for s in standings_files if any(str(year) in s for year in [2008, 2009, 2014, 2017, 1994, 1995])]

In [11]:
for f in standings_files:
    file_path = os.path.join(standings_dir, f)
        
    await scrape_game(file_path)

In [18]:
#i stopped the function above, because it's already running for 24h, so let's check how many file we got

In [13]:
score_dir = "data/scores"

In [14]:
box_score = os.listdir(score_dir)

In [15]:
len(box_score)       #check amount of files

13548

## Parsing data

In [11]:
box_score = [os.path.join(score_dir, f) for f in box_score]   #set the full path and clear some random weird files

In [12]:
def parse_html(box_score):
    with open(box_score) as f:
        html = f.read()

    soup = BeautifulSoup(html)
    [s.decompose() for s in soup.select("tr.over_header")]   #remove the row over the header of our table
    [s.decompose() for s in soup.select("tr.thead")]         #remove the row in the middle of our table
    return soup

In [13]:
box_score = box_score[0]
soup = parse_html(box_score)

In [15]:
def read_line_score(soup):
    line_score = pd.read_html(str(soup), attrs = {"id":"line_score"})[0]    #set the attributes by site inspector
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols
    
    line_score = line_score[["team","total"]]        #we take only team and total, because not always there 4 quarters, sometimes there are could be overtimes
    return line_score

In [153]:
box_score

'data/scores/200804090TOR.html'

In [165]:
soup = BeautifulSoup(response.content, 'html.parser')
dynamic_content = soup.find('table', id="line_score")

In [170]:
soup = BeautifulSoup(requests.get("https://www.basketball-reference.com/boxscores/200804090TOR.html").text)
# first we should find our table object:
table = soup.find('div', id="suppress_all stats_table")
header = []
rows = []
for i, row in enumerate(table.find_all('tr')):
    if i == 0:
        header = [el.text.strip() for el in row.find_all('th')]
    else:
        rows.append([el.text.strip() for el in row.find_all('td')])

AttributeError: 'NoneType' object has no attribute 'find_all'

In [161]:
/html/body/div[2]/div[4]/div[6]/div[1]/div/div[3]/table
#line_score
//*[@id="line_score"]

TypeError: 'NoneType' object is not subscriptable

In [173]:
url ="https://www.basketball-reference.com/boxscores/200804090TOR.html"
response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('div', {'id': 'div_line_score', 'class': 'table_container is_setup'})

if table:
    table_content = table.find('table').text()  # You can also use .text if you do not want HTML tags
    with open('table.html', 'w') as f:
        f.write(table_content)
    print('Table downloaded successfully!')
else:
    print('Table not found on the webpage.')

Table not found on the webpage.


In [126]:
table = soup.find('table', id="line_score")

In [141]:
url = 'https://www.basketball-reference.com/boxscores/200804090TOR.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
dynamic_content = soup.find('table', id='line_score')


In [18]:
line_score = pd.read_html(str(soup))[0]
line_score.head()

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,Charlie Villanueva,46:55,14,25,0.56,7,12,0.583,3,5,...,0,12,12,1,1,0,2,3,38,-18
1,Michael Redd,36:50,6,15,0.4,1,4,0.25,4,5,...,2,1,3,2,0,0,2,2,17,-27
2,Ramon Sessions,35:49,3,12,0.25,0,1,0.0,0,0,...,1,3,4,10,0,0,4,0,6,-12
3,Andrew Bogut,33:17,5,10,0.5,0,0,,0,0,...,5,4,9,2,1,2,2,3,10,-26
4,Desmond Mason,19:44,1,5,0.2,0,0,,0,0,...,1,1,2,3,0,0,2,0,2,-16


In [19]:
soup

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="https://cdn.ssref.net/req/202301312" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAME = '__tcfapiLo