Libraries

In [1]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
from zipfile import ZipFile
import time
from csv import writer, reader
from decimal import *
import pandas as pd

Global variables

In [2]:
SEASONS = list(range(1997, 2024))
months = ['november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

In [3]:
DATA_DIR = "testing_data_collection"
STANDINGS_DIR = os.path.join(DATA_DIR, "games")
curr_dir = '/Users/chrisjeff/Desktop/Origin'
#SCORES_DIR = os.path.join(DATA_DIR, "scores")

Function that grabs specified html content from a url

In [4]:
#get html function
async def get_html(url, selector, sleep=5, retries=5, timeout=0):
  html = None
  for i in range(1, retries+1):
    time.sleep(sleep * i)

    try:
      async with async_playwright() as p:
        browser = await p.firefox.launch()
        page = await browser.new_page()
        await page.goto(url, timeout=30000) #if can't load page in 30s, timeout error gets thrown
        print(url)
        html = await page.inner_html(selector,timeout=60000) #grab all html with identifier named {selector} within 60s
        print('got html')
    except PlaywrightTimeout:
      print(f"Timeout on {url}")
      if i == retries: #timesout for 5th time
        print('No games this month')
      continue
    else:
      break
  return html

Gets links for stats to every game between a range of months, between a range of years

In [None]:
for season in SEASONS:
    for month in months:
        file_of_links = f'games_from_{month}-{season}.csv'
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html'
        html = await get_html(url, '#schedule') #goes to url and returns all html
        if not html: #if html=none then a timeout occured, multiple times
            continue

        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find_all('a')
        links = [_['href'] for _ in row] #grabs all links on a page
        all_links = [f"https://basketball-reference.com{game}" for game in links]

        indiv_game_boxscores = []
        for url in all_links:
            if 'boxscores' in url and url[-5::] == '.html':
                indiv_game_boxscores.append(url)
        print(indiv_game_boxscores) #list of all links to indiv games
        with open(f'{curr_dir}Collecting_Data/Links_to_games/{file_of_links}', 'w') as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(indiv_game_boxscores)
            f_object.close()
            

Grabs the tables from a given web page

In [5]:
async def get_game_html_data(url):
    html = await get_html(url, '#content') #grabs div with id named content 
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table', class_='sortable stats_table') #tables is a list of html <table>
    return tables
    

Zips

Check if a game is in hte zipped archive

In [None]:
def check_if_in_arch(game_path) -> bool:
  with ZipFile(f'{curr_dir}/arch.zip') as zip:
    game_path = game_path[1:]
    if game_path in zip.namelist():
      return True
    zip.close()
  return False

In [131]:
print(check_if_in_arch(f'{curr_dir}/200403010BOS.zip'))
ZipFile('arch.zip').namelist()


True


['Users/chrisjeff/Desktop/Origin/200403010BOS.zip',
 'Users/chrisjeff/Desktop/Origin/200403010WAS.zip']

In [133]:
games = ['200403010BOS', '200403010WAS']

if os.path.isfile(f'{curr_dir}/{games[0]}'):
  os.remove(f'{curr_dir}/{games[0]}')
if os.path.isfile(f'{curr_dir}/{games[1]}'):
  os.remove(f'{curr_dir}/{games[1]}')
if os.path.isfile(f'{curr_dir}/arch.zip'):
  os.remove(f'{curr_dir}/arch.zip')

for game_num, game in enumerate(games): #go through every game
  with ZipFile(f'{curr_dir}/{game}.zip','w') as zip: #create a zip file for a game
    zip.write(f'{curr_dir}/Collecting_Data/game_html/{game}.html')
    zip.close()
  with ZipFile(f'{curr_dir}/arch.zip', 'a') as zip: #append zip file to zipped archive
    zip.write(f'{curr_dir}/{game}.zip')
    zip.close()
  os.remove(f'{curr_dir}/{game}.zip')

with ZipFile('arch.zip', 'r') as zip: #prints path of every file in archive
  print(zip.namelist())
  for file in zip.namelist(): #grabs path of every indv file in archive
    print(file)
    if not check_if_in_arch(f'/{file}'): #if file is in archive, then skip
      filename = file[-16::]
      with ZipFile(file, 'r') as temp_:
        temp_.extract(file)
        
        temp_.close()
  zip.close()

['Users/chrisjeff/Desktop/Origin/200403010BOS.zip', 'Users/chrisjeff/Desktop/Origin/200403010WAS.zip']
Users/chrisjeff/Desktop/Origin/200403010BOS.zip
Users/chrisjeff/Desktop/Origin/200403010WAS.zip


Collect HTML for indv games

In [7]:
#Should be over 32881 game_html files
get_data = f'{curr_dir}/Collecting_Data' #directory of where collected data is put
html_game_dir = f'{get_data}/game_html' #directory of where html of games are put
for file_number, file_name in enumerate(os.listdir(f'{get_data}/Links_to_games')):
  print(file_name)
  all_links = []
  with open(f'{get_data}/Links_to_games/{file_name}', 'r') as file_object:
    csv_file = reader(file_object)

    all_links = list(csv_file)[0] #all_links is a list of links to games for a given month

  for single_link in all_links:
    new_file = single_link[-17::] #sets name of new_file to year/month/day/team of a game
    if new_file in os.listdir(f'{html_game_dir}'):
      continue
    print(f'visiting {single_link}')
    with open(f'{html_game_dir}/{new_file}', 'w') as f_object:
      html = await get_game_html_data(single_link)
      #if not html: #if timeout occurs, file gets deleted
        #os.remove(f'{html_game_dir}{new_file}')
        #continue
      f_object.write(str(html))
      print('finished_writing')
      f_object.close()
  print(f'completed {file_number+1} / 208 files')

games_from_march-2004.csv
completed 1 / 208 files
games_from_march-2010.csv
completed 2 / 208 files
games_from_may-2007.csv
completed 3 / 208 files
games_from_may-2013.csv
completed 4 / 208 files
games_from_november-2022.csv
completed 5 / 208 files
games_from_april-2007.csv
completed 6 / 208 files
games_from_april-2013.csv
completed 7 / 208 files
games_from_june-1999.csv
completed 8 / 208 files
games_from_june-1998.csv
completed 9 / 208 files
games_from_april-2012.csv
completed 10 / 208 files
games_from_april-2006.csv
completed 11 / 208 files
games_from_november-2023.csv
completed 12 / 208 files
games_from_may-2012.csv
completed 13 / 208 files
games_from_may-2006.csv
completed 14 / 208 files
games_from_march-2011.csv
completed 15 / 208 files
games_from_march-2005.csv
completed 16 / 208 files
games_from_march-2013.csv
completed 17 / 208 files
games_from_march-2007.csv
completed 18 / 208 files
games_from_january-1998.csv
completed 19 / 208 files
games_from_may-2010.csv
completed 20 / 208

TypeError: object of type 'NoneType' has no len()

Functions that will convert str data type into correct, useful data types

In [6]:
def min_to_sec(time):        #converts time to an integer
    if time == 'Did Not Play' or time == 'Did Not Dress' or time == None or time == '\xa0':
        return 0
    
    time_list = time.split(':')
    return int(time_list[0]) * 60 + int(time_list[1])

def int_conv(num):           #converts number to int
    if num == '':
        return None
    
    return int(num)

def float_conv(num):       #converts number to float to nearest 1000th place, rounding down
    if num == '':
        return None
    
    return float(Decimal(str(num)).quantize(Decimal('.001'), rounding=ROUND_DOWN))

def plus_minus_conv(val):
    if val == '':
        return None
    if val == '0':
        return 0
    
    value_symbol = val[0]
    num = int(val[1:])
    if value_symbol == '-':
        return -num
    else:
        return num


converters = [str, min_to_sec, int_conv, int_conv, float_conv, int_conv, int_conv, float_conv,
              int_conv, int_conv, float_conv, int_conv, int_conv, int_conv, int_conv, int_conv,
              int_conv, int_conv, int_conv, int_conv, plus_minus_conv]


directory = 'Collecting_Data/game_html'
useful_away_tables = {3: 'Q1', 4: 'Q2', 5: 'H1', 6: 'Q3', 7: 'Q4', 8: 'H2'}
useful_home_tables = {11: 'Q1', 12: 'Q2', 13: 'H1', 14: 'Q3', 15: 'Q4', 16: 'H2'}
headers = ['Starters', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
           'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-']

Don't use next cell

In [None]:
for filename in os.listdir(directory):
    print(filename)

    soup = BeautifulSoup(open(f'/Users/chrisjeff/Desktop/Senior-Project/Senior_project/Temp_Data/{filename}'), 'html.parser')
    tables = soup.find_all('table', class_='stats_table')

    player_data = []
    amount_of_players = 0
    
    for _, q in enumerate(useful_away_tables):
        #for head in tables[q].find('thead').find_all('tr', attrs={'class': None}):
        #    headers.append(head.find('th').text)
        for row in tables[q].find('tbody').find_all('tr', attrs={'class': None}):
            player_name = row.find('th').text
            #names.append(player_name)
            data_row = ','.join(e.text for e in row.children)
            player_data.append(data_row)
                
    quarter_index = 0
    amount_of_players = len(player_data) // 6

    with open('cleaned_data.csv', 'a') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(headers)
        f_object.close()
        
    for n in player_data:
        if quarter_index % amount_of_players == 0:
            print('end of term')
        quarter_index += 1

        row = n.split(',')
        results = []
        if len(row) > 2:
            for i, func in enumerate(converters):
                converted_val = func(row[i])
                results.append(converted_val)
        else:
            row[1] = None
            results.append(row[0])
            results.append(row[1])

        print(results)
        with open('cleaned_data.csv', 'a') as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(results)
            f_object.close()


#0, 2, 5, 9, 14, 20   :   Indices of tables that matter
#Get player info for each table

Writes stats for every player in every game, separated by quarter, to a .csv format with correct data types

In [45]:
for file_number, filename in enumerate(os.listdir(directory)):
    new_file = f'cleaned_data_game_{file_number+1}.csv'

    if os.path.isfile(f'{curr_dir}/Cleaned_data/{new_file}'):
        os.remove(f'{curr_dir}/Cleaned_data/{new_file}')
    print(filename)

    soup = BeautifulSoup(open(f'{curr_dir}/{directory}/{filename}'), 'html.parser')
    tables = soup.find_all('table', class_='sortable stats_table')

    #print(len(tables))

    amount_of_players = len(soup.find('tbody').find_all('tr', attrs={'class': None}))
    player_data = []
    
    #teamo_num == 0 is away team, team_num == 1 is home team
    for team_num in range(2):
        for player_number in range(len(tables[team_num*6])):
            for index in range((len(tables))//2):
                quarter_stats = tables[index + (team_num*6)].find('tbody').find_all('tr', attrs={'class': None})[player_number]
                #print(quarter_stats)
                data_row = ','.join(e.text for e in quarter_stats.children)
                player_data.append(data_row)


    #print(player_data)

    
    with open(f'{curr_dir}/Cleaned_data/{new_file}', 'w') as f_object: #Write headers to every file needed
        writer_object = writer(f_object)
        writer_object.writerow(headers)
        f_object.close()

    
    quarter_index = 0

    for i, n in enumerate(player_data):  #separates stats for each player from every term
        if (quarter_index) % ((len(tables) // 2)) == 0:
            print('end of player stats')
        quarter_index += 1
    
        results = []
        row = n.split(',')
        if len(row) > 2:
            for i, func in enumerate(converters): #Have player_data get converted from str to appropriate data type
                converted_val = func(row[i])
                results.append(converted_val)
        else:
            row[1] = None #Data cell is empty (hasn't played)
            results.append(row[0])
            results.append(row[1])

        print(results)
        with open(f'{curr_dir}/Cleaned_data/{new_file}', 'a') as f_object: #writes player data to new file
            writer_object = writer(f_object)
            writer_object.writerow(results)
            f_object.close()
    


#0, 2, 5, 9, 14, 20   :   Indices of tables that matter
#Get player info for each tabl

200303300SEA.html
12
end of player stats
['Kobe Bryant', 720, 1, 2, 0.5, 1, 1, 1.0, 0, 0, None, 0, 0, 0, 2, 3, 0, 0, 1, 3, -1]
['Kobe Bryant', 592, 0, 3, 0.0, 0, 1, 0.0, 0, 0, None, 0, 1, 1, 1, 0, 0, 3, 2, 0, -18]
['Kobe Bryant', 1312, 1, 5, 0.2, 1, 2, 0.5, 0, 0, None, 0, 1, 1, 3, 3, 0, 3, 3, 3, -19]
['Kobe Bryant', 720, 4, 7, 0.571, 3, 5, 0.6, 0, 0, None, 1, 0, 1, 2, 0, 0, 0, 1, 11, -1]
['Kobe Bryant', 193, 0, 1, 0.0, 0, 1, 0.0, 0, 0, None, 0, 0, 0, 0, 0, 0, 1, 0, 0, -1]
['Kobe Bryant', 913, 4, 8, 0.5, 3, 6, 0.5, 0, 0, None, 1, 0, 1, 2, 0, 0, 1, 1, 11, -2]
end of player stats
["Shaquille O'Neal", 707, 6, 6, 1.0, 0, 0, None, 1, 1, 1.0, 0, 0, 0, 0, 1, 0, 0, 0, 13, -1]
["Shaquille O'Neal", 495, 0, 1, 0.0, 0, 0, None, 8, 12, 0.667, 1, 1, 2, 0, 0, 1, 1, 2, 8, -14]
["Shaquille O'Neal", 1202, 6, 7, 0.857, 0, 0, None, 9, 13, 0.692, 1, 1, 2, 0, 1, 1, 1, 2, 21, -15]
["Shaquille O'Neal", 558, 4, 4, 1.0, 0, 0, None, 2, 3, 0.667, 0, 3, 3, 0, 0, 0, 2, 0, 10, -4]
["Shaquille O'Neal", 175, 1, 1, 1.0,

KeyboardInterrupt: 