Libraries

In [1]:
import asyncio
import os
import re
import time
from csv import writer, reader
from decimal import *

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
from tqdm import tqdm, trange

Global variables

In [2]:
# Season labels to scrape; each value is substituted into the NBA_{season} part of the
# schedule URL and into the realgm players URL.
SEASONS = list(range(1996, 2023))
# Month pages requested for every season. 'october' was missing from the original list,
# so games played in that month were never collected.
# NOTE(review): seasons with unusual calendars may have no page for some of these months
# (get_html then exhausts its retries and the month is skipped) or may have games in
# months not listed here — confirm against the site.
months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

In [3]:
DATA_DIR = "testing_data_collection"
STANDINGS_DIR = os.path.join(DATA_DIR, "games")
# Root folder that holds Collecting_Data/ and Cleaned_data/. The original machine-specific
# path stays the default; set the ORIGIN_DIR environment variable to run elsewhere.
curr_dir = os.environ.get('ORIGIN_DIR', '/Users/chrisjeff/Desktop/Origin')
#SCORES_DIR = os.path.join(DATA_DIR, "scores")

Function that grabs the HTML matching a given selector from a URL

In [4]:
#get html function
async def get_html(url, selector, sleep=5, retries=10, timeout=0):
  """Return the inner HTML of the element matching ``selector`` on ``url``.

  A fresh headless Firefox is launched for every attempt. Before attempt ``i``
  (1-based) the coroutine waits ``sleep * i`` seconds, so the delay grows
  linearly and the first attempt is delayed as well.

  Args:
    url: Page to load.
    selector: Selector handed to ``page.inner_html``.
    sleep: Base delay in seconds between attempts.
    retries: Maximum number of attempts.
    timeout: Currently unused; navigation and selector timeouts are fixed at
      30 s and 60 s below. Kept so existing callers that pass it keep working.

  Returns:
    The HTML string, or ``None`` if every attempt timed out (or ``retries`` < 1).
  """
  html = None
  for attempt in range(1, retries + 1):
    # asyncio.sleep yields to the event loop; time.sleep would block it
    # (and the notebook kernel) for the whole back-off period.
    await asyncio.sleep(sleep * attempt)

    try:
      async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
          page = await browser.new_page()
          await page.goto(url, timeout=30000) #if can't load page in 30s, timeout error gets thrown
          print(url)
          html = await page.inner_html(selector, timeout=60000) #grab all html with identifier named {selector} within 60s
          print('got html')
        finally:
          # Close the browser explicitly on both success and failure.
          await browser.close()
    except PlaywrightTimeout:
      print(f"Timeout on {url}")
      if attempt == retries: #every allowed attempt has timed out
        print('No games this month')
      continue
    else:
      break
  return html

Grabs html page containing all player names, positions, etc.

In [None]:
for season in SEASONS:
  url = f'https://basketball.realgm.com/nba/players/{season}'
  html = await get_html(url, 'table', timeout=60000)
  print(f'collected data from season {season}')

  soup = BeautifulSoup(html, 'html.parser')

  name_html = soup.find_all('a')
  pos_html = soup.find_all('td', attrs={'data-th': 'Pos'})

  names = []
  positions = []

  for row in name_html:
    names.append(row.text)
  print(len(names))
  print(names)

  for row in pos_html:
    positions.append(row.text)
  print(len(positions))
  print(positions)
'''
  with open(f'{curr_dir}/indv_player_data/season_{season}_players.csv', 'w') as f_object:
    writer_object = writer(f_object)
    assert len(names) == len(positions)
    for i in range(len(names)):
      writer_object.writerow(names[i])
      writer_object.writerow(positions[i])
    f_object.close()
'''

Gets the box-score links for every game in each listed month of each listed season

In [9]:
for season in SEASONS:
    for month in months:
        file_of_links = f'games_from_{month}-{season}.csv'
        if file_of_links in os.listdir(f'{curr_dir}/Collecting_Data/Links_to_games'):
            continue
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html'
        html = await get_html(url, '#schedule') #goes to url and returns all html
        if not html: #if html=none then a timeout occured, multiple times
            continue

        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find_all('a')
        links = [_['href'] for _ in row] #grabs all links on a page
        all_links = [f"https://basketball-reference.com{game}" for game in links]

        indiv_game_boxscores = [] 
        for url in all_links:
            if 'boxscores' in url and url[-5::] == '.html':
                indiv_game_boxscores.append(url)
        print(indiv_game_boxscores) #list of all links to indiv games
        with open(f'{curr_dir}/Collecting_Data/Links_to_games/{file_of_links}', 'w') as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(indiv_game_boxscores)
            f_object.close()


https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
https://www.basketball-reference.com/leagues/NBA_1981_games-june.html
Timeout on https://www.b

Grabs the tables from a given web page

In [5]:
async def get_game_html_data(url):
    """Fetch a game page and return its stats tables.

    Loads the element with id ``content`` via ``get_html`` and returns every
    ``<table>`` whose class is ``sortable stats_table``.

    Returns:
        A list of table tags, or an empty list when ``get_html`` returned no
        HTML (previously ``BeautifulSoup(None)`` raised ``TypeError`` here).
    """
    html = await get_html(url, '#content') #grabs div with id named content
    if not html:
        return []
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all('table', class_='sortable stats_table') #list of html <table>
    

Collect HTML for individual games

In [16]:
#Should be over 32881 game_html files
get_data = f'{curr_dir}/Collecting_Data' #directory of where collected data is put
html_game_dir = f'{get_data}/game_html' #directory of where html of games are put
for file_number, file_name in tqdm(enumerate(os.listdir(f'{get_data}/Links_to_games')), desc='Through Files'):
  print(file_name)
  all_links = []
  with open(f'{get_data}/Links_to_games/{file_name}', 'r') as file_object:
    csv_file = reader(file_object)

    all_links = list(csv_file)[0] #all_links is a list of links to games for a given month

  for single_link in tqdm(all_links, desc='Links in file finished'):
    new_file = single_link[-17::] #sets name of new_file to year/month/day/team of a game
    if new_file in os.listdir(f'{html_game_dir}'):
      continue
    with open(f'{html_game_dir}/{new_file}', 'w') as f_object:
      html = await get_game_html_data(single_link)
      #if not html: #if timeout occurs, file gets deleted
        #os.remove(f'{html_game_dir}{new_file}')
        #continue
      f_object.write(str(html))
      f_object.close()

Through Files: 0it [00:00, ?it/s]

games_from_march-2004.csv






https://basketball-reference.com/boxscores/200403200CHI.html
got html


Links in file finished: 100%|██████████| 224/224 [00:37<00:00,  6.02it/s]
Through Files: 1it [00:37, 37.21s/it]

games_from_march-2010.csv


Links in file finished: 100%|██████████| 235/235 [00:11<00:00, 19.84it/s]
Through Files: 2it [00:49, 22.30s/it]

games_from_may-2007.csv


Links in file finished: 100%|██████████| 41/41 [00:02<00:00, 18.77it/s]
Through Files: 3it [00:51, 13.12s/it]

games_from_may-2013.csv


Links in file finished: 100%|██████████| 41/41 [00:02<00:00, 17.92it/s]
Through Files: 4it [00:53,  8.86s/it]

games_from_november-2022.csv


Links in file finished: 100%|██████████| 225/225 [00:17<00:00, 12.94it/s]
Through Files: 5it [01:11, 11.96s/it]

games_from_april-2007.csv


Links in file finished: 100%|██████████| 177/177 [00:15<00:00, 11.69it/s]
Through Files: 6it [01:26, 13.05s/it]

games_from_april-2013.csv


Links in file finished: 100%|██████████| 165/165 [00:12<00:00, 12.85it/s]
Through Files: 7it [01:39, 12.99s/it]

games_from_june-1999.csv


Links in file finished: 100%|██████████| 12/12 [00:00<00:00, 16.76it/s]
Through Files: 8it [01:39,  9.09s/it]

games_from_june-1998.csv


Links in file finished: 100%|██████████| 6/6 [00:00<00:00, 16.40it/s]
Through Files: 9it [01:40,  6.37s/it]

games_from_april-2012.csv


Links in file finished: 100%|██████████| 225/225 [00:14<00:00, 15.08it/s]
Through Files: 10it [01:55,  9.02s/it]

games_from_april-2006.csv




Functions that will convert str data type into correct, useful data types

In [6]:
def is_home_team(num):
    """Return the team flag as an int (the cleaning loop passes 0 for away, 1 for home)."""
    flag = int(num)
    return flag

def get_term(num):
    """Look up the label at position ``int(num)`` in the module-level ``terms`` list."""
    position = int(num)
    return terms[position]

def min_to_sec(time):        #converts time to an integer
    """Convert a ``'MM:SS'`` playing-time string into whole seconds.

    Returns 0 when ``time`` is ``None``, blank (including a lone non-breaking
    space) or one of the recognised "did not play" status strings.
    The parameter name shadows the ``time`` module inside this function only.
    """
    if time is None:
        return 0

    text = time.strip()  # str.strip() also removes '\xa0'
    # NOTE(review): 'Not With Team' and 'Player Suspended' were added as further
    # non-playing statuses — confirm they match the scraped pages.
    if text in ('', 'Did Not Play', 'Did Not Dress', 'Not With Team', 'Player Suspended'):
        return 0

    time_list = text.split(':')
    return int(time_list[0]) * 60 + int(time_list[1])

def int_conv(num):           #converts number to int
    """Return ``int(num)``, or ``None`` when ``num`` is the empty string."""
    return None if num == '' else int(num)

def float_conv(num):       #converts number to float to nearest 1000th place, rounding down
    """Return ``num`` as a float truncated to three decimals, or ``None`` for ``''``."""
    if num != '':
        thousandth = Decimal('.001')
        truncated = Decimal(str(num)).quantize(thousandth, rounding=ROUND_DOWN)
        return float(truncated)

    return None

def plus_minus_conv(val):
    """Convert a plus/minus string such as ``'+5'``, ``'-12'`` or ``'0'`` to an int.

    Returns ``None`` for the empty string. ``int()`` already understands a
    leading ``+`` or ``-``, so unsigned values like ``'7'`` are parsed as well;
    the previous version always dropped the first character, turning ``'12'``
    into 2 and raising on single-digit unsigned values.
    """
    if val == '':
        return None

    return int(val)


# One converter per output column, in the same order as `headers`:
# four identifying columns, three (made, attempted, percentage) groups,
# nine integer counting stats, then plus/minus.
converters = (
    [is_home_team, get_term, str, min_to_sec]
    + [int_conv, int_conv, float_conv] * 3
    + [int_conv] * 9
    + [plus_minus_conv]
)


# Folder holding the saved game HTML (the cleaning loop joins it onto curr_dir when opening files).
directory = 'Collecting_Data/game_html'
# Labels returned by get_term, indexed by term position.
terms = ['Q1', 'Q2', 'H1', 'Q3', 'Q4', 'H2']
# Header row written at the top of every cleaned CSV.
headers = (
    ['Home Team', 'Term', 'Name', 'Time Played (sec)']
    + ['FG', 'FGA', 'FG%']
    + ['3P', '3PA', '3P%']
    + ['FT', 'FTA', 'FT%']
    + ['ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    + ['+/-']
)

Writes stats for every player in every game, separated by quarter, to a .csv format with correct data types

In [13]:
# Turns every saved game HTML file into Cleaned_data/cleaned_data_<game>.csv with one row
# per (team, player, term), converted to proper data types by `converters`.
# Files whose cleaned CSV already exists are skipped, so the cell can be re-run to resume.
html_dir = f'{curr_dir}/{directory}'      # same folder for listing and for opening files
cleaned_dir = f'{curr_dir}/Cleaned_data'
regex = r'box-[A-Z]{3}-q1-basic' #expression used to search for specific tables

for filename in tqdm(os.listdir(html_dir), desc='files cleaned'):
    print(filename)
    if not filename.endswith('.html'):
        # Ignore stray non-HTML entries in the folder.
        continue

    new_file = f'cleaned_data_{filename[:-5]}.csv'
    cleaned_path = f'{cleaned_dir}/{new_file}'
    # The old check looked for 'cleaned_data_<name>.html.csv', which never exists,
    # so every file was reprocessed on every run.
    if os.path.exists(cleaned_path):
        continue

    # Context manager so the input file handle is closed after parsing.
    with open(f'{html_dir}/{filename}') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    #creates a list of stats per quarter for each team
    tables = soup.find_all('table', class_='sortable stats_table')

    found_regex = [] # contains the id of q1 tables to get the amount of players
    for table in tables: #grabs q1 table for each team to identify number of players
        table_id = table.get('id', '')
        if re.search(regex, table_id) and table_id not in found_regex:
            found_regex.append(table_id)

    if len(found_regex) < 2:
        # Empty or incomplete download: there is no Q1 table for both teams.
        print(f'Skipping {filename}: found {len(found_regex)} Q1 tables, expected 2')
        continue

    tables_per_team = len(tables) // 2

    #will contain all data for every player, one list per (team, player, term)
    player_data = []

    #team_num == 0 is away team, team_num == 1 is home team
    for team_num in trange(2, desc='Teams finished'):
        #grab number of players currently on the team
        # The Q1 table carries four non-player <tr> rows, hence the -4
        # (NOTE(review): presumably two header rows, the mid-table header and the
        # totals row — confirm).
        amount_of_players = len(soup.find('table', attrs={'id': found_regex[team_num]}).find_all('tr')) - 4

        # Player rows of the six term tables, looked up once per team instead of
        # once per player and term.
        # NOTE(review): assumes the first six tables of each team's half are
        # Q1, Q2, H1, Q3, Q4, H2 in that order — confirm against the saved HTML.
        term_rows = [
            tables[index + team_num * tables_per_team].find('tbody').find_all('tr', attrs={'class': None})
            for index in range(6)
        ]

        for player_number in range(amount_of_players): #loops through every player on the team
            for index in range(6): #gets all stats for every term for that player
                quarter_stats = term_rows[index][player_number]
                # Kept as a list (not a comma-joined string) so a comma inside a
                # cell cannot shift the columns.
                player_data.append([team_num, index] + [e.text for e in quarter_stats.children])

    cleaned_rows = []
    for row in player_data:  #converts stats to correct type of data
        if len(row) > 4:
            # zip stops at the shorter sequence, so a row with fewer stat columns
            # than converters no longer raises IndexError.
            cleaned_rows.append([func(value) for func, value in zip(converters, row)])
        else:
            # Row without stats (e.g. a did-not-play note): keep team, term and name only.
            cleaned_rows.append([is_home_team(row[0]), get_term(row[1]), row[2], None])

    # Single write of header + rows, done only after every row converted successfully,
    # instead of reopening the file in append mode for each row.
    with open(cleaned_path, 'w', newline='') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(headers)
        writer_object.writerows(cleaned_rows)

201604180GSW.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  4.11it/s]
files cleaned:   3%|▎         | 1021/32882 [11:48<7:57:58,  1.11it/s]

201012030TOR.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 10.16it/s]
files cleaned:   3%|▎         | 1022/32882 [11:49<8:13:58,  1.07it/s]

201301280BRK.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.08it/s]
files cleaned:   3%|▎         | 1023/32882 [11:49<7:34:21,  1.17it/s]

201803040ATL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.18it/s]
files cleaned:   3%|▎         | 1024/32882 [11:50<6:52:57,  1.29it/s]

202001240NYK.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.00it/s]
files cleaned:   3%|▎         | 1025/32882 [11:51<6:50:39,  1.29it/s]

201903130MIA.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  6.98it/s]
files cleaned:   3%|▎         | 1026/32882 [11:51<6:37:23,  1.34it/s]

200002210POR.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.18it/s]
files cleaned:   3%|▎         | 1027/32882 [11:52<5:56:57,  1.49it/s]

201612140HOU.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.46it/s]
files cleaned:   3%|▎         | 1028/32882 [11:53<6:06:45,  1.45it/s]

201611270PHO.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  4.20it/s]
files cleaned:   3%|▎         | 1029/32882 [11:54<6:31:05,  1.36it/s]

199802030GSW.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.31it/s]
files cleaned:   3%|▎         | 1030/32882 [11:54<6:04:58,  1.45it/s]

201911260DEN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.67it/s]
files cleaned:   3%|▎         | 1031/32882 [11:55<6:00:40,  1.47it/s]

199612300VAN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.02it/s]
files cleaned:   3%|▎         | 1032/32882 [11:55<5:24:47,  1.63it/s]

200704040DET.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.32it/s]
files cleaned:   3%|▎         | 1033/32882 [11:56<5:14:44,  1.69it/s]

201203100MIN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.76it/s]
files cleaned:   3%|▎         | 1034/32882 [11:56<5:12:00,  1.70it/s]

200002170MIL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 10.51it/s]
files cleaned:   3%|▎         | 1035/32882 [11:57<5:23:57,  1.64it/s]

200912140LAC.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 10.62it/s]
files cleaned:   3%|▎         | 1036/32882 [11:58<5:03:50,  1.75it/s]

201201010DEN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  4.82it/s]
files cleaned:   3%|▎         | 1037/32882 [11:58<6:00:00,  1.47it/s]

200111150HOU.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.00it/s]
files cleaned:   3%|▎         | 1038/32882 [11:59<5:58:32,  1.48it/s]

199801290VAN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 10.50it/s]
files cleaned:   3%|▎         | 1039/32882 [12:00<5:29:28,  1.61it/s]

200212080SAS.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.89it/s]
files cleaned:   3%|▎         | 1040/32882 [12:00<5:04:37,  1.74it/s]

200203090DEN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.46it/s]
files cleaned:   3%|▎         | 1041/32882 [12:01<5:20:36,  1.66it/s]

199706060UTA.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.36it/s]
files cleaned:   3%|▎         | 1042/32882 [12:01<5:16:56,  1.67it/s]

200711290LAL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.12it/s]
files cleaned:   3%|▎         | 1043/32882 [12:02<4:55:30,  1.80it/s]

199711160SEA.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.56it/s]
files cleaned:   3%|▎         | 1044/32882 [12:02<5:07:04,  1.73it/s]

200803050HOU.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  3.56it/s]
files cleaned:   3%|▎         | 1045/32882 [12:03<5:59:33,  1.48it/s]

201202280MIL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.97it/s]
files cleaned:   3%|▎         | 1046/32882 [12:04<5:36:31,  1.58it/s]

201204190NOH.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.09it/s]
files cleaned:   3%|▎         | 1047/32882 [12:05<5:43:26,  1.54it/s]

201701160MIL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.29it/s]
files cleaned:   3%|▎         | 1048/32882 [12:05<5:25:15,  1.63it/s]

200501120GSW.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.68it/s]
files cleaned:   3%|▎         | 1049/32882 [12:06<5:01:11,  1.76it/s]

199702230HOU.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 12.64it/s]
files cleaned:   3%|▎         | 1050/32882 [12:06<5:04:53,  1.74it/s]

201701140WAS.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.50it/s]
files cleaned:   3%|▎         | 1051/32882 [12:07<4:55:03,  1.80it/s]

202301130DET.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.24it/s]
files cleaned:   3%|▎         | 1052/32882 [12:07<4:42:35,  1.88it/s]

199701100PHO.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  2.78it/s]
files cleaned:   3%|▎         | 1053/32882 [12:08<6:25:11,  1.38it/s]

201012110DAL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 10.53it/s]
files cleaned:   3%|▎         | 1054/32882 [12:09<5:52:58,  1.50it/s]

201312180PHO.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.72it/s]
files cleaned:   3%|▎         | 1055/32882 [12:09<5:39:13,  1.56it/s]

199612200CLE.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.47it/s]
files cleaned:   3%|▎         | 1056/32882 [12:10<5:37:02,  1.57it/s]

200502010DEN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.42it/s]
files cleaned:   3%|▎         | 1057/32882 [12:11<5:21:25,  1.65it/s]

201202060NYK.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.98it/s]
files cleaned:   3%|▎         | 1058/32882 [12:11<5:03:12,  1.75it/s]

201011020MIA.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.14it/s]
files cleaned:   3%|▎         | 1059/32882 [12:12<5:20:42,  1.65it/s]

200702270MIN.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  9.45it/s]
files cleaned:   3%|▎         | 1060/32882 [12:12<5:02:04,  1.76it/s]

200501280MIL.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  3.31it/s]
files cleaned:   3%|▎         | 1061/32882 [12:13<6:04:15,  1.46it/s]

200311300SAC.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  7.10it/s]
files cleaned:   3%|▎         | 1062/32882 [12:14<5:48:34,  1.52it/s]

201712180IND.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.80it/s]
files cleaned:   3%|▎         | 1063/32882 [12:14<5:48:43,  1.52it/s]

201202120GSW.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.09it/s]
files cleaned:   3%|▎         | 1064/32882 [12:15<5:21:34,  1.65it/s]

202112080HOU.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 10.65it/s]
files cleaned:   3%|▎         | 1065/32882 [12:15<5:03:08,  1.75it/s]

200502280CLE.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00, 11.31it/s]
files cleaned:   3%|▎         | 1066/32882 [12:16<5:23:38,  1.64it/s]

199705090MIA.html


Teams finished: 100%|██████████| 2/2 [00:00<00:00,  8.54it/s]
files cleaned:   3%|▎         | 1067/32882 [12:17<5:51:42,  1.51it/s]

199803010MIN.html


files cleaned:   3%|▎         | 1067/32882 [12:18<6:06:45,  1.45it/s]


KeyboardInterrupt: 