Libraries

In [1]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import re
import time
from csv import writer, reader
from decimal import *
import pandas as pd
from tqdm import tqdm, trange

Global variables

In [2]:
SEASONS = list(range(1996, 2023))
months = ['november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

In [3]:
DATA_DIR = "testing_data_collection"
STANDINGS_DIR = os.path.join(DATA_DIR, "games")
curr_dir = '/Users/chrisjeff/Desktop/Origin'
#SCORES_DIR = os.path.join(DATA_DIR, "scores")

Function that grabs specified html content from a url

In [4]:
#get html function
async def get_html(url, selector, sleep=5, retries=10, timeout=0):
  html = None
  for i in range(1, retries+1):
    time.sleep(sleep * i)

    try:
      async with async_playwright() as p:
        browser = await p.firefox.launch()
        page = await browser.new_page()
        await page.goto(url, timeout=30000) #if can't load page in 30s, timeout error gets thrown
        print(url)
        html = await page.inner_html(selector,timeout=60000) #grab all html with identifier named {selector} within 60s
        print('got html')
    except PlaywrightTimeout:
      print(f"Timeout on {url}")
      if i == retries: #timesout for 5th time
        print('No games this month')
      continue
    else:
      break
  return html

Grabs html page containing all player names, positions, etc.

In [None]:
for season in SEASONS:
  url = f'https://basketball.realgm.com/nba/players/{season}'
  html = await get_html(url, 'table', timeout=60000)
  print(f'collected data from season {season}')

  soup = BeautifulSoup(html, 'html.parser')

  name_html = soup.find_all('a')
  pos_html = soup.find_all('td', attrs={'data-th': 'Pos'})

  names = []
  positions = []

  for row in name_html:
    names.append(row.text)
  print(len(names))
  print(names)

  for row in pos_html:
    positions.append(row.text)
  print(len(positions))
  print(positions)
'''
  with open(f'{curr_dir}/indv_player_data/season_{season}_players.csv', 'w') as f_object:
    writer_object = writer(f_object)
    assert len(names) == len(positions)
    for i in range(len(names)):
      writer_object.writerow(names[i])
      writer_object.writerow(positions[i])
    f_object.close()
'''

Gets links for stats to every game between a range of months, between a range of years

In [None]:
for season in SEASONS:
    for month in months:
        file_of_links = f'games_from_{month}-{season}.csv'
        if file_of_links in os.listdir(f'{curr_dir}/Collecting_Data/Links_to_games'):
            continue
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html'
        html = await get_html(url, '#schedule') #goes to url and returns all html
        if not html: #if html=none then a timeout occured, multiple times
            continue

        soup = BeautifulSoup(html, 'html.parser')
        row = soup.find_all('a')
        links = [_['href'] for _ in row] #grabs all links on a page
        all_links = [f"https://basketball-reference.com{game}" for game in links]

        indiv_game_boxscores = [] 
        for url in all_links:
            if 'boxscores' in url and url[-5::] == '.html':
                indiv_game_boxscores.append(url)
        print(indiv_game_boxscores) #list of all links to indiv games
        with open(f'{curr_dir}/Collecting_Data/Links_to_games/{file_of_links}', 'w') as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(indiv_game_boxscores)
            f_object.close()


Grabs the tables from a given web page

In [5]:
async def get_game_html_data(url):
    html = await get_html(url, '#content') #grabs div with id named content 
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table', class_='sortable stats_table') #tables is a list of html <table>
    return tables
    

Collect HTML for indv games

In [None]:
#Should be over 32881 game_html files
get_data = f'{curr_dir}/Collecting_Data' #directory of where collected data is put
html_game_dir = f'{get_data}/game_html' #directory of where html of games are put
for file_name in tqdm(os.listdir(f'{get_data}/Links_to_games'), desc='Through Files'):
  print(file_name)
  all_links = []
  with open(f'{get_data}/Links_to_games/{file_name}', 'r') as file_object:
    csv_file = reader(file_object)

    all_links = list(csv_file)[0] #all_links is a list of links to games for a given month

  for single_link in tqdm(all_links, desc='Links in file finished'):
    new_file = single_link[-17::] #sets name of new_file to year/month/day/team of a game
    if new_file in os.listdir(f'{html_game_dir}'):
      continue
    with open(f'{html_game_dir}/{new_file}', 'w') as f_object:
      html = await get_game_html_data(single_link)
      #if not html: #if timeout occurs, file gets deleted
        #os.remove(f'{html_game_dir}{new_file}')
        #continue
      f_object.write(str(html))
      f_object.close()

Functions that will convert str data type into correct, useful data types

In [6]:
def is_home_team(num):
    return int(num)

def get_term(num):
    return terms[int(num)]

def min_to_sec(time):        #converts time to an integer
    if time == 'Did Not Play' or time == 'Did Not Dress' or time is None or time == '\xa0' or time == '':
        return 0
    
    time_list = time.split(':')
    return int(time_list[0]) * 60 + int(time_list[1])

def int_conv(num):           #converts number to int
    if num == '':
        return None
    
    return int(num)

def float_conv(num):       #converts number to float to nearest 1000th place, rounding down
    if num == '':
        return None
    
    return float(Decimal(str(num)).quantize(Decimal('.001'), rounding=ROUND_DOWN))

def plus_minus_conv(val):
    if val == '':
        return None
    if val == '0':
        return 0
    
    value_symbol = val[0]
    num = int(val[1:])
    if value_symbol == '-':
        return -num
    else:
        return num


converters = [is_home_team, get_term, str, min_to_sec, int_conv, int_conv, float_conv, int_conv, int_conv, float_conv,
              int_conv, int_conv, float_conv, int_conv, int_conv, int_conv, int_conv, int_conv,
              int_conv, int_conv, int_conv, int_conv, plus_minus_conv]


directory = 'Collecting_Data/game_html'
terms = ['Q1', 'Q2', 'H1', 'Q3', 'Q4', 'H2']
headers = ['Home Team', 'Term', 'Name', 'Time Played (sec)', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
           'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-']

Writes stats for every player in every game, separated by quarter, to a .csv format with correct data types

In [9]:
for filename in tqdm(os.listdir(directory), desc='Games Cleaned'):
    print(filename)
    '''
    if f'cleaned_data_{filename}' in os.listdir(f'{curr_dir}/Cleaned_data'):
        continue
    '''
    
    new_file = f'cleaned_data_{filename[:-5]}.csv'
    
    if os.path.isfile(f'{curr_dir}/Cleaned_data/{new_file}'):
        continue
    
    soup = BeautifulSoup(open(f'{curr_dir}/{directory}/{filename}'), 'html.parser')
    #creates a list of stats per quarter for each team
    tables = soup.find_all('table', class_='sortable stats_table')

    found_regex = [] # contains the id of q1 tables to get the amount of players
    regex = r'box-[A-Z]{3}-q1-basic' #expression used to search for specific tables

    for table in tables: #grabs q1 table for each team to identify number of players
        if re.search(regex, table['id']) and table['id'] not in found_regex:
            found_regex.append(table['id'])



    #will contain all data for every player
    player_data = []
    temp = soup.find_all('table', id= regex)

    #teamo_num == 0 is away team, team_num == 1 is home team
    for team_num in range(2):
        #grab number of players currently on the team
        #subtract 4 because there are two rows of headers for each table, 2 rows * 2 teams = 4 extra rows
        amount_of_players = len(soup.find('table', attrs={'id': found_regex[team_num]}).find_all('tr')) - 4
        
        for player_number in range(amount_of_players): #loops through every player on the team
            for index in range(6): #gets all stats for every term for that player
                quarter_stats = tables[index + (team_num*(len(tables)//2))].find('tbody').find_all('tr', attrs={'class': None})[player_number]
                #print(quarter_stats)
                data_row = f'{is_home_team(team_num)}, {index},' + ','.join(e.text for e in quarter_stats.children)
                player_data.append(data_row)

    
    with open(f'{curr_dir}/Cleaned_data/{new_file}', 'w') as f_object: #Write headers to every file needed
        writer_object = writer(f_object)
        writer_object.writerow(headers)
        f_object.close()

    
    #print(player_data)
    quarter_index = 0

    for i, n in enumerate(player_data):  #separates stats for each player from every term
        '''
        if (quarter_index) % ((len(tables) // 2)) == 0:
            print('end of player stats')
        quarter_index += 1
        '''
        results = []
        row = n.split(',')
        
        for i, func in enumerate(converters): #converts stats to correct type of data
            if len(row) > 4:
                converted_val = func(row[i])
                results.append(converted_val)
            else:
                results.append(is_home_team(row[0]))
                results.append(get_term(row[1]))
                results.append(row[2])
                results.append(None)
                break

        #print(results)
        with open(f'{curr_dir}/Cleaned_data/{new_file}', 'a') as f_object: #writes player data to new file
            writer_object = writer(f_object)
            writer_object.writerow(results)
            f_object.close()

Games Cleaned:  20%|██        | 6595/32881 [00:00<00:00, 34297.92it/s]

200611280SAC.html
200303300SEA.html
200104070WAS.html
202211070MIA.html
201011240TOR.html
200804090TOR.html
200201050CHH.html
202212140DAL.html
201011100SAS.html
201512090DAL.html
202212070TOR.html
200301280DET.html
201201200NYK.html
200901080SAS.html
200404120DEN.html
199612280NYK.html
199904290WAS.html
202104300MEM.html
200211090DAL.html
200203300POR.html
200203060MIL.html
199703160SAC.html
201201210NYK.html
200901160LAL.html
200203290NYK.html
200802130CHA.html
201703080MIN.html
201903260DAL.html
201304140NYK.html
200804230LAL.html
199905130MIN.html
201503070CLE.html
200912180MIN.html
199705210UTA.html
201101170DET.html
201011030NJN.html
202212090UTA.html
199712210BOS.html
201511270CHO.html
200803110PHO.html
201112290SAC.html
202201230MIN.html
199801290POR.html
200012150PHI.html
200203050WAS.html
199701040HOU.html
202001160NOP.html
202203180PHO.html
199611050PHO.html
201501010CHI.html
200004120PHI.html
202102190ORL.html
201504130UTA.html
201502250SAC.html
201611030MIL.html
201202190O

Games Cleaned:  30%|███       | 10025/32881 [00:00<00:00, 31578.27it/s]

201403240OKC.html
200803220NOH.html
202002250IND.html
201003300IND.html
201501130UTA.html
200604170TOR.html
200701220TOR.html
200511260NYK.html
200102060PHO.html
199903130ATL.html
200802080DET.html
200604040DAL.html
200303020HOU.html
201104160MIA.html
201002080LAL.html
200701050NJN.html
201001230MIA.html
200903270CLE.html
201205080CHI.html
200512210MEM.html
201303190SAC.html
201611110BOS.html
201201070IND.html
200605040IND.html
200604190UTA.html
200512090CLE.html
200701300DAL.html
200701170SAS.html
201306200MIA.html
200604220SAS.html
201601130OKC.html
201902010NYK.html
200004010HOU.html
201502210CHO.html
201305140SAS.html
200812130CHI.html
200103270SAC.html
202003020SAS.html
201802090BOS.html
199712020HOU.html
201512070CHI.html
202101310DEN.html
200103060CHH.html
199804280POR.html
202102180MIL.html
200112210PHI.html
200405200DET.html
200305020NOH.html
199612050SEA.html
199701170PHI.html
200903250PHO.html
200603240DEN.html
201103230NYK.html
201001230DET.html
200811070LAC.html
201304280B

Games Cleaned:  49%|████▊     | 15954/32881 [00:00<00:00, 26092.07it/s]

202204240CHI.html
201811150HOU.html
201702090ORL.html
200304110MIA.html
201103050HOU.html
201112280DET.html
200101140MIA.html
201403090LAL.html
200604050PHO.html
200304080HOU.html
202104140CHI.html
200804290NOH.html
201101070ORL.html
201005020LAL.html
200904040DEN.html
201412170DEN.html
201403160SAS.html
200702030HOU.html
200204050IND.html
200801300MEM.html
201604090CHI.html
202111070LAC.html
202201070OKC.html
200212180BOS.html
201302250UTA.html
201412160MEM.html
199711090LAL.html
200912280PHO.html
201704150SAS.html
200012260DET.html
201411100NYK.html
201811300CHO.html
200805030DET.html
200802220GSW.html
202102090MIA.html
201101240PHI.html
199903210ORL.html
201903110BRK.html
201511180BOS.html
201003100SAC.html
201805030BOS.html
200001180SAC.html
202302250ORL.html
201403230TOR.html
201405010ATL.html
200102150TOR.html
199703090POR.html
200602250PHI.html
201303130PHI.html
201612300MIN.html
199802130CHI.html
201411090POR.html
199903290IND.html
201011050NOH.html
199903030PHI.html
200102210S

Games Cleaned:  59%|█████▉    | 19509/32881 [00:00<00:00, 28503.21it/s]

199803100MIL.html
201511060IND.html
199803260POR.html
201101100BOS.html
201012120PHI.html
201412120TOR.html
199701090ORL.html
200401060CLE.html
200402040WAS.html
201812080MEM.html
200711200SAC.html
201201200BOS.html
201411120MIA.html
202111010PHI.html
200801270DAL.html
199801300BOS.html
201301040WAS.html
201302060CLE.html
201011040CHI.html
201601100MEM.html
200212050ATL.html
202102250MIL.html
201912210BRK.html
199912110ORL.html
200004230NYK.html
200903180PHO.html
201101220LAC.html
200711020CHA.html
200312120MIA.html
199902160CLE.html
202101250CLE.html
201502200DET.html
200511050HOU.html
200311010DAL.html
202102270WAS.html
200311260SAS.html
199904040BOS.html
202104100BRK.html
199904140SEA.html
200304140DET.html
202211190ATL.html
201012030CHA.html
200402070MIL.html
199804180GSW.html
200201210DET.html
201212040HOU.html
200303210NYK.html
201803160LAL.html
201712090PHO.html
200904130DAL.html
201811210MIL.html
201511040ATL.html
200203290BOS.html
200612060LAL.html
201204040POR.html
202203130B

Games Cleaned:  76%|███████▌  | 24912/32881 [00:00<00:00, 22945.71it/s]

201302080UTA.html
200203030SAC.html
201303150IND.html
200312290GSW.html
201812210TOR.html
199912080CHH.html
200603170NJN.html
201905080MIL.html
201602050CHO.html
200504060SAS.html
200402070MIA.html
201311220MIN.html
201812150SAS.html
199903050IND.html
200904290DEN.html
199902180UTA.html
201703150PHO.html
200401140DAL.html
200804040NOH.html
200411080LAC.html
199711010UTA.html
201201280CHA.html
200312120MIL.html
201301060MIA.html
200603230DAL.html
200603040SAS.html
200312100WAS.html
200104010SAC.html
201302060TOR.html
201403250LAL.html
201803150DEN.html
200901300NOH.html
201404030OKC.html
201011280NOH.html
201811200MIA.html
200204030PHI.html
200611160GSW.html
200711100IND.html
200911040WAS.html
200602280SEA.html
200301060BOS.html
200212040PHI.html
199906230NYK.html
199802010MIN.html
199912280SAC.html
200503010DEN.html
201511270ORL.html
201301060DET.html
200312230HOU.html
201502200MIL.html
201202110MIN.html
200705050DET.html
200703270MIN.html
200003160MIL.html
202105080POR.html
200103300B

Games Cleaned: 100%|██████████| 32881/32881 [00:01<00:00, 27065.83it/s]

201703010LAC.html
201911220DEN.html
201503130POR.html
200304290MIN.html
201612100HOU.html
201903170MIA.html
201503270WAS.html
201911230MEM.html
201211230IND.html
199711130SEA.html
201002090LAC.html
201001220MIN.html
201701130MIL.html
200612290CHA.html
201701250POR.html
201201040DEN.html
200405210MIN.html
200303210NOH.html
199904080CLE.html
202203160NYK.html
200903270PHI.html
200711070TOR.html
202202010MIN.html
201012080UTA.html
200512090PHI.html
200501170GSW.html
200801090ATL.html
202103210DET.html
202211110SAS.html
200202270CHH.html
201912080WAS.html
201701280GSW.html
201503240MIL.html
200704010DET.html
200702230MIN.html
201011060MIA.html
200502050DEN.html
201202020NYK.html
200002240POR.html
201012150DAL.html
200801120SAC.html
199701140PHO.html
201005090BOS.html
201411100CHI.html
200902110UTA.html
201701100WAS.html
199702270HOU.html
200002280GSW.html
201312080OKC.html
200212130LAL.html
199904180VAN.html
199711020BOS.html
201304300DEN.html
201712150ORL.html
200112220PHO.html
200711150D


