In [1]:
import json
import os
from urllib.parse import parse_qs, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Endpoint queried once per play to resolve the video clip of that event.
_VIDEO_ASSET_URL = "https://stats.nba.com/stats/videoeventsasset"

# Seconds to wait for stats.nba.com before giving up on a request; without a
# timeout a stalled connection blocks the whole scrape indefinitely.
_REQUEST_TIMEOUT = 30

# Browser-like headers sent with every request. Built once at import time
# instead of once per play-by-play row.
# NOTE(review): 'br' is advertised in Accept-Encoding; decoding a brotli
# response presumably requires the optional brotli package -- confirm.
_NBA_STATS_HEADERS = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
    'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Linux",
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}

# Column order of the CSV written by fetch_game_data.
_OUTPUT_COLUMNS = [
    'id',
    'data_text',
    'shot_type',
    'shot_result',
    'home_team',
    'visiting_team',
    'game_clock',
    'quarter',
    'video_url',
]

# Ordered (required substrings, label) rules; the first rule whose substrings
# all occur in the lower-cased description wins.
_SHOT_TYPE_RULES = (
    (('3pt', 'jump shot'), "Three Point Jumper"),
    (('free throw',), "Free Throw"),
    (('tip', 'shot'), "Two Point Tip Shot"),
    (('layup',), "Layup"),
    (('hook shot',), "Hook"),
    (('dunk',), "Dunk"),
)


def _parse_game_clock(clock_text):
    """Convert a clock string ('MM:SS' or plain seconds such as '45.3') to whole seconds."""
    if ':' in clock_text:
        minutes, seconds = clock_text.split(':')
        return 60 * int(minutes) + int(float(seconds))
    return int(float(clock_text))


def _classify_shot(data_text):
    """Map a play description to a shot-type label, or '' when no rule matches."""
    text = data_text.lower()
    for needles, label in _SHOT_TYPE_RULES:
        if all(needle in text for needle in needles):
            return label
    if 'jump shot' in text or 'jumper' in text:
        return "Jumper"
    return ""


def _extract_event_ids(href):
    """Return (GameEventID, GameID) from a link's query string; None for each absent/blank one."""
    query = parse_qs(urlsplit(href).query)
    # parse_qs drops blank values, so 'GameID=' is reported as missing.
    game_event_id = query.get('GameEventID', [None])[-1]
    game_id = query.get('GameID', [None])[-1]
    return game_event_id, game_id


def _fetch_video_urls(game_event_id, game_id):
    """Query the video-asset endpoint and return the 'videoUrls' list of the response."""
    response = requests.get(
        _VIDEO_ASSET_URL,
        params={'GameEventID': game_event_id, 'GameID': game_id},
        headers=_NBA_STATS_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    )
    # Fail with a clear HTTP error rather than an obscure JSON/KeyError later.
    response.raise_for_status()
    return json.loads(response.content)['resultSets']['Meta']['videoUrls']


def fetch_game_data(html_path, output_path, beginAt=1):
    """Scrape a saved play-by-play page and append one CSV row per play that has a video.

    Every ``<article>`` row of the page is scanned in order. The game clock of
    each row is tracked (even for rows that are skipped) so the quarter counter
    stays correct; the quarter is incremented whenever the clock value goes up
    from one row to the next. For each processed row that links to a video
    event, the video URL is fetched from stats.nba.com and the row is appended
    to ``output_path`` immediately, so an interrupted run keeps its progress.

    Parameters
    ----------
    html_path : str
        Path of the saved play-by-play HTML page.
    output_path : str
        CSV file to append to. The header is written only when the file does
        not exist yet; existing rows are never rewritten.
    beginAt : int, default 1
        1-based position of the first article to process. Use the ``id`` after
        the last row already saved to resume an interrupted run.

    Returns
    -------
    pandas.DataFrame
        The full contents of ``output_path`` after this run (rows saved
        earlier plus rows appended by this call), or an empty frame with the
        expected columns if the file was never created.

    Raises
    ------
    requests.RequestException
        If a video-asset request fails, times out or returns an HTTP error.

    Notes
    -----
    ``shot_result`` is 1 when the row carries a score element and 0 otherwise.
    The score text is split on '-'; the first part is stored as
    ``visiting_team`` and the second as ``home_team``.
    Rows are skipped (nothing written) when they lack a clock, a description
    block, a link, either of the two event IDs, or any video URL.
    """
    # Binary mode lets BeautifulSoup detect the document encoding itself
    # instead of relying on the platform's default text encoding.
    with open(html_path, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')

    articles = soup.find_all('article', attrs={'class': 'GamePlayByPlayRow_article__asoO2'})
    quarter = 1
    prev_game_time = None

    for idx, article in enumerate(tqdm(articles, desc="Progress")):
        clock_span = article.find('span', attrs={'class': 'GamePlayByPlayRow_clockElement__LfzHV'})
        if clock_span is None:
            continue
        game_time = _parse_game_clock(clock_span.text)
        # The clock counts down within a quarter, so a jump upwards marks a new one.
        if prev_game_time is not None and game_time > prev_game_time:
            quarter += 1
        prev_game_time = game_time

        # Rows before the resume point only contribute to quarter tracking.
        if idx < beginAt - 1:
            continue

        score_span = article.find('span', attrs={'class': 'GamePlayByPlayRow_scoring__Ax2hd'})
        shot_result, v_team, h_team = 0, '', ''
        if score_span is not None:
            shot_result = 1
            first, _, second = score_span.text.partition('-')
            v_team, h_team = first.strip(), second.strip()

        desc_block = article.find('div', attrs={'class': 'GamePlayByPlayRow_descBlock__By8pv'})
        if desc_block is None:
            continue
        # Default to '' so a row without the attribute never reuses the
        # previous row's description.
        data_text = desc_block.attrs.get('data-text', '')
        shot_type = _classify_shot(data_text)

        a_tag = desc_block.find('a')
        href = a_tag.get('href') if a_tag is not None else None
        if not href:
            continue

        game_event_id, game_id = _extract_event_ids(href)
        # Both identifiers are required; querying with only one would send
        # the literal string 'None' to the API.
        if not (game_event_id and game_id):
            continue

        video_urls = _fetch_video_urls(game_event_id, game_id)
        if not video_urls:
            continue

        row = {
            'id': idx + 1,
            'data_text': data_text,
            'shot_type': shot_type,
            'shot_result': shot_result,
            'home_team': h_team,
            'visiting_team': v_team,
            'game_clock': game_time,
            'quarter': quarter,
            'video_url': video_urls[0]['lurl'],
        }
        # Append right away so progress survives an interruption; the header
        # is emitted only by the write that creates the file.
        pd.DataFrame([row], columns=_OUTPUT_COLUMNS).to_csv(
            output_path,
            mode='a',
            header=not os.path.exists(output_path),
            index=False,
        )

    if os.path.exists(output_path):
        return pd.read_csv(output_path)
    return pd.DataFrame(columns=_OUTPUT_COLUMNS)

In [3]:
# Saved play-by-play page to parse, and the CSV file that collects the rows.
path = 'game_page.html'
output_path = '15nov_1st.csv'
# Last expression of the cell, so the returned DataFrame is displayed.
fetch_game_data(html_path=path, output_path=output_path)

Progress:  43%|█████████████▎                 | 203/471 [03:14<04:16,  1.04it/s]


KeyboardInterrupt: 