# Daily Webscraping
This notebook scrapes NBA game data (box scores) from Basketball Reference for a single, user-specified date and saves the result to a CSV file.

In [1]:
import selenium.common.exceptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import date
from io import StringIO
from datetime import datetime
import time
import pandas as pd

In [2]:
# Ask for the day to scrape (mm-dd-yyyy) and keep its components for the URL query.
date_to_scrape = datetime.strptime(
    input('Input mm-dd-yyyy to scrape: '),
    '%m-%d-%Y',
).date()
day, month, year = date_to_scrape.day, date_to_scrape.month, date_to_scrape.year

Input mm-dd-yyyy to scrape:  5-19-2024


In [3]:
# Box-score index page for the requested date (one query parameter per date part).
basketball_reference = (
    'https://www.basketball-reference.com/boxscores/index.fcgi'
    f'?month={month}&day={day}&year={year}'
)

In [4]:
# Configure and launch the Chrome session shared by every scraping cell below.
chrome_options = webdriver.ChromeOptions()
# NOTE(review): 'detach' is a ChromeDriver experimental option; presumably it keeps
# the browser window open independently of the driver process — confirm in the docs.
chrome_options.add_experimental_option('detach', True)

# Module-level driver: get_boxscores() and get_stats() read it as a global.
driver = webdriver.Chrome(options=chrome_options)

In [5]:
def get_boxscores():
    """Return the box-score URLs found on the daily index page.

    Loads ``basketball_reference`` in the shared ``driver`` and reads the
    ``href`` of the anchors matched by the ``.links > a`` selector.

    Returns:
        list: one ``href`` value per selected anchor, in page order.
    """
    driver.get(basketball_reference)
    anchors = driver.find_elements(By.CSS_SELECTOR, '.links > a')

    # Keep every third anchor, starting with the first.
    # NOTE(review): presumably each game lists three links with the box score
    # first — confirm against the page markup.
    return [anchor.get_attribute('href') for anchor in anchors[::3]]
box_scores = get_boxscores()

In [6]:
def _read_team_table(element_id):
    """Return the table inside element ``element_id`` as a numeric DataFrame.

    The element's inner HTML is parsed with ``pd.read_html`` using the first
    column as the index. Every column is passed through ``pd.to_numeric`` with
    ``errors='coerce'`` (unparseable cells become NaN), and the first level of
    the column header is dropped.
    """
    table_html = driver.find_element(By.ID, element_id).get_attribute('innerHTML')
    table_df = pd.read_html(StringIO(table_html), index_col=0)[0]
    table_df = table_df.apply(pd.to_numeric, errors='coerce')
    table_df.columns = table_df.columns.droplevel()
    return table_df


def get_stats(season=2024):
    """Scrape team totals and per-player maxima for every URL in ``box_scores``.

    Reads the module-level ``box_scores``, ``driver``, ``day``, ``month`` and
    ``year``. For each box-score page it builds a two-row DataFrame (one row
    per team in the line score) holding:

    * the team's totals row and the per-column maximum over the remaining
      rows (suffix ``_max``) from the basic and advanced tables;
    * ``team``, ``total`` and ``home`` (0 for the first line-score row, 1 for
      the second);
    * the same columns for the other row, suffixed ``_opp``;
    * ``season``, ``date`` and ``won`` (``total > total_opp``).

    Args:
        season: value written to the ``season`` column. Defaults to 2024,
            the value that used to be hard-coded.

    Returns:
        list: one DataFrame per box-score URL, in the order of ``box_scores``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if an expected
            element is still missing after the single interactive retry.
    """
    games = []
    base_cols = None
    for box_score in box_scores:
        # Pause between page loads; the original author notes a limit of
        # 20 requests per minute.
        time.sleep(5)

        # Go to game box score link
        driver.get(box_score)

        # Probe for the page heading to detect a page that failed to load.
        # find_element signals a missing element with NoSuchElementException;
        # the previous `except IndexError` could never trigger.
        try:
            driver.find_element(By.CSS_SELECTOR, 'H1')
        except selenium.common.exceptions.NoSuchElementException:
            # Let the user decide when to retry (e.g. after host downtime),
            # then reload once. A second failure propagates to the caller.
            input('The box score page did not load, press enter to retry!')

            driver.get(box_score)

            time.sleep(3)
            driver.find_element(By.CSS_SELECTOR, 'H1')

        # ===== GET LINE SCORES =====
        line_score_table = driver.find_element(By.ID, 'div_line_score').get_attribute('innerHTML')
        line_score_df = pd.read_html(StringIO(line_score_table))[0]

        # Keep only the team identifier and the final score.
        line_score_df.columns = line_score_df.columns.droplevel()
        line_score_df = line_score_df.rename(columns={'Unnamed: 0_level_1': 'team', 'T': 'total'})
        line_score_df = line_score_df[['team', 'total']]

        # ===== GET BASIC & ADVANCED STATS =====
        # One Series of stats per team, in line-score order.
        stats = []
        for team in list(line_score_df['team']):
            advanced_stats_df = _read_team_table(f'div_box-{team}-game-advanced')
            basic_stats_df = _read_team_table(f'div_box-{team}-game-basic')

            # The last row of each table holds the team totals.
            totals_df = pd.concat([basic_stats_df.iloc[-1, :], advanced_stats_df.iloc[-1, :]])
            totals_df.index = totals_df.index.str.lower()

            # Column-wise maximum over every row except the totals row.
            maxes_df = pd.concat([basic_stats_df.iloc[:-1, :].max(), advanced_stats_df.iloc[:-1, :].max()])
            maxes_df.index = maxes_df.index.str.lower() + '_max'

            stat = pd.concat([totals_df, maxes_df])

            # The first team scraped fixes the column set for every later
            # team and game; labels containing "bpm" are excluded.
            if base_cols is None:
                base_cols = list(stat.index.drop_duplicates(keep='first'))
                base_cols = [b for b in base_cols if "bpm" not in b]

            stat = stat[base_cols]
            stats.append(stat)

        # One row per team.
        stat_df = pd.concat(stats, axis=1).T

        # Create game df
        game = pd.concat([stat_df, line_score_df], axis=1)
        game['home'] = [0, 1]

        # Opponent view: same rows in reverse order, every column suffixed.
        # reset_index() keeps the old index as a column (it becomes
        # 'index_opp'); left as-is so the output schema does not change.
        game_opp = game.iloc[::-1].reset_index()
        game_opp.columns += '_opp'

        # Merge own + opponent columns
        full_game = pd.concat([game, game_opp], axis=1)

        full_game['season'] = season

        full_game['date'] = f'{year}-{month}-{day}'
        full_game['date'] = pd.to_datetime(full_game['date'])

        full_game['won'] = full_game['total'] > full_game['total_opp']

        # Each game contributes two rows, one from each team's perspective.
        games.append(full_game)

    return games

In [7]:
# Scrape every game of the day, stack the per-game frames and save them as CSV.
# The browser is closed in `finally` so a failed scrape does not leave it running.
try:
    games = get_stats()
    if games:
        # ignore_index=True gives the same fresh 0..n-1 index as the former
        # reset_index() + drop('index') pair.
        full_df = pd.concat(games, axis=0, ignore_index=True)
        full_df.to_csv(f'Daily Game Data/{date_to_scrape}_nba_games.csv', index = False)
    else:
        # pd.concat raises ValueError on an empty list (a day without games).
        print(f'No games found for {date_to_scrape}; nothing was saved.')
finally:
    driver.quit()