In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException



In [87]:
def GetMetaGameInfo(html):
    """Parse a box-score page's scorebox into team tags and game metadata.

    Parameters
    ----------
    html : str
        Page source containing a ``<div class="scorebox">`` element.

    Returns
    -------
    tuple
        ``(t1, t2, gameInfo)``.

        ``t1`` and ``t2`` are the BeautifulSoup tags found at child positions
        1 and 3 of the scorebox, with extra attributes attached to them:
        ``name`` (text of the tag's first ``<strong>`` with spaces removed),
        ``score`` (int) and ``pre_win`` / ``pre_loss`` (the displayed
        win-loss record with this game's result subtracted).

        ``gameInfo`` is a dict with the keys ``start_time`` (datetime: game
        date plus start time of day), ``duration`` (timedelta), ``venue``
        (str), ``at_night`` (bool), ``on_grass`` (bool) and ``att`` (int).

    Raises
    ------
    AttributeError
        If the page has no scorebox div.
    ValueError, IndexError
        If a scorebox field does not have the expected layout or text format.
    """
    soup = BeautifulSoup(html, "html.parser")
    scorebox = soup.find("div", {"class": "scorebox"})

    # Only the odd child positions are used; presumably the even ones are
    # whitespace text nodes between the divs -- TODO confirm against the page.
    scorebox_children = list(scorebox.children)
    t1, t2, score_box_meta = (scorebox_children[i] for i in [1, 3, 5])

    for t in [t1, t2]:
        # NOTE: `name` is also the attribute BeautifulSoup uses for a Tag's
        # element name, so this assignment replaces it. It is kept because
        # callers read the team name from `.name`.
        t.name = t.find("strong").text.strip(" ").strip("\n").replace(" ", "")
        t.score = int(t.find("div", {"class": "score"}).text)
        t.pre_win, t.pre_loss = (int(i) for i in list(t.children)[4].text.split('-'))

    # The displayed record already includes this game, so take its result back
    # out. Equal scores fall into the `else` branch, as in the original logic.
    if t1.score > t2.score:
        t1.pre_win -= 1
        t2.pre_loss -= 1
    else:
        t2.pre_win -= 1
        t1.pre_loss -= 1

    meta_children = list(score_box_meta.children)
    date, start_time, att, venue, duration, at_night_on_grass = (
        meta_children[i].text for i in [1, 2, 3, 4, 5, 6]
    )

    date = datetime.strptime(date, "%A, %B %d, %Y")

    # Take the clock and meridiem tokens that follow the label instead of a
    # fixed-width slice: the slice cut "10:10 p.m." down to "10:10 p", which
    # cannot be parsed with %p.
    clock_tokens = start_time.partition(": ")[2].replace(".", "").split()[:2]
    start_time = datetime.strptime(" ".join(clock_tokens), "%I:%M %p")
    start_time = date + timedelta(hours=start_time.hour, minutes=start_time.minute)

    duration = datetime.strptime(duration.partition(": ")[2].strip(), "%H:%M")
    duration = timedelta(hours=duration.hour, minutes=duration.minute)

    at_night, on_grass = at_night_on_grass.split(", ")
    at_night = 'Night Game' == at_night
    on_grass = 'on grass' == on_grass

    att = int(att.split(": ")[1].replace(",", ""))
    # Split on the first separator only so a venue name containing ": " is
    # not truncated.
    venue = venue.split(": ", 1)[1]

    gameInfo = {
        "start_time": start_time, "duration": duration,
        "venue": venue, "at_night": at_night, "on_grass": on_grass,
        "att": att
    }
    return t1, t2, gameInfo

# soup = BeautifulSoup(, "html.parser")
def GetTable(teamBatting):
    """Build a batting DataFrame from a team's batting-table element.

    The text of the element's ``<tbody>`` is split into lines, and each line
    into space-separated tokens. The tokens in front of the first numeric
    token are joined back into a single ``Batting`` value (in the sample
    output this is the player's name and position); the remaining tokens
    fill the stat columns in order. Lines without any numeric token are
    skipped.

    Parameters
    ----------
    teamBatting : selenium WebElement
        Element that contains the batting table's ``<tbody>``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, all values as strings. Rows with fewer
        tokens than columns are right-padded with ``None``; surplus tokens
        are folded into the last column (``Details``).
    """
    columns = [
        'Batting',
        'AB',
        'R',
        'H',
        'RBI',
        'BB',
        'SO',
        'PA',
        'BA',
        'OBP',
        'SLG',
        'OPS',
        'Pit',
        'Str',
        'WPA',
        'aLI',
        'WPA+',
        'WPA-',
        'cWPA',
        'acLI',
        'RE24',
        'PO',
        'A',
        'Details'
    ]
    width = len(columns)

    tbody = teamBatting.find_element(By.TAG_NAME, "tbody")
    table = []
    for line in tbody.text.split("\n"):
        tokens = line.split(" ")
        # Scan the whole row for the first numeric token. Looking only at the
        # first four tokens silently dropped any player whose name plus
        # position is four or more tokens long (e.g. a "Jr." suffix).
        first_stat = next(
            (i for i, ele in enumerate(tokens) if ele.isnumeric()), None
        )
        if first_stat is None:
            continue
        row = [" ".join(tokens[:first_stat])] + tokens[first_stat:]
        # Normalise the row length so the DataFrame constructor never raises
        # on ragged input.
        if len(row) > width:
            row = row[:width - 1] + [" ".join(row[width - 1:])]
        else:
            row = row + [None] * (width - len(row))
        table.append(row)
    return pd.DataFrame(table, columns=columns)

In [94]:

def scratch(url, max_attempts=None):
    """Scrape one box-score page into team records and batting tables.

    Opens Chrome with image loading disabled, loads ``url`` and retries until
    the scorebox and both teams' batting tables are present, then closes the
    browser.

    Parameters
    ----------
    url : str
        Address of the box-score page.
    max_attempts : int or None, optional
        Maximum number of load-and-parse attempts. ``None`` (the default)
        retries indefinitely.

    Returns
    -------
    dict
        ``{"meta_game_info": ..., "team1": ..., "team2": ...}``.
        ``meta_game_info`` is the dict returned by ``GetMetaGameInfo``; each
        team entry is a dict with the keys ``name``, ``pre_win``,
        ``pre_loss`` and ``player_df`` (the DataFrame from ``GetTable``).

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If ``max_attempts`` is set and the required elements were still
        missing on the last attempt.
    """
    # A single Options object carries the image-blocking preference. The
    # deprecated `chrome_options=` keyword is no longer passed.
    options = Options()
    options.add_experimental_option(
        "prefs",
        {
            "profile.default_content_setting_values": {
                "images": 2,
                # "javascript": 2,
            }
        },
    )

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    try:
        driver.set_page_load_timeout(5)

        attempt = 0
        while True:
            attempt += 1
            # Give the page up to 5 seconds to load; on timeout, carry on and
            # check whether the parts we need are already in the DOM.
            try:
                driver.get(url)
            except TimeoutException:
                print("load page timeout")

            # Check that the scorebox and both batting tables are loaded.
            try:
                WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                    (By.CLASS_NAME, "scorebox")
                ))
                team1, team2, metaGameInfo = GetMetaGameInfo(driver.page_source)
                t1batting = WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                    (By.ID, f"all_{team1.name}batting")
                ))
                df1 = GetTable(t1batting)
                print("got team1")
                t2batting = WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                    (By.ID, f"all_{team2.name}batting")
                ))
                # Build team 2's frame from team 2's table; the earlier code
                # passed team 1's element here.
                df2 = GetTable(t2batting)
                print("got team2")
                break
            except TimeoutException:
                if max_attempts is not None and attempt >= max_attempts:
                    raise
    finally:
        # Always close the browser so a call does not leave Chrome running.
        driver.quit()

    team1_info = {
        "name": team1.name,
        "pre_win": team1.pre_win,
        "pre_loss": team1.pre_loss,
        "player_df": df1
    }
    team2_info = {
        "name": team2.name,
        "pre_win": team2.pre_win,
        "pre_loss": team2.pre_loss,
        "player_df": df2
    }

    gameInfo = {
        "meta_game_info": metaGameInfo,
        "team1": team1_info,
        "team2": team2_info
    }

    return gameInfo

In [104]:
# Box score to scrape. Another sample game, kept here for reference:
#   https://www.baseball-reference.com/boxes/ANA/ANA202104010.shtml
url = "https://www.baseball-reference.com/boxes/BOS/BOS202106270.shtml"
gameInfo = scratch(url)

  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options, chrome_options=chrome_options)


load page timeout
got team1
got team2


{'name': 'NewYorkYankees',
 'pre_win': 40,
 'pre_loss': 36,
 'player_df':                 Batting AB  R     H   RBI    BB    SO    PA    BA   OBP  ...  \
 0        DJ LeMahieu 2B  4  1     2     0     0     1     4  .274  .346  ...   
 1        Aaron Judge CF  4  1     1     2     0     1     4  .285  .381  ...   
 2          Luke Voit 1B  4  0     1     0     0     2     4  .217  .289  ...   
 3  Giancarlo Stanton DH  4  0     0     0     0     3     4  .268  .358  ...   
 4        Gio Urshela 3B  4  0     0     0     0     1     4  .268  .310  ...   
 5     Gleyber Torres SS  4  0     1     0     0     2     4  .242  .324  ...   
 6     Miguel Andujar LF  3  0     0     0     1     0     4  .248  .279  ...   
 7      Clint Frazier RF  3  0     1     0     1     1     4  .187  .315  ...   
 8     Kyle Higashioka C  2  0     1     0     0     1     2  .198  .277  ...   
 9         Gerrit Cole P  1  0  None  None  None  None  None  None  None  ...   
 
       WPA   aLI   WPA+     WPA-  

In [44]:
# Scratch cell: split a tab-separated header line into a list of column names.
# The displayed list matches the `columns=` list passed to pd.DataFrame in GetTable.
s = "Batting	AB	R	H	RBI	BB	SO	PA	BA	OBP	SLG	OPS	Pit	Str	WPA	aLI	WPA+	WPA-	cWPA	acLI	RE24	PO	A	Details"
s.split('\t')

['Batting',
 'AB',
 'R',
 'H',
 'RBI',
 'BB',
 'SO',
 'PA',
 'BA',
 'OBP',
 'SLG',
 'OPS',
 'Pit',
 'Str',
 'WPA',
 'aLI',
 'WPA+',
 'WPA-',
 'cWPA',
 'acLI',
 'RE24',
 'PO',
 'A',
 'Details']

: 