In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import _pickle as cPickle
from os.path import exists
import random


In [2]:

def GetMetaGameInfo(html):
    """Extract team and game metadata from a box-score page.

    Parameters
    ----------
    html : str
        Page source of a single box-score page.

    Returns
    -------
    tuple
        ``(t1, t2, gameInfo)``.

        ``t1`` / ``t2`` are the BeautifulSoup tags at positions 1 and 3 of the
        ``div.scorebox`` children, annotated in place with ``name``,
        ``score``, ``pre_win`` and ``pre_loss`` attributes.

        ``gameInfo`` is a dict with the keys ``have_att``, ``start_time``,
        ``duration``, ``venue``, ``at_night``, ``on_grass``,
        ``Start Time Weather``, ``Umpires`` and ``att``.

    Raises
    ------
    AttributeError
        If an expected element (scorebox, team name, score, "Other Info"
        header) is not found in ``html``.
    IndexError, ValueError
        If an expected scorebox row is missing or not in the expected format.

    Notes
    -----
    NOTE(review): when the page has no "Attendance:" row, ``att`` is ``-1``,
    ``at_night`` / ``on_grass`` are ``None`` and ``start_time``, ``duration``
    and ``venue`` are returned as the raw, unparsed row text. This is kept
    as-is so newly scraped records stay consistent with already cached ones;
    consumers should check ``have_att`` before using those fields.
    """
    soup = BeautifulSoup(html, "html.parser")
    scorebox = soup.find("div", {"class": "scorebox"})
    # Only children 1, 3 and 5 are used: the two team boxes and the meta box.
    box_children = list(scorebox.children)
    t1, t2, score_box_meta = (box_children[i] for i in [1, 3, 5])

    for t in [t1, t2]:
        # NOTE: this overwrites Tag.name (normally the HTML tag name) with the
        # team name stripped of spaces and dots; callers read it via `.name`.
        t.name = t.find("strong").text.strip(" ").strip("\n").replace(" ", "").replace(".", "")
        t.score = int(t.find("div", {"class": "score"}).text)
        # Fifth child holds a "W-L" text — presumably the record including
        # this game; TODO confirm against the page layout.
        t.pre_win, t.pre_loss = (int(i) for i in list(t.children)[4].text.split('-'))

    # Take this game's result back out of the parsed record. Any non-win for
    # t1 (including equal scores) is counted as a t2 win.
    if t1.score > t2.score:
        t1.pre_win -= 1
        t2.pre_loss -= 1
    else:
        t2.pre_win -= 1
        t1.pre_loss -= 1

    rows = [i.text for i in score_box_meta.children if i != '\n']
    date = rows[0]
    start_time = [row for row in rows if "Start Time:" in row][0]
    attendance_rows = [row for row in rows if "Attendance:" in row]
    have_att = len(attendance_rows) != 0
    att = attendance_rows[0] if have_att else -1
    venue = [row for row in rows if "Venue:" in row][0]
    duration = [row for row in rows if "Game Duration:" in row][0]

    at_night = None
    on_grass = None
    info_dict = dict()
    if have_att:
        date = datetime.strptime(date, "%A, %B %d, %Y")
        # Tokens 2 and 3 of the row are the clock time and the a.m./p.m. marker.
        start_time = datetime.strptime("".join(start_time.split(" ")[2:4]).replace(".", ""), "%I:%M%p")
        start_time = date + timedelta(hours=start_time.hour, minutes=start_time.minute)
        # 15 == len("Game Duration: ")
        duration = datetime.strptime(duration[15:], "%H:%M")
        duration = timedelta(hours=duration.hour, minutes=duration.minute)

        # Prefer the ", on " separator so a venue row whose name merely
        # contains "on " (e.g. "Venue: London Stadium") is not picked up;
        # fall back to the looser match if no such row exists.
        surface_rows = ([row for row in rows if ", on " in row]
                        or [row for row in rows if "on " in row])
        at_night, on_grass = surface_rows[0].split(", ")
        at_night = 'Night Game' == at_night
        on_grass = 'on grass' == on_grass

        att = int(att.split(": ")[1].replace(",", ""))
        venue = venue.split(": ")[1]

        other_info = soup.find('h2', string="Other Info").parent.parent.find("div", {"class": "section_content"})
        for info in other_info.children:
            if info == '\n':
                continue
            # Entries look like "Key: value"; skip anything without a colon
            # instead of failing the whole page.
            k, sep, v = info.text.partition(":")
            if sep:
                info_dict[k] = v

    gameInfo = {
        "have_att": have_att,
        "start_time": start_time, "duration": duration,
        "venue": venue, "at_night": at_night, "on_grass": on_grass,
        'Start Time Weather': info_dict.get('Start Time Weather', ""),
        'Umpires': info_dict.get('Umpires', ""),
        "att": att
    }
    return t1, t2, gameInfo

# soup = BeautifulSoup(, "html.parser")
def GetTable(html, table_id):
    """Read one batting table from a box-score page into a DataFrame.

    Parameters
    ----------
    html : str
        Page source of a box-score page.
    table_id : str
        ``id`` of the ``div`` that wraps the table.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the table body (header cell first, then every
        ``<td>``), with rows whose cells are all empty strings left out. All
        values are the raw cell text.
    """
    column_names = [
        'Batting', 'AB', 'R', 'H', 'RBI', 'BB', 'SO', 'PA',
        'BA', 'OBP', 'SLG', 'OPS', 'Pit', 'Str',
        'WPA', 'aLI', 'WPA+', 'WPA-', 'cWPA', 'acLI', 'RE24',
        'PO', 'A', 'Details',
    ]

    wrapper = BeautifulSoup(html, "html.parser").find("div", {"id": table_id})
    body = wrapper.find("tbody")

    records = []
    for tr in body.find_all("tr"):
        # The row label lives in the <th>; the stats follow in the <td> cells.
        cells = [tr.find("th").text] + [td.text for td in tr.find_all("td")]
        if any(cell != "" for cell in cells):
            records.append(cells)

    return pd.DataFrame(records, columns=column_names)
    
def setup_driver(page_load_timeout=4, block_images=True):
    """Start a Chrome WebDriver configured for scraping.

    Parameters
    ----------
    page_load_timeout : int or float, optional
        Seconds ``driver.get`` may take before Selenium raises
        ``TimeoutException``. Defaults to 4.
    block_images : bool, optional
        When true (default), set the Chrome content-setting preference that
        blocks image loading.

    Returns
    -------
    selenium.webdriver.Chrome
        A running driver. The caller is responsible for calling ``quit()``.
    """
    options = Options()
    if block_images:
        # The preference has to be set on the same Options object that is
        # handed to webdriver.Chrome, otherwise it is never applied.
        options.add_experimental_option(
            "prefs",
            {"profile.default_content_setting_values": {"images": 2}},
        )

    # No headless flag is set, so Chrome opens a visible window.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    driver.set_page_load_timeout(page_load_timeout)
    return driver

def scratch_meta_page(url):
    """Load a schedule page and return its HTML once game entries are present.

    The page is (re)loaded until an element with class ``game`` is found.
    A page-load timeout is tolerated because the partially loaded page may
    already contain the schedule.

    NOTE(review): the retry loop has no upper bound; if the page never
    contains a ``game`` element this function does not return.

    Parameters
    ----------
    url : str
        URL of the schedule page.

    Returns
    -------
    str
        ``driver.page_source`` of the loaded page.
    """
    driver = setup_driver()
    try:
        while True:
            try:
                driver.get(url)
            except TimeoutException:
                print("load page timeout")

            try:
                WebDriverWait(driver, 1).until(EC.presence_of_element_located(
                    (By.CLASS_NAME, "game")))
            except TimeoutException:
                # Schedule not there yet: reload and check again.
                continue
            break
        print("get meta page data")
        return driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # process behind.
        driver.quit()


def scratch_single_page(url):
    """Scrape one box-score page.

    Parameters
    ----------
    url : str
        URL of the box-score page.

    Returns
    -------
    dict
        ``{"meta_game_info": <dict from GetMetaGameInfo>,
        "team1": {...}, "team2": {...}}`` where each team dict has the keys
        ``name``, ``pre_win``, ``pre_loss`` and ``player_df`` (the team's
        batting table as returned by ``GetTable``).

    Raises
    ------
    Whatever ``GetMetaGameInfo`` / ``GetTable`` raise when the page does not
    have the expected structure (e.g. ``AttributeError``).
    """
    print(f"Scratching {url}")
    driver = setup_driver()
    try:
        # The driver's page-load timeout may fire; whatever has loaded by
        # then is still parsed.
        try:
            driver.get(url)
        except TimeoutException:
            print("load page timeout")
        # Take a single snapshot so all parsers below see the same HTML.
        html = driver.page_source
    finally:
        # Always shut the browser down, even if loading failed; this function
        # is called once per game.
        driver.quit()

    team1, team2, metaGameInfo = GetMetaGameInfo(html)
    df1 = GetTable(html, f"all_{team1.name}batting")
    df2 = GetTable(html, f"all_{team2.name}batting")
    print("get page data success")

    team1 = {
        "name": team1.name,
        "pre_win": team1.pre_win,
        "pre_loss": team1.pre_loss,
        "player_df": df1
    }
    team2 = {
        "name": team2.name,
        "pre_win": team2.pre_win,
        "pre_loss": team2.pre_loss,
        "player_df": df2
    }

    gameInfo = {
        "meta_game_info": metaGameInfo,
        "team1": team1,
        "team2": team2
    }

    return gameInfo



In [3]:
def scrap_by_years(years):
    """Scrape every game of the given seasons into per-year pickle caches.

    For each year the cache ``gamesData{year}.pickle`` (a dict keyed by game
    URL) is loaded if it exists, the season schedule page is fetched, and
    every game not yet in the cache is scraped with up to 5 attempts. The
    cache is rewritten after each successfully scraped game so an interrupted
    run can be resumed.

    Parameters
    ----------
    years : iterable
        Seasons to scrape; each item is interpolated into the schedule URL
        and the cache file name.

    Returns
    -------
    None
    """
    import os  # local import: only needed for the atomic cache replace below

    max_attempts = 5
    for year in years:
        print(f"get games from year: {year}")
        cache_path = f"gamesData{year}.pickle"

        # Load previously scraped games.
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # written by this notebook.
        if exists(cache_path):
            with open(cache_path, "rb") as cache_file:
                data = cPickle.load(cache_file)
        else:
            data = dict()

        # Get all of the year's games from the schedule page.
        html = scratch_meta_page(f"https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml")
        soup = BeautifulSoup(html, "html.parser")
        games = soup.find_all("p", {"class": "game"})
        print(f"have {len(games)} to scrap")

        # Scrape whatever is not cached yet.
        for game in games:
            em = game.find("em")
            link = em.find("a") if em is not None else None
            if link is None:
                # Entry without a link: nothing to scrape.
                continue
            game_url = "https://www.baseball-reference.com" + link['href']
            if game_url in data:
                continue

            last_error = None
            for attempt in range(max_attempts):
                try:
                    gameInfo = scratch_single_page(game_url)
                except (AttributeError, IndexError, ValueError) as err:
                    # The parsers signal a missing or malformed page element
                    # with any of these; retry rather than abort the run.
                    last_error = err
                    print("something wrong, retry scrap this page")
                    continue

                data[game_url] = gameInfo
                # Write to a temp file and swap it in, so an interrupt during
                # the dump cannot leave a truncated cache behind.
                tmp_path = cache_path + ".tmp"
                with open(tmp_path, "wb") as cache_file:
                    cPickle.dump(data, cache_file)
                os.replace(tmp_path, cache_path)
                break
            else:
                print(f"giving up on {game_url} after {max_attempts} attempts: {last_error!r}")
                    


In [4]:
# html = scratch_single_page("https://www.baseball-reference.com/boxes/CHA/CHA201704040.shtml")
# soup = BeautifulSoup(html, "html.parser")

In [5]:
# Seasons to scrape, newest first (2020 is not in the list), kept in three
# batches so a subset can be passed to scrap_by_years instead of the full list.
y1 = ["2022", "2021"]
y2 = ["2019", "2018"]
y3 = ["2017", "2016", "2015"]
years = y1 + y2 + y3

scrap_by_years(years)

get games from year: 2022
load page timeout
get meta page data
have 2470 to scrap
Scratching https://www.baseball-reference.com/boxes/DET/DET202208060.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/KCA/KCA202208060.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/LAN/LAN202208060.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/MIL/MIL202208060.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/MIN/MIN202208060.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/NYN/NYN202208061.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/NYN/NYN202208062.shtml
load page timeout
get page data success
Scratching https://www.baseball-reference.com/boxes/OAK/OAK202208060.shtml
load page timeout
get page data succes

KeyboardInterrupt: 

In [None]:
# Reload the 2022 cache (same file name scrap_by_years writes for year "2022")
# and count its entries; the cache is a dict with one key per game URL.
# Note: despite its name, `output_file` is opened for reading here.
with open(f"gamesData{2022}.pickle", "rb") as output_file:
            data = cPickle.load(output_file)
len(data)

1602