In [122]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import _pickle as cPickle
from os.path import exists
import random
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [123]:
def setup_driver(time_out=4):
    """Start a visible (non-headless) Chrome session tuned for scraping.

    Args:
        time_out: Page-load timeout in seconds, applied through
            ``driver.set_page_load_timeout``.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and
        should call ``driver.quit()`` when finished.
    """
    # Use ONE options object. The earlier version built two and handed both
    # `options=` and the legacy `chrome_options=` keyword to webdriver.Chrome;
    # in early Selenium 4 releases the legacy keyword replaces `options`
    # (dropping the window-size argument) and later releases no longer accept
    # it at all. Headless mode is off by default, so nothing needs to be set
    # to keep the browser visible.
    options = Options()
    options.add_argument("--window-size=100,100")

    # Block image downloads only (content-setting value 2); JavaScript is
    # left enabled.
    options.add_experimental_option(
        "prefs",
        {"profile.default_content_setting_values": {"images": 2}},
    )

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    driver.set_page_load_timeout(time_out)
    return driver

In [127]:
# Read the team-abbreviation mapping from the JSON file beside the notebook.
with open('./TeamAbbrDict.json', 'r') as abbr_file:
    abbreviations = json.load(abbr_file)


# Seasons to scrape; each one starts with an empty per-team dictionary.
years = [2017, 2018, 2019, 2020, 2021, 2022]
df_by_year_abbr_dict = {season: dict() for season in years}


In [None]:
# Scrape the player table (div#div_appearances) of every team page for every
# season and store one DataFrame per (year, team) in df_by_year_abbr_dict.
# Relies on `years`, `abbreviations`, `df_by_year_abbr_dict` and
# `setup_driver` from the cells above.

# NOTE: 'AllStartAppearance' is misspelled, but the name is kept because the
# saved pickle (and anything reading it) uses this exact column label.
salary_columns = ['Name', 'Salary', 'AllStartAppearance']
team_abbrs = list(abbreviations.values())

for year in years:
    for i, abbr in enumerate(team_abbrs):
        # skip already scraped
        if abbr in df_by_year_abbr_dict[year]:
            continue

        url = f"https://www.baseball-reference.com/teams/{abbr}/{year}.shtml"
        driver = setup_driver(time_out=3)
        try:
            try:
                print(f"[{i+1}/{len(team_abbrs)}] scraping {abbr} data..")
                driver.get(url)
            except TimeoutException:
                # Best effort: the table may already be in the DOM even though
                # the full page load timed out, so keep going and look for it.
                print("load page timeout")

            # find the table rows
            rows = driver.find_elements(By.XPATH, "//div[@id='div_appearances']//tbody//tr")
            # get each row
            data = []
            for row in rows:
                player_name = row.find_element(By.TAG_NAME, "th").text
                player_salary = row.find_element(By.CSS_SELECTOR, "td[data-stat='Salary']").text
                player_allstar_appearance = row.find_element(By.CSS_SELECTOR, "td[data-stat='allstar_appearance']").text
                data.append([player_name, player_salary, player_allstar_appearance])
        finally:
            # Always close the browser; otherwise every team/season iteration
            # leaves a Chrome process running.
            driver.quit()

        if not data:
            # Do not record the team, so the "skip already scraped" check
            # above retries it when this cell is run again.
            print(f"scrap failed {abbr}: no rows found")
            continue

        # write to df (built straight from the list of rows)
        df = pd.DataFrame(data, columns=salary_columns)
        df = df[(df['Salary'] != '') | (df['AllStartAppearance'] != '')]
        df_by_year_abbr_dict[year][abbr] = df
        print(f"scrap success {abbr}")

In [176]:
# Persist the scraped per-year / per-team DataFrames to disk.
with open("player-salaries.pickle", "wb") as pickle_out:
    cPickle.dump(df_by_year_abbr_dict, pickle_out)

In [None]:
# Read the saved salaries back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open("player-salaries.pickle", "rb") as pickle_in:
    data = cPickle.load(pickle_in)

# Index by year, then by team abbreviation (see TeamAbbrDict.json for the codes).
data[2022]['BOS']