In [1]:
x = 1

In [4]:
# NOTE(review): `y` is not referenced by any other cell visible here; it looks
# like leftover scratch state. Confirm before removing.
y = 2

In [13]:
pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.4.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl.metadata (8.7 kB)
Collecting Deprecated (from pygithub>=1.51->pybaseball)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading PyGithub-2.4.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.6/362.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.9

In [21]:
import io
from datetime import date
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup

from pybaseball import cache
from pybaseball.utils import most_recent_season, sanitize_date_range
from pybaseball.datasources.bref import BRefSession

# Module-level session shared by get_soup() and bwar_bat(); both fetch their
# Baseball-Reference URLs through session.get().
session = BRefSession()


def get_soup(start_dt: date, end_dt: date) -> BeautifulSoup:
    """
    Fetch the Baseball-Reference daily batting page for a date window and parse it.

    ``start_dt`` and ``end_dt`` are substituted, in that order, into the
    ``fromandto={}.{}`` query parameter of the daily.cgi URL. The page is
    requested through the module-level ``session`` and returned as a
    BeautifulSoup document built with the lxml parser.
    """
    url_template = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=b&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0"
    page_url = url_template.format(start_dt, end_dt)
    raw_bytes = session.get(page_url).content
    # Decode explicitly: handing BeautifulSoup text instead of bytes is a
    # workaround that stops it from applying the wrong encoding.
    html_text = raw_bytes.decode('utf-8')
    return BeautifulSoup(html_text, features="lxml")


def get_table(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Convert the first <table> in ``soup`` into a DataFrame of strings.

    Column names come from the <th> cells of the table's first <tr>, skipping
    the first one, with an extra trailing "mlbID" column. Each <tr> in the
    <tbody> contributes the stripped text of its <td> cells plus the text that
    follows "mlb_ID=" in the href of the row's first <a> (pd.NA when the row
    has no anchor). Rows shorter than the header are padded by pandas, and the
    returned frame keeps the row labels left after dropping label 0, so the
    first data row is labelled 1.
    """
    stats_table = soup.find_all('table')[0]
    header_cells = stats_table.find("tr").find_all("th")
    column_names = [cell.get_text() for cell in header_cells][1:] + ["mlbID"]

    def parse_row(tr):
        # One record: the cell texts followed by the player's ID string or NA.
        cell_texts = [td.text.strip() for td in tr.find_all('td')]
        link = tr.find("a")
        player_id = link["href"].split("mlb_ID=")[-1] if link else pd.NA
        return cell_texts + [player_id]

    body_rows = stats_table.find('tbody').find_all('tr')
    records = [column_names] + [parse_row(tr) for tr in body_rows]

    # Load header and data together, then promote row 0 to column labels and
    # drop it from the index.
    frame = pd.DataFrame(records)
    frame = frame.rename(columns=frame.iloc[0])
    return frame.reindex(frame.index.drop(0))


def batting_stats_range(start_dt: Optional[str] = None, end_dt: Optional[str] = None) -> pd.DataFrame:
    """
    Return batting stats scraped from Baseball-Reference for a date range.

    The range can be any window (the past week, the month of August, ...).
    Supply the start and end dates as YYYY-MM-DD strings; both are passed
    through sanitize_date_range before use.

    Raises ValueError when either sanitized date has a year before 2008.
    """
    first_day, last_day = sanitize_date_range(start_dt, end_dt)
    for day in (first_day, last_day):
        if day.year < 2008:
            raise ValueError("Year must be 2008 or later")

    # Scrape the page, then discard rows in which every column is NA.
    stats = get_table(get_soup(first_day, last_day)).dropna(how='all')

    # Scraped values start out as strings; these columns are converted to numbers.
    numeric_columns = [
        'Age', '#days', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B',
        'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP',
        'SB', 'CS', 'BA', 'OBP', 'SLG', 'OPS', 'mlbID',
    ]
    for name in numeric_columns:
        stats[name] = pd.to_numeric(stats[name])

    # The column whose header is the empty string is removed from the result.
    return stats.drop('', axis=1)


@cache.df_cache()
def batting_stats_bref(season: Optional[int] = None) -> pd.DataFrame:
    """
    Return batting stats for a single season via batting_stats_range.

    When ``season`` is omitted, most_recent_season() supplies it, giving the
    current season's stats to date.
    """
    season = most_recent_season() if season is None else season
    # March 1 is before any opening day (always late March or early April) and
    # November 30 is after the postseason has definitely ended.
    season_window = (f'{season}-03-01', f'{season}-11-30')
    return batting_stats_range(*season_window)


@cache.df_cache()
def bwar_bat(return_all: bool = False) -> pd.DataFrame:
    """
    Load Baseball-Reference's war_daily_bat table as a DataFrame.

    By default only WAR, its components, and a few identifying/useful columns
    are returned. Pass return_all=True to get every field in the table.
    """
    url = "http://www.baseball-reference.com/data/war_daily_bat.txt"
    csv_text = session.get(url).content.decode('utf-8')
    war_table = pd.read_csv(io.StringIO(csv_text))
    if return_all:
        return war_table

    selected_columns = [
        'name_common', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID', 'stint_ID', 'lg_ID',
        'pitcher', 'G', 'PA', 'salary', 'runs_above_avg', 'runs_above_avg_off', 'runs_above_avg_def',
        'WAR_rep', 'WAA', 'WAR',
    ]
    return war_table[selected_columns]

In [31]:
# Called with the default return_all=False, so only the columns listed in
# bwar_bat's cols_to_keep come back.
df1 = bwar_bat()

In [33]:
# Bare last expression so the notebook renders the frame.
df1

Unnamed: 0,name_common,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,pitcher,G,PA,salary,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAR_rep,WAA,WAR
0,David Aardsma,430911.0,aardsda01,2004,SFG,1,NL,Y,11,0.0,300000.0,0.0,0.0,0.0,0.00,0.00,0.00
1,David Aardsma,430911.0,aardsda01,2006,CHC,1,NL,Y,43,3.0,,-0.4,-0.4,0.0,0.00,-0.04,-0.04
2,David Aardsma,430911.0,aardsda01,2007,CHW,1,AL,Y,2,0.0,387500.0,0.0,0.0,0.0,0.00,0.00,0.00
3,David Aardsma,430911.0,aardsda01,2008,BOS,1,AL,Y,5,1.0,403250.0,-0.2,-0.2,0.0,0.00,-0.02,-0.02
4,David Aardsma,430911.0,aardsda01,2009,SEA,1,AL,Y,3,0.0,419000.0,0.0,0.0,0.0,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122385,Dutch Zwilling,124791.0,zwilldu01,1915,CHI,1,FL,N,150,635.0,,27.5,23.5,0.1,0.97,3.17,4.14
122386,Dutch Zwilling,124791.0,zwilldu01,1916,CHC,1,NL,N,35,59.0,3250.0,-6.7,-5.7,-1.3,0.19,-0.86,-0.67
122387,Tony Zych,543964.0,zychto01,2015,SEA,1,AL,Y,0,0.0,,0.0,0.0,0.0,,,
122388,Tony Zych,543964.0,zychto01,2016,SEA,1,AL,Y,0,0.0,511000.0,0.0,0.0,0.0,,,


In [55]:
# Keep only the rows whose salary is at least 20,000,000
# (units as reported in the source table; presumably USD, TODO confirm).
high_sal = df1[df1["salary"] >= 20000000]

In [85]:
# From the high-salary rows, keep those whose `pitcher` flag is 'N'.
high_sal_no_pitch = high_sal[high_sal["pitcher"] == "N"]

In [91]:
# Sort ascending by WAR and show the first 30 rows, i.e. the lowest-WAR
# rows among the filtered high-salary, pitcher == 'N' set.
high_sal_no_pitch.sort_values("WAR").head(30)

Unnamed: 0,name_common,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,pitcher,G,PA,salary,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAR_rep,WAA,WAR
25753,Chris Davis,448801.0,davisch02,2018,BAL,1,AL,N,128,522.0,23000000.0,-50.4,-42.4,-15.6,1.79,-5.04,-3.25
88794,Albert Pujols,405395.0,pujolal01,2017,LAA,1,AL,N,149,636.0,26000000.0,-40.7,-39.7,-14.5,2.19,-4.11,-1.92
51504,Ryan Howard,429667.0,howarry01,2015,PHI,1,NL,N,129,503.0,25000000.0,-28.6,-15.6,-19.7,1.59,-3.19,-1.6
34074,Prince Fielder,425902.0,fieldpr01,2016,TEX,1,AL,N,89,370.0,24000000.0,-27.8,-24.8,-10.9,1.3,-2.87,-1.57
116686,Jayson Werth,150029.0,werthja01,2015,WSN,1,NL,N,88,378.0,21000000.0,-24.2,-12.2,-15.6,1.17,-2.69,-1.52
51503,Ryan Howard,429667.0,howarry01,2014,PHI,1,NL,N,153,648.0,25000000.0,-27.9,-15.9,-20.2,2.05,-3.39,-1.34
57810,Matt Kemp,461314.0,kempma01,2017,ATL,1,NL,N,115,467.0,21750000.0,-26.4,-10.4,-20.3,1.46,-2.77,-1.31
51505,Ryan Howard,429667.0,howarry01,2016,PHI,1,NL,N,112,362.0,25000000.0,-21.3,-16.3,-10.3,1.13,-2.31,-1.18
93745,Alex Rodriguez,121347.0,rodrial01,2016,NYY,1,AL,N,65,243.0,21000000.0,-19.7,-19.7,-5.2,0.86,-2.04,-1.18
51501,Ryan Howard,429667.0,howarry01,2012,PHI,1,NL,N,71,292.0,20000000.0,-19.0,-13.0,-9.6,0.89,-2.06,-1.17


In [95]:
# Batting stats for the window 2021-04-01 through 2021-07-30
# (dates are passed as YYYY-MM-DD strings).
batting_stats_range("2021-04-01", "2021-07-30")

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,mlbID
1,José Abreu,34,1146,Maj-AL,Chicago,97,414,356,51,87,...,0,7,16,1,0,0.244,0.333,0.455,0.788,547989
2,Ronald Acuña Jr.,23,1166,Maj-NL,Atlanta,82,360,297,72,84,...,0,5,0,17,6,0.283,0.394,0.596,0.990,660670
3,Willy Adames,25,1146,"Maj-AL,Maj-NL","Milwaukee,Tampa Bay",101,393,351,55,92,...,0,0,6,3,4,0.262,0.341,0.482,0.823,642715
4,Matt Adams,32,1152,Maj-NL,Colorado,22,40,36,3,6,...,0,0,1,0,0,0.167,0.250,0.194,0.444,571431
5,Riley Adams,25,1176,Maj-AL,Toronto,12,30,28,2,3,...,0,0,1,0,0,0.107,0.167,0.179,0.345,656180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,Ryan Zimmerman,36,1147,Maj-NL,Washington,72,190,180,20,42,...,0,0,3,0,0,0.233,0.274,0.461,0.735,475582
954,Bruce Zimmermann,26,1215,Maj-AL,Baltimore,2,4,4,0,0,...,0,0,0,0,0,0.000,0.000,0.000,0.000,669145
955,Jordan Zimmermann,35,1235,Maj-NL,Milwaukee,1,1,1,0,0,...,0,0,0,0,0,0.000,0.000,0.000,0.000,519455
956,Tyler Zuber,26,1239,Maj-AL,Kansas City,1,1,1,0,0,...,0,0,0,0,0,0.000,0.000,0.000,0.000,676604
