In [1]:
import requests
import pandas as pd
import datetime
import io
from bs4 import BeautifulSoup

In [6]:
def sanitize_input(start_dt, end_dt):
    """
    Normalize a (start_dt, end_dt) pair of "YYYY-MM-DD" date strings.

    Rules applied, in order:
      * both dates None  -> use yesterday..today and print a warning;
      * exactly one None -> use the supplied date for both ends;
      * both dates are validated against the "%Y-%m-%d" format;
      * end before start -> the two dates are swapped.

    Parameters
    ----------
    start_dt, end_dt : str or None
        Date strings in YYYY-MM-DD format.

    Returns
    -------
    (start_dt, end_dt) : tuple of str
        The same strings that were supplied (or the defaults), ordered so
        that start_dt is not later than end_dt.

    Raises
    ------
    ValueError
        If either date cannot be parsed as YYYY-MM-DD.
    """
    date_format = "%Y-%m-%d"

    # if no dates are supplied, assume they want yesterday's data
    # send a warning in case they wanted to specify
    if start_dt is None and end_dt is None:
        today = datetime.datetime.today()
        start_dt = (today - datetime.timedelta(1)).strftime(date_format)
        end_dt = today.strftime(date_format)
        print("Warning: no date range supplied. Returning yesterday's data. For a different date range, try batting_stats_range(start_dt, end_dt) or batting_stats(season).")

    # if only one date is supplied, query from that date to that same date
    if start_dt is None:
        start_dt = end_dt
    if end_dt is None:
        end_dt = start_dt

    # Validate BEFORE ordering: parsing here (instead of relying on an
    # external helper) keeps this cell self-contained, and comparing the
    # parsed dates avoids the lexicographic pitfall where "2017-8-1" sorts
    # after "2017-10-01" even though strptime accepts both.
    parsed = []
    for date_text in (start_dt, end_dt):
        try:
            parsed.append(datetime.datetime.strptime(date_text, date_format))
        except (TypeError, ValueError):
            raise ValueError(
                "Incorrect date format, should be YYYY-MM-DD: {!r}".format(date_text)
            )

    # if end date occurs before start date, swap them
    if parsed[1] < parsed[0]:
        start_dt, end_dt = end_dt, start_dt

    return start_dt, end_dt


In [7]:
def get_soup(start_dt, end_dt):
    """
    Fetch the Baseball Reference daily batting page for a date range and
    return it parsed as a BeautifulSoup document.

    start_dt and end_dt are interpolated into the URL's `fromandto`
    query parameter as "<start_dt>.<end_dt>". No validation is done here;
    callers are expected to pass already-sanitized date strings.
    """
    query_url = (
        "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=b&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0"
    ).format(start_dt, end_dt)
    response = requests.get(query_url)
    # parse the raw response bytes with the built-in HTML parser
    return BeautifulSoup(response.content, "html.parser")


In [8]:
def get_table(soup):
    """
    Convert the first <table> found in `soup` into a DataFrame of strings.

    Column names come from the <th> cells of the table's first <tr>, with
    the first header cell skipped. Each <tr> inside <tbody> contributes one
    row built from the stripped text of its <td> cells. The returned frame
    keeps the row labels produced during construction, so the first data
    row has index 1.
    """
    first_table = soup.find_all('table')[0]

    # header: every <th> of the first row except the leading one
    header_cells = first_table.find("tr").find_all("th")
    column_names = [cell.get_text() for cell in header_cells][1:]

    # the header goes in as row 0 so that shorter rows are padded rather
    # than rejected when the frame is built
    records = [column_names]
    for tr in first_table.find('tbody').find_all('tr'):
        records.append([td.text.strip() for td in tr.find_all('td')])

    frame = pd.DataFrame(records)
    header_row = frame.iloc[0]
    frame = frame.rename(columns=header_row)
    # discard the header row now that it has been promoted to column names
    remaining_index = frame.index.drop(0)
    return frame.reindex(remaining_index)



In [9]:
def batting_stats_range(start_dt=None, end_dt=None):
    """
    Get all batting stats for a set time range. This can be the past week, the
    month of August, anything. Just supply the start and end date in YYYY-MM-DD
    format.

    Parameters
    ----------
    start_dt, end_dt : str or None
        Range boundaries in YYYY-MM-DD format. Missing values are filled in
        by sanitize_input.

    Returns
    -------
    pandas.DataFrame
        The scraped table with the statistic columns converted to numeric
        and the column whose header is the empty string removed.

    Raises
    ------
    ValueError
        If either date falls in a year earlier than 2008.
    """
    # make sure date inputs are valid
    start_dt, end_dt = sanitize_input(start_dt, end_dt)
    for date_text in (start_dt, end_dt):
        if datetime.datetime.strptime(date_text, "%Y-%m-%d").year < 2008:
            raise ValueError("Year must be 2008 or later")

    # retrieve html from baseball reference
    soup = get_soup(start_dt, end_dt)
    table = get_table(soup)
    table = table.dropna(how='all')  # drop if all columns are NA

    # scraped data is initially in string format.
    # convert the necessary columns to numeric.
    numeric_columns = ['Age', '#days', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B',
                       'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP',
                       'SB', 'CS', 'BA', 'OBP', 'SLG', 'OPS']
    for column in numeric_columns:
        table[column] = pd.to_numeric(table[column])

    # axis must be passed by keyword: pandas 2.0 made every DataFrame.drop
    # argument except `labels` keyword-only, so drop('', 1) raises TypeError.
    table = table.drop('', axis=1)
    return table


In [10]:
def bwar_bat(return_all=False):
    """
    Download Baseball Reference's war_daily_bat table as a DataFrame.

    By default only the 'player_ID' and 'WAR' columns are returned.
    To get every field from the table, supply argument return_all=True.
    """
    url = "http://www.baseball-reference.com/data/war_daily_bat.txt"
    raw_bytes = requests.get(url).content
    # the payload is CSV text; decode it and let pandas parse it in memory
    war_table = pd.read_csv(io.StringIO(raw_bytes.decode('utf-8')))
    if not return_all:
        war_table = war_table[['player_ID', 'WAR']]
    return war_table


In [14]:
# Download the war_daily_bat table, keeping only the player_ID and WAR columns.
WAR=bwar_bat(return_all=False)

In [16]:
# Preview the first rows to check which columns came back.
WAR.head()

Unnamed: 0,player_ID,WAR
0,aardsda01,0.01
1,aardsda01,-0.02
2,aardsda01,0.01
3,aardsda01,0.0
4,aardsda01,0.03


In [18]:
# A player_ID can appear on several rows (see the preview above), so sum WAR
# per player_ID; the result is indexed by player_ID.
War_df=WAR.groupby('player_ID').sum()

In [19]:
# Display the per-player WAR totals.
War_df

Unnamed: 0_level_0,WAR
player_ID,Unnamed: 1_level_1
aardsda01,0.10
aaronha01,142.57
aaronto01,-2.79
aasedo01,0.23
abadan01,-0.36
abadfe01,0.02
abadijo01,-0.06
abbated01,8.59
abbeybe01,-1.01
abbeych01,1.78


In [20]:
# Save the per-player totals to war.csv (relative path, so it lands in the
# current working directory).
War_df.to_csv('war.csv')