## Updating script to include stats for Log5 and Pythagorean Expectation

In [1]:
#Imports
import statsapi
import datetime 
import pandas as pd
import json
from functools import reduce
import numpy as np

In [2]:
#Getting start date and end date of a season
def season_start_end(year):
    """Return the regular-season start and end dates for a season.

    Queries the "seasons" endpoint with sportId 1 and reads the first
    season record of the response.

    Args:
        year: Season to look up; passed straight through as the
            ``season`` query parameter.

    Returns:
        A two-item list ``[regularSeasonStartDate, regularSeasonEndDate]``
        exactly as the API reports them (presumably ``YYYY-MM-DD``
        strings; callers in this file parse them that way).
    """
    response = statsapi.get("seasons", {"sportId": 1, "season": year})
    season = response['seasons'][0]
    return [season['regularSeasonStartDate'], season['regularSeasonEndDate']]

In [3]:
#Function to sum values in two dictionaries
#used in hit and pitch stats functions

#Returns the sum of two dictionaries values

def add_dicts(d1, d2):
    """Merge two dictionaries, summing the values of the keys they share.

    Keys found in only one dictionary are copied unchanged. For a shared
    key the result holds ``d1[key] + d2[key]``. When the two values cannot
    be added (for example nested dicts), the value from ``d2`` is kept for
    that key and the remaining keys are still summed.

    Args:
        d1: First mapping.
        d2: Second mapping; wins for shared keys that cannot be added.

    Returns:
        A new dict; neither input is modified.
    """
    merged = dict(d1)
    merged.update(d2)
    for key, left in d1.items():
        if key not in d2:
            continue
        try:
            merged[key] = left + d2[key]
        except TypeError:
            # Not addable: keep d2's value (already in place from update())
            # and carry on with the other keys instead of aborting the merge.
            pass
    return merged

In [4]:
#Function to return player stats - hitting, pitching, and fielding

#Inputs are the id, start date and end date and the season year

def player_stats(id,start_date, end_date, year):
    """Fetch a player's hitting, pitching and fielding stats for a date range.

    Each stat group is requested separately because, per the original
    author's note, the API switches their order when they are fetched
    together.

    Args:
        id: Player id sent as ``personIds`` (the name shadows the builtin
            but is kept so existing keyword callers keep working).
        start_date: First day of the range, interpolated into the query.
        end_date: Last day of the range, interpolated into the query.
        year: Season, interpolated into the query.

    Returns:
        A dict with up to three keys. ``'hitting'`` and ``'pitching'`` hold
        the ``stat`` dict of the last split in the response (noted by the
        original author as the cumulative line for the range).
        ``'fielding'`` holds the key-wise sum, via ``add_dicts``, of every
        split whose sport abbreviation is ``'All'``. A group is omitted
        when the response does not contain the expected structure.
    """
    def fetch(group):
        #One request per stat group, identical apart from the group name
        return statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[{group}],type=[byDateRange],startDate={start_date},endDate={end_date},season={year})"})

    hitting_stats = fetch("hitting")
    pitching_stats = fetch("pitching")
    fielding_stats = fetch("fielding")

    #Only errors a missing/short response can produce while indexing into it
    missing_data = (KeyError, IndexError, TypeError)

    #dict where all the collected stats are added
    stat_dict = {}

    #Taking the last split of hitting and pitching stats
    for group, response in (('hitting', hitting_stats), ('pitching', pitching_stats)):
        try:
            stat_dict[group] = response['people'][0]['stats'][0]['splits'][-1]['stat']
        except missing_data:
            #Player has no stats of this group in the range - leave the key out
            pass

    #Summing fielding stats over every split flagged with sport abbreviation 'All'
    try:
        all_fielding = {}
        for split in fielding_stats['people'][0]['stats'][0]['splits']:
            if split['sport']['abbreviation'] == 'All':
                all_fielding = add_dicts(all_fielding, split['stat'])
        #Only stored when every split could be read, as before
        stat_dict['fielding'] = all_fielding
    except missing_data:
        pass

    return stat_dict

In [5]:
#Reading the json file that holds the team rosters for each game_id
#roster_groups is then used by the hitting and pitching stats functions
#The file is opened in a with-block so the handle is closed once it has been parsed

with open('roster_by_group', encoding='utf-8') as f:
    roster_groups = json.load(f)

In [80]:
#Function that returns the outcomes for each game, as a dict

def season_match_outcomes(season_start, season_end):
    """Return the winner and loser of every decided game in a date range.

    Args:
        season_start: Passed to ``statsapi.schedule`` as ``start_date``.
        season_end: Passed to ``statsapi.schedule`` as ``end_date``.

    Returns:
        A list of dicts with the keys ``'game_id'``, ``'win'`` and
        ``'loss'``. Schedule entries that lack any of the source keys
        (``game_id``, ``winning_team``, ``losing_team``) are left out.
    """
    outcomes = []
    for game in statsapi.schedule(start_date=season_start, end_date=season_end):
        try:
            outcomes.append({
                'game_id': game['game_id'],
                'win': game['winning_team'],
                'loss': game['losing_team'],
            })
        except KeyError:
            #No winner/loser recorded for this entry - skip it
            continue
    return outcomes

## Adding functions for Log5

In [81]:
#Function that calculates a team's win % over a date range

#Takes the team_id as an input, as well as the start and end date of the period

def team_record(team_id, start_date, end_date):
    """Return a team's winning percentage over a date range.

    The team name is resolved with ``statsapi.lookup_team`` (first match)
    and compared against the winner/loser names returned by
    ``season_match_outcomes``.

    Args:
        team_id: Value handed to ``statsapi.lookup_team``.
        start_date: First day of the period.
        end_date: Last day of the period.

    Returns:
        ``wins / (wins + losses)`` rounded to three decimals, or NaN when
        the team has no decided game in the period (the percentage is
        undefined there; previously this raised ZeroDivisionError).

    Raises:
        IndexError: If ``lookup_team`` finds no team for ``team_id``.
    """
    results = season_match_outcomes(start_date, end_date)
    team_name = statsapi.lookup_team(team_id)[0]['name']

    wins = sum(1 for game in results if game['win'] == team_name)
    #A game counts as a loss only when it was not already counted as a win
    losses = sum(
        1 for game in results
        if game['win'] != team_name and game['loss'] == team_name
    )

    decisions = wins + losses
    if decisions == 0:
        return float('nan')
    return round(wins / decisions, 3)

In [114]:
def log_5(home_team_id, away_team_id, start_date, end_date):
    """Estimate the home team's win probability with the Log5 formula.

    With ``p`` the home and ``q`` the away winning percentage over the
    period (both from ``team_record``) the estimate is::

        (p - p*q) / (p + q - 2*p*q)

    Args:
        home_team_id: Id of the home team.
        away_team_id: Id of the away team.
        start_date: First day of the period used for both records.
        end_date: Last day of the period used for both records.

    Returns:
        The Log5 probability as a float. When the denominator is zero
        (both teams at 0.000 or both at 1.000) the formula is
        indeterminate and 0.5 is returned, the value it gives for any two
        equal records; previously this raised ZeroDivisionError.
    """
    home_per = team_record(home_team_id, start_date, end_date)
    away_per = team_record(away_team_id, start_date, end_date)

    denominator = home_per + away_per - (2*home_per*away_per)
    if denominator == 0:
        #Identical perfect or winless records: treat the teams as evenly matched
        return 0.5
    return (home_per - (home_per * away_per)) / denominator

In [99]:
#Function that calculates all hitting stats, as well as the Log5 probability of a home team win, for a given game_id

#Takes game_id and the home/away hitting dfs as inputs and returns a df of the stats

def _team_hitting_totals(player_ids, start_date, rolling_start_date, end_date, year):
    """Sum a roster's hitting counting stats for the season and rolling windows.

    Returns ``(season_totals, rolling_totals)``, two dicts keyed by stat name.
    """
    stat_names = ('hits', 'baseOnBalls', 'atBats', 'hitByPitch', 'doubles',
                  'triples', 'homeRuns', 'plateAppearances', 'runs')
    season = dict.fromkeys(stat_names, 0)
    rolling = dict.fromkeys(stat_names, 0)

    for player_id in player_ids:
        for window_start, totals in ((start_date, season), (rolling_start_date, rolling)):
            try:
                #One API round trip per player and window; every stat is read from
                #the same response instead of re-fetching the player per stat
                hitting = player_stats(player_id, window_start, end_date, year)['hitting']
                values = [hitting[name] for name in stat_names]
                for name, value in zip(stat_names, values):
                    totals[name] += value
            except Exception:
                #Best effort, as before: a player without usable hitting stats
                #for this window (or whose lookup fails) contributes nothing
                pass
    return season, rolling


def _hitting_rates(season, rolling):
    """Return [avg, obp, slug, obp_10, slug_10], each rounded to three decimals."""
    def on_base(totals):
        return (totals['hits'] + totals['baseOnBalls'] + totals['hitByPitch']) / totals['plateAppearances']

    def slugging(totals):
        #hits already counts one base per hit, so extra-base hits add 1, 2 and 3 more
        return (totals['hits'] + totals['doubles'] + totals['triples']*2 + totals['homeRuns']*3) / totals['atBats']

    return [
        round(season['hits'] / season['atBats'], 3),
        round(on_base(season), 3),
        round(slugging(season), 3),
        round(on_base(rolling), 3),
        round(slugging(rolling), 3),
    ]


def all_hit_stats_df(game_id, df_home, df_away):
    """Append one game's team hitting stats and return home-minus-away stats.

    The game's roster entry is looked up in the module-level
    ``roster_groups``. Stats are summed over ``home_hitting`` and
    ``away_hitting`` for two windows that both end on the game date: from
    the season start, and from nine days before the game date.

    NOTE(review): both windows end on the game's own date, so if the API
    treats the end date as inclusive the game itself is part of its own
    features - confirm this is intended.

    Args:
        game_id: Id of the game to process.
        df_home: Frame with the columns game_id, avg, obp, slug, obp_10,
            slug_10, log_5, h_runs, a_runs. A row is appended in place
            holding the home rates, the Log5 value and home runs scored.
        df_away: Frame with the same columns. A row is appended in place
            holding the away rates and the negated away runs in ``a_runs``
            so that the subtraction below yields the positive total.

    Returns:
        ``df_home - df_away`` aligned on ``game_id`` (missing values
        treated as 0), with ``game_id`` restored as a column. It covers
        every row present in the two frames, not only this game.

    Raises:
        ValueError: If ``game_id`` is not present in ``roster_groups``.
        ZeroDivisionError: If a team has zero at-bats or plate appearances
            in either window.
    """
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError(f"game_id {game_id!r} not found in roster_groups")

    #End date - date of the game; the season year is its first 4 characters
    end_date = game['date']
    year = end_date[0:4]
    home_id = game['home_id']
    away_id = game['away_id']

    #Start date for the rolling 10 day stats - 9 days before the end_date
    rolling_start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=9)).strftime('%Y-%m-%d')

    #Start date of the game's season - for the season-to-date stats
    start_date = season_start_end(year)[0]

    #Log5 probability of a home win from both teams' records so far
    log5 = log_5(home_id, away_id, start_date, end_date)

    home_season, home_rolling = _team_hitting_totals(game['home_hitting'], start_date, rolling_start_date, end_date, year)
    away_season, away_rolling = _team_hitting_totals(game['away_hitting'], start_date, rolling_start_date, end_date, year)

    #Appending stats to their respective dfs (home first, then away)
    df_home.loc[len(df_home)] = [game_id, *_hitting_rates(home_season, home_rolling), log5, home_season['runs'], 0]
    df_away.loc[len(df_away)] = [game_id, *_hitting_rates(away_season, away_rolling), 0, 0, (away_season['runs']*-1)]

    #Returning the difference of home and away team stats
    return df_home.set_index('game_id').subtract(df_away.set_index('game_id'), fill_value =0).reset_index()

In [104]:
#Function takes the game_id and home/away team dfs as inputs
#returns a df of home team - away team stats

def _innings_to_float(innings):
    """Convert an innings-pitched value such as "12.1" to a decimal number.

    A fractional part of .1 or .2 stands for one or two thirds of an
    inning and is mapped to .333 or .666; any other value is returned as
    the plain float.
    """
    value = float(innings)
    whole = int(value)
    fraction = round(value - whole, 1)
    if fraction == 0.2:
        return whole + 0.666
    if fraction == 0.1:
        return whole + 0.333
    return value


def _team_pitching_totals(bullpen_ids, roster_ids, starter_id, start_date, rolling_start_date, end_date, year):
    """Collect one team's starter, bullpen and rolling pitching totals as a dict."""
    pitching_stats = ('earnedRuns', 'inningsPitched', 'baseOnBalls', 'hits')
    whip_stats = ('hits', 'inningsPitched', 'baseOnBalls')

    totals = dict.fromkeys(
        ('bp_runs', 'bp_innings', 'bp_walks', 'bp_hits',
         'roll_hits', 'roll_innings', 'roll_walks', 'runs_allowed'), 0)

    #Season-to-date stats of every bullpen pitcher
    for player_id in bullpen_ids:
        try:
            #One API round trip per pitcher; all four stats come from the same response
            pitching = player_stats(player_id, start_date, end_date, year)['pitching']
            earned_runs, innings, walks, hits = (pitching[name] for name in pitching_stats)
            innings = _innings_to_float(innings)

            totals['bp_runs'] += earned_runs
            totals['bp_innings'] += innings
            totals['bp_walks'] += walks
            totals['bp_hits'] += hits
        except Exception:
            #Best effort, as before: pitchers without usable stats are skipped
            pass

    #Rolling whip inputs and season runs allowed (earned and unearned)
    #NOTE(review): this walks the roster list stored under '<side>_hitting', and a
    #player's season runs allowed are only counted when he also has pitching stats
    #in the rolling window - behavior kept from the original, confirm it is intended
    for player_id in roster_ids:
        try:
            rolling = player_stats(player_id, rolling_start_date, end_date, year)['pitching']
            hits, innings, walks = (rolling[name] for name in whip_stats)

            totals['runs_allowed'] += player_stats(player_id, start_date, end_date, year)['pitching']['runs']

            innings = _innings_to_float(innings)
            totals['roll_hits'] += hits
            totals['roll_innings'] += innings
            totals['roll_walks'] += walks
        except Exception:
            pass

    #Starting pitcher - not guarded, so a starter without pitching stats raises
    starter = player_stats(starter_id, start_date, end_date, year)['pitching']
    sp_runs, sp_innings, sp_walks, sp_hits = (starter[name] for name in pitching_stats)
    totals['sp_runs'] = sp_runs
    totals['sp_innings'] = _innings_to_float(sp_innings)
    totals['sp_walks'] = sp_walks
    totals['sp_hits'] = sp_hits
    return totals


def _pitching_rates(totals):
    """Return [era_sp, whip_sp, era_bp, whip_bp, whip_10], rounded to three decimals."""
    return [
        round(totals['sp_runs']*9 / totals['sp_innings'], 3),
        round((totals['sp_hits'] + totals['sp_walks']) / totals['sp_innings'], 3),
        round(totals['bp_runs']*9 / totals['bp_innings'], 3),
        round((totals['bp_hits'] + totals['bp_walks']) / totals['bp_innings'], 3),
        round((totals['roll_hits'] + totals['roll_walks']) / totals['roll_innings'], 3),
    ]


def all_pitch_stats_df(game_id, df_home, df_away):
    """Append one game's team pitching stats and return home-minus-away stats.

    The game's roster entry is looked up in the module-level
    ``roster_groups``. For each side the starter's and the bullpen's ERA
    and WHIP are computed from the season start to the game date, and a
    rolling WHIP from nine days before the game date.

    Args:
        game_id: Id of the game to process.
        df_home: Frame with the columns game_id, era_sp, whip_sp, era_bp,
            whip_bp, whip_10, h_runs_allowed, a_runs_allowed. A row is
            appended in place with the home rates and home runs allowed.
        df_away: Frame with the same columns. A row is appended in place
            with the away rates and the negated away runs allowed in
            ``a_runs_allowed`` so the subtraction yields the positive total.

    Returns:
        ``df_home - df_away`` aligned on ``game_id`` (missing values
        treated as 0), with ``game_id`` restored as a column. It covers
        every row present in the two frames, not only this game.

    Raises:
        ValueError: If ``game_id`` is not present in ``roster_groups``.
        KeyError: If a starting pitcher has no pitching stats for the season.
        ZeroDivisionError: If a starter, bullpen or rolling innings total is zero.
    """
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError(f"game_id {game_id!r} not found in roster_groups")

    #Date of the game; the season year is its first 4 characters
    end_date = game['date']
    h_sp = game['home_sp']
    a_sp = game['away_sp']
    year = end_date[0:4]

    #Getting start date of the season
    start_date = season_start_end(year)[0]

    #Start date for the 10 day rolling whip - 9 days before the end_date
    rolling_start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=9)).strftime('%Y-%m-%d')

    home = _team_pitching_totals(game['home_bullpen'], game['home_hitting'], h_sp, start_date, rolling_start_date, end_date, year)
    away = _team_pitching_totals(game['away_bullpen'], game['away_hitting'], a_sp, start_date, rolling_start_date, end_date, year)

    #Appending stats to the dfs (home first, then away)
    df_home.loc[len(df_home)] = [game_id, *_pitching_rates(home), home['runs_allowed'], 0]
    df_away.loc[len(df_away)] = [game_id, *_pitching_rates(away), 0, (away['runs_allowed']*-1)]

    return df_home.set_index('game_id').subtract(df_away.set_index('game_id'), fill_value =0).reset_index()

In [85]:
def home_win(game_id,df):
    """Append the home team's result for a game to ``df``.

    Args:
        game_id: Id of the game, passed to ``statsapi.schedule``.
        df: Frame with the columns ``game_id`` and ``home_win``; a row
            ``[game_id, 1]`` (home team won) or ``[game_id, 0]`` (home
            team lost) is appended in place.

    Returns:
        The same ``df``. Nothing is appended when the game is postponed or
        has no recorded winner yet; previously a missing ``winning_team``
        raised KeyError before the postponed check could run.

    Raises:
        IndexError: If the schedule lookup returns no game.
    """
    game = statsapi.schedule(game_id = game_id)[0]
    winner = game.get('winning_team')

    #No decided result (postponed, not finished, ...) - nothing to record
    if winner is None or game.get('status') == 'Postponed':
        return df

    df.loc[len(df)] = [game_id, 1 if game['home_name'] == winner else 0]
    return df

In [111]:
#NOTE: year is not an input here; it is derived from the game date inside the stats functions

#Inputs are the game id and 5 dfs: home/away hitting, home/away pitching, and the home win df

def all_team_stats(game_id, df_h_h, df_a_h, df_h_p, df_a_p, df_home_win):
    """Build the combined hitting, pitching and outcome frame for a game.

    Args:
        game_id: Id of the game to process.
        df_h_h: Home hitting frame handed to ``all_hit_stats_df``.
        df_a_h: Away hitting frame handed to ``all_hit_stats_df``.
        df_h_p: Home pitching frame handed to ``all_pitch_stats_df``.
        df_a_p: Away pitching frame handed to ``all_pitch_stats_df``.
        df_home_win: Outcome frame handed to ``home_win``.

    Returns:
        The hitting, pitching and outcome frames outer-merged on
        ``game_id``. The runs and runs-allowed columns are replaced by a
        single ``PE`` column: the home team's
        ``runs**1.83 / (runs**1.83 + runs_allowed**1.83)`` minus the same
        ratio for the away team.
    """
    hitting = all_hit_stats_df(game_id, df_h_h, df_a_h)
    pitching = all_pitch_stats_df(game_id, df_h_p, df_a_p)
    outcomes = home_win(game_id, df_home_win)

    #Runs scored and allowed raised to the expectation exponent
    exponent = 1.83
    home_scored = hitting['h_runs'] ** exponent
    home_allowed = pitching['h_runs_allowed'] ** exponent
    away_scored = hitting['a_runs'] ** exponent
    away_allowed = pitching['a_runs_allowed'] ** exponent

    #Expectation of each side, then home minus away
    expectation_home = home_scored / (home_scored + home_allowed)
    expectation_away = away_scored / (away_scored + away_allowed)
    expectation_gap = expectation_home - expectation_away

    #The raw run totals are only inputs to PE, so they leave the output
    hitting.drop(columns=['h_runs', 'a_runs'], inplace=True)
    pitching.drop(columns=['h_runs_allowed', 'a_runs_allowed'], inplace=True)
    hitting['PE'] = expectation_gap

    #Outer merges keep a game even when one of the frames has no row for it
    combined = pd.merge(hitting, pitching, on=['game_id'], how='outer')
    return pd.merge(combined, outcomes, on=['game_id'], how='outer')

In [112]:
#Empty accumulator dfs for the home and away teams, one pair per stat family

#Column names of the hitting dfs
team_hit_stats = [
    'game_id', 'avg', 'obp', 'slug', 'obp_10', 'slug_10',
    'log_5', 'h_runs', 'a_runs',
]

#Hitting dfs (home, away)
df_h_h, df_a_h = (pd.DataFrame(columns = team_hit_stats) for _ in range(2))

#Pitching dfs (home, away) - both share the same column layout
df_h_p, df_a_p = (
    pd.DataFrame(columns = ['game_id', 'era_sp', 'whip_sp', 'era_bp', 'whip_bp',
                            'whip_10', 'h_runs_allowed', 'a_runs_allowed'])
    for _ in range(2)
)

#Home win/loss outcome df
df_home_win = pd.DataFrame(columns = ['game_id', 'home_win'])

In [113]:
#Smoke test: building the combined stats row for one game (id 634224), including the log_5 column
all_team_stats(634224, df_h_h,df_a_h,df_h_p,df_a_p,df_home_win)

Unnamed: 0,game_id,avg,obp,slug,obp_10,slug_10,log_5,PE,era_sp,whip_sp,era_bp,whip_bp,whip_10,home_win
0,634224.0,-0.021,0.013,-0.023,0.079,0.101,0.5,-0.058308,-1.97,-0.447,-0.7,-0.033,-0.262,0


In [13]:
#Generating
#game_list = [roster_groups[i]['game_id'] for i in range(7,10)]
#for i in game_list:
#    new_df = all_team_stats(i, df_h_h, df_a_h, df_h_p, df_a_p, df_home_win)
#new_df